001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase;
019
020import java.io.Closeable;
021import java.io.IOException;
022import org.apache.hadoop.conf.Configurable;
023import org.apache.hadoop.conf.Configuration;
024import org.apache.hadoop.hbase.client.RegionInfoBuilder;
025import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
026import org.apache.hadoop.hbase.util.Threads;
027import org.apache.yetus.audience.InterfaceAudience;
028import org.apache.yetus.audience.InterfaceStability;
029
030/**
031 * This class defines methods that can help with managing HBase clusters from unit tests and system
032 * tests. There are 3 types of cluster deployments:
033 * <ul>
034 * <li><b>SingleProcessHBaseCluster:</b> each server is run in the same JVM in separate threads,
035 * used by unit tests</li>
036 * <li><b>DistributedHBaseCluster:</b> the cluster is pre-deployed, system and integration tests can
037 * interact with the cluster.</li>
038 * <li><b>ProcessBasedLocalHBaseCluster:</b> each server is deployed locally but in separate JVMs.
039 * </li>
040 * </ul>
041 * <p>
042 * HBaseCluster unifies the way tests interact with the cluster, so that the same test can be run
043 * against a mini-cluster during unit test execution, or a distributed cluster having tens/hundreds
044 * of nodes during execution of integration tests.
045 * <p>
046 * HBaseCluster exposes client-side public interfaces to tests, so that tests does not assume
047 * running in a particular mode. Not all the tests are suitable to be run on an actual cluster, and
048 * some tests will still need to mock stuff and introspect internal state. For those use cases from
049 * unit tests, or if more control is needed, you can use the subclasses directly. In that sense,
050 * this class does not abstract away <strong>every</strong> interface that SingleProcessHBaseCluster
051 * or DistributedHBaseCluster provide.
052 */
053@InterfaceAudience.LimitedPrivate(HBaseInterfaceAudience.PHOENIX)
054@InterfaceStability.Evolving
055public abstract class HBaseClusterInterface implements Closeable, Configurable {
056
057  protected Configuration conf;
058
059  /** the status of the cluster before we begin */
060  protected ClusterMetrics initialClusterStatus;
061
062  /**
063   * Construct an HBaseCluster
064   * @param conf Configuration to be used for cluster
065   */
066  public HBaseClusterInterface(Configuration conf) {
067    setConf(conf);
068  }
069
070  @Override
071  public void setConf(Configuration conf) {
072    this.conf = conf;
073  }
074
075  @Override
076  public Configuration getConf() {
077    return conf;
078  }
079
080  /**
081   * Returns a ClusterMetrics for this HBase cluster.
082   * @see #getInitialClusterMetrics()
083   */
084  public abstract ClusterMetrics getClusterMetrics() throws IOException;
085
086  /**
087   * Returns a ClusterStatus for this HBase cluster as observed at the starting of the HBaseCluster
088   */
089  public ClusterMetrics getInitialClusterMetrics() throws IOException {
090    return initialClusterStatus;
091  }
092
093  /**
094   * Starts a new region server on the given hostname or if this is a mini/local cluster, starts a
095   * region server locally.
096   * @param hostname the hostname to start the regionserver on
097   * @throws IOException if something goes wrong
098   */
099  public abstract void startRegionServer(String hostname, int port) throws IOException;
100
101  /**
102   * Kills the region server process if this is a distributed cluster, otherwise this causes the
103   * region server to exit doing basic clean up only.
104   * @throws IOException if something goes wrong
105   */
106  public abstract void killRegionServer(ServerName serverName) throws IOException;
107
108  /**
109   * Keeping track of killed servers and being able to check if a particular server was killed makes
110   * it possible to do fault tolerance testing for dead servers in a deterministic way. A concrete
111   * example of such case is - killing servers and waiting for all regions of a particular table to
112   * be assigned. We can check for server column in META table and that its value is not one of the
113   * killed servers.
114   */
115  public abstract boolean isKilledRS(ServerName serverName);
116
117  /**
118   * Stops the given region server, by attempting a gradual stop.
119   * @throws IOException if something goes wrong
120   */
121  public abstract void stopRegionServer(ServerName serverName) throws IOException;
122
123  /**
124   * Wait for the specified region server to join the cluster
125   * @throws IOException if something goes wrong or timeout occurs
126   */
127  public void waitForRegionServerToStart(String hostname, int port, long timeout)
128    throws IOException {
129    long start = EnvironmentEdgeManager.currentTime();
130    while ((EnvironmentEdgeManager.currentTime() - start) < timeout) {
131      for (ServerName server : getClusterMetrics().getLiveServerMetrics().keySet()) {
132        if (server.getHostname().equals(hostname) && server.getPort() == port) {
133          return;
134        }
135      }
136      Threads.sleep(100);
137    }
138    throw new IOException(
139      "did timeout " + timeout + "ms waiting for region server to start: " + hostname);
140  }
141
142  /**
143   * Wait for the specified region server to stop the thread / process.
144   * @throws IOException if something goes wrong or timeout occurs
145   */
146  public abstract void waitForRegionServerToStop(ServerName serverName, long timeout)
147    throws IOException;
148
149  /**
150   * Suspend the region server
151   * @param serverName the hostname to suspend the regionserver on
152   * @throws IOException if something goes wrong
153   */
154  public abstract void suspendRegionServer(ServerName serverName) throws IOException;
155
156  /**
157   * Resume the region server
158   * @param serverName the hostname to resume the regionserver on
159   * @throws IOException if something goes wrong
160   */
161  public abstract void resumeRegionServer(ServerName serverName) throws IOException;
162
163  /**
164   * Starts a new zookeeper node on the given hostname or if this is a mini/local cluster, silently
165   * logs warning message.
166   * @param hostname the hostname to start the regionserver on
167   * @throws IOException if something goes wrong
168   */
169  public abstract void startZkNode(String hostname, int port) throws IOException;
170
171  /**
172   * Kills the zookeeper node process if this is a distributed cluster, otherwise, this causes
173   * master to exit doing basic clean up only.
174   * @throws IOException if something goes wrong
175   */
176  public abstract void killZkNode(ServerName serverName) throws IOException;
177
178  /**
179   * Stops the region zookeeper if this is a distributed cluster, otherwise silently logs warning
180   * message.
181   * @throws IOException if something goes wrong
182   */
183  public abstract void stopZkNode(ServerName serverName) throws IOException;
184
185  /**
186   * Wait for the specified zookeeper node to join the cluster
187   * @throws IOException if something goes wrong or timeout occurs
188   */
189  public abstract void waitForZkNodeToStart(ServerName serverName, long timeout) throws IOException;
190
191  /**
192   * Wait for the specified zookeeper node to stop the thread / process.
193   * @throws IOException if something goes wrong or timeout occurs
194   */
195  public abstract void waitForZkNodeToStop(ServerName serverName, long timeout) throws IOException;
196
197  /**
198   * Starts a new datanode on the given hostname or if this is a mini/local cluster, silently logs
199   * warning message.
200   * @throws IOException if something goes wrong
201   */
202  public abstract void startDataNode(ServerName serverName) throws IOException;
203
204  /**
205   * Kills the datanode process if this is a distributed cluster, otherwise, this causes master to
206   * exit doing basic clean up only.
207   * @throws IOException if something goes wrong
208   */
209  public abstract void killDataNode(ServerName serverName) throws IOException;
210
211  /**
212   * Stops the datanode if this is a distributed cluster, otherwise silently logs warning message.
213   * @throws IOException if something goes wrong
214   */
215  public abstract void stopDataNode(ServerName serverName) throws IOException;
216
217  /**
218   * Wait for the specified datanode to join the cluster
219   * @throws IOException if something goes wrong or timeout occurs
220   */
221  public abstract void waitForDataNodeToStart(ServerName serverName, long timeout)
222    throws IOException;
223
224  /**
225   * Wait for the specified datanode to stop the thread / process.
226   * @throws IOException if something goes wrong or timeout occurs
227   */
228  public abstract void waitForDataNodeToStop(ServerName serverName, long timeout)
229    throws IOException;
230
231  /**
232   * Starts a new namenode on the given hostname or if this is a mini/local cluster, silently logs
233   * warning message.
234   * @throws IOException if something goes wrong
235   */
236  public abstract void startNameNode(ServerName serverName) throws IOException;
237
238  /**
239   * Kills the namenode process if this is a distributed cluster, otherwise, this causes master to
240   * exit doing basic clean up only.
241   * @throws IOException if something goes wrong
242   */
243  public abstract void killNameNode(ServerName serverName) throws IOException;
244
245  /**
246   * Stops the namenode if this is a distributed cluster, otherwise silently logs warning message.
247   * @throws IOException if something goes wrong
248   */
249  public abstract void stopNameNode(ServerName serverName) throws IOException;
250
251  /**
252   * Wait for the specified namenode to join the cluster
253   * @throws IOException if something goes wrong or timeout occurs
254   */
255  public abstract void waitForNameNodeToStart(ServerName serverName, long timeout)
256    throws IOException;
257
258  /**
259   * Wait for the specified namenode to stop
260   * @throws IOException if something goes wrong or timeout occurs
261   */
262  public abstract void waitForNameNodeToStop(ServerName serverName, long timeout)
263    throws IOException;
264
265  /**
266   * Starts a new journalnode on the given hostname or if this is a mini/local cluster, silently
267   * logs warning message.
268   * @throws IOException if something goes wrong
269   */
270  public abstract void startJournalNode(ServerName serverName) throws IOException;
271
272  /**
273   * Kills the journalnode process if this is a distributed cluster, otherwise, this causes master
274   * to exit doing basic clean up only.
275   * @throws IOException if something goes wrong
276   */
277  public abstract void killJournalNode(ServerName serverName) throws IOException;
278
279  /**
280   * Stops the journalnode if this is a distributed cluster, otherwise silently logs warning
281   * message.
282   * @throws IOException if something goes wrong
283   */
284  public abstract void stopJournalNode(ServerName serverName) throws IOException;
285
286  /**
287   * Wait for the specified journalnode to join the cluster
288   * @throws IOException if something goes wrong or timeout occurs
289   */
290  public abstract void waitForJournalNodeToStart(ServerName serverName, long timeout)
291    throws IOException;
292
293  /**
294   * Wait for the specified journalnode to stop
295   * @throws IOException if something goes wrong or timeout occurs
296   */
297  public abstract void waitForJournalNodeToStop(ServerName serverName, long timeout)
298    throws IOException;
299
300  /**
301   * Starts a new master on the given hostname or if this is a mini/local cluster, starts a master
302   * locally.
303   * @param hostname the hostname to start the master on
304   * @throws IOException if something goes wrong
305   */
306  public abstract void startMaster(String hostname, int port) throws IOException;
307
308  /**
309   * Kills the master process if this is a distributed cluster, otherwise, this causes master to
310   * exit doing basic clean up only.
311   * @throws IOException if something goes wrong
312   */
313  public abstract void killMaster(ServerName serverName) throws IOException;
314
315  /**
316   * Stops the given master, by attempting a gradual stop.
317   * @throws IOException if something goes wrong
318   */
319  public abstract void stopMaster(ServerName serverName) throws IOException;
320
321  /**
322   * Wait for the specified master to stop the thread / process.
323   * @throws IOException if something goes wrong or timeout occurs
324   */
325  public abstract void waitForMasterToStop(ServerName serverName, long timeout) throws IOException;
326
327  /**
328   * Blocks until there is an active master and that master has completed initialization.
329   * @return true if an active master becomes available. false if there are no masters left.
330   * @throws IOException if something goes wrong or timeout occurs
331   */
332  public boolean waitForActiveAndReadyMaster() throws IOException {
333    return waitForActiveAndReadyMaster(Long.MAX_VALUE);
334  }
335
336  /**
337   * Blocks until there is an active master and that master has completed initialization.
338   * @param timeout the timeout limit in ms
339   * @return true if an active master becomes available. false if there are no masters left.
340   */
341  public abstract boolean waitForActiveAndReadyMaster(long timeout) throws IOException;
342
343  /**
344   * Wait for HBase Cluster to shut down.
345   */
346  public abstract void waitUntilShutDown() throws IOException;
347
348  /**
349   * Shut down the HBase cluster
350   */
351  public abstract void shutdown() throws IOException;
352
353  /**
354   * Restores the cluster to it's initial state if this is a real cluster, otherwise does nothing.
355   * This is a best effort restore. If the servers are not reachable, or insufficient permissions,
356   * etc. restoration might be partial.
357   * @return whether restoration is complete
358   */
359  public boolean restoreInitialStatus() throws IOException {
360    return restoreClusterMetrics(getInitialClusterMetrics());
361  }
362
363  /**
364   * Restores the cluster to given state if this is a real cluster, otherwise does nothing. This is
365   * a best effort restore. If the servers are not reachable, or insufficient permissions, etc.
366   * restoration might be partial.
367   * @return whether restoration is complete
368   */
369  public boolean restoreClusterMetrics(ClusterMetrics desiredStatus) throws IOException {
370    return true;
371  }
372
373  /**
374   * Get the ServerName of region server serving the first hbase:meta region
375   */
376  public ServerName getServerHoldingMeta() throws IOException {
377    return getServerHoldingRegion(TableName.META_TABLE_NAME,
378      RegionInfoBuilder.FIRST_META_REGIONINFO.getRegionName());
379  }
380
381  /**
382   * Get the ServerName of region server serving the specified region
383   * @param regionName Name of the region in bytes
384   * @param tn         Table name that has the region.
385   * @return ServerName that hosts the region or null
386   */
387  public abstract ServerName getServerHoldingRegion(final TableName tn, byte[] regionName)
388    throws IOException;
389
390  /**
391   * @return whether we are interacting with a distributed cluster as opposed to an in-process
392   *         mini/local cluster.
393   */
394  public boolean isDistributedCluster() {
395    return false;
396  }
397
398  /**
399   * Closes all the resources held open for this cluster. Note that this call does not shutdown the
400   * cluster.
401   * @see #shutdown()
402   */
403  @Override
404  public abstract void close() throws IOException;
405
406  /**
407   * Wait for the namenode.
408   */
409  public void waitForNamenodeAvailable() throws InterruptedException {
410  }
411
412  public void waitForDatanodesRegistered(int nbDN) throws Exception {
413  }
414}