001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase;
019
import java.io.Closeable;
import java.io.IOException;
import java.io.InterruptedIOException;

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.client.RegionInfoBuilder;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.AdminService;
import org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos.ClientService;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.MasterService;
034
035/**
036 * This class defines methods that can help with managing HBase clusters
037 * from unit tests and system tests. There are 3 types of cluster deployments:
038 * <ul>
039 * <li><b>MiniHBaseCluster:</b> each server is run in the same JVM in separate threads,
040 * used by unit tests</li>
041 * <li><b>DistributedHBaseCluster:</b> the cluster is pre-deployed, system and integration tests can
042 * interact with the cluster. </li>
043 * <li><b>ProcessBasedLocalHBaseCluster:</b> each server is deployed locally but in separate
044 * JVMs. </li>
045 * </ul>
046 * <p>
047 * HBaseCluster unifies the way tests interact with the cluster, so that the same test can
048 * be run against a mini-cluster during unit test execution, or a distributed cluster having
049 * tens/hundreds of nodes during execution of integration tests.
050 *
051 * <p>
052 * HBaseCluster exposes client-side public interfaces to tests, so that tests does not assume
053 * running in a particular mode. Not all the tests are suitable to be run on an actual cluster,
054 * and some tests will still need to mock stuff and introspect internal state. For those use
055 * cases from unit tests, or if more control is needed, you can use the subclasses directly.
056 * In that sense, this class does not abstract away <strong>every</strong> interface that
057 * MiniHBaseCluster or DistributedHBaseCluster provide.
058 */
059@InterfaceAudience.Private
060public abstract class HBaseCluster implements Closeable, Configurable {
061  // Log is being used in DistributedHBaseCluster class, hence keeping it as package scope
062  static final Logger LOG = LoggerFactory.getLogger(HBaseCluster.class.getName());
063  protected Configuration conf;
064
065  /** the status of the cluster before we begin */
066  protected ClusterMetrics initialClusterStatus;
067
068  /**
069   * Construct an HBaseCluster
070   * @param conf Configuration to be used for cluster
071   */
072  public HBaseCluster(Configuration conf) {
073    setConf(conf);
074  }
075
076  @Override
077  public void setConf(Configuration conf) {
078    this.conf = conf;
079  }
080
081  @Override
082  public Configuration getConf() {
083    return conf;
084  }
085
086  /**
087   * Returns a ClusterMetrics for this HBase cluster.
088   * @see #getInitialClusterMetrics()
089   */
090  public abstract ClusterMetrics getClusterMetrics() throws IOException;
091
092  /**
093   * Returns a ClusterStatus for this HBase cluster as observed at the
094   * starting of the HBaseCluster
095   */
096  public ClusterMetrics getInitialClusterMetrics() throws IOException {
097    return initialClusterStatus;
098  }
099
100  /**
101   * Returns an {@link MasterService.BlockingInterface} to the active master
102   */
103  public abstract MasterService.BlockingInterface getMasterAdminService()
104  throws IOException;
105
106  /**
107   * Returns an AdminProtocol interface to the regionserver
108   */
109  public abstract AdminService.BlockingInterface getAdminProtocol(ServerName serverName)
110  throws IOException;
111
112  /**
113   * Returns a ClientProtocol interface to the regionserver
114   */
115  public abstract ClientService.BlockingInterface getClientProtocol(ServerName serverName)
116  throws IOException;
117
118  /**
119   * Starts a new region server on the given hostname or if this is a mini/local cluster,
120   * starts a region server locally.
121   * @param hostname the hostname to start the regionserver on
122   * @throws IOException if something goes wrong
123   */
124  public abstract void startRegionServer(String hostname, int port) throws IOException;
125
126  /**
127   * Kills the region server process if this is a distributed cluster, otherwise
128   * this causes the region server to exit doing basic clean up only.
129   * @throws IOException if something goes wrong
130   */
131  public abstract void killRegionServer(ServerName serverName) throws IOException;
132
133  /**
134   * Keeping track of killed servers and being able to check if a particular server was killed makes
135   * it possible to do fault tolerance testing for dead servers in a deterministic way. A concrete
136   * example of such case is - killing servers and waiting for all regions of a particular table
137   * to be assigned. We can check for server column in META table and that its value is not one
138   * of the killed servers.
139   */
140  public abstract boolean isKilledRS(ServerName serverName);
141
142  /**
143   * Stops the given region server, by attempting a gradual stop.
144   * @throws IOException if something goes wrong
145   */
146  public abstract void stopRegionServer(ServerName serverName) throws IOException;
147
148  /**
149   * Wait for the specified region server to join the cluster
150   * @throws IOException if something goes wrong or timeout occurs
151   */
152  public void waitForRegionServerToStart(String hostname, int port, long timeout)
153      throws IOException {
154    long start = System.currentTimeMillis();
155    while ((System.currentTimeMillis() - start) < timeout) {
156      for (ServerName server : getClusterMetrics().getLiveServerMetrics().keySet()) {
157        if (server.getHostname().equals(hostname) && server.getPort() == port) {
158          return;
159        }
160      }
161      Threads.sleep(100);
162    }
163    throw new IOException("did timeout " + timeout + "ms waiting for region server to start: "
164        + hostname);
165  }
166
167  /**
168   * Wait for the specified region server to stop the thread / process.
169   * @throws IOException if something goes wrong or timeout occurs
170   */
171  public abstract void waitForRegionServerToStop(ServerName serverName, long timeout)
172      throws IOException;
173
174  /**
175   * Suspend the region server
176   * @param serverName the hostname to suspend the regionserver on
177   * @throws IOException if something goes wrong
178   */
179  public abstract void suspendRegionServer(ServerName serverName) throws IOException;
180
181  /**
182   * Resume the region server
183   * @param serverName the hostname to resume the regionserver on
184   * @throws IOException if something goes wrong
185   */
186  public abstract void resumeRegionServer(ServerName serverName) throws IOException;
187
188  /**
189   * Starts a new zookeeper node on the given hostname or if this is a mini/local cluster,
190   * silently logs warning message.
191   * @param hostname the hostname to start the regionserver on
192   * @throws IOException if something goes wrong
193   */
194  public abstract void startZkNode(String hostname, int port) throws IOException;
195
196  /**
197   * Kills the zookeeper node process if this is a distributed cluster, otherwise,
198   * this causes master to exit doing basic clean up only.
199   * @throws IOException if something goes wrong
200   */
201  public abstract void killZkNode(ServerName serverName) throws IOException;
202
203  /**
204   * Stops the region zookeeper if this is a distributed cluster, otherwise
205   * silently logs warning message.
206   * @throws IOException if something goes wrong
207   */
208  public abstract void stopZkNode(ServerName serverName) throws IOException;
209
210  /**
211   * Wait for the specified zookeeper node to join the cluster
212   * @throws IOException if something goes wrong or timeout occurs
213   */
214  public abstract void waitForZkNodeToStart(ServerName serverName, long timeout)
215    throws IOException;
216
217  /**
218   * Wait for the specified zookeeper node to stop the thread / process.
219   * @throws IOException if something goes wrong or timeout occurs
220   */
221  public abstract void waitForZkNodeToStop(ServerName serverName, long timeout)
222    throws IOException;
223
224  /**
225   * Starts a new datanode on the given hostname or if this is a mini/local cluster,
226   * silently logs warning message.
227   * @throws IOException if something goes wrong
228   */
229  public abstract void startDataNode(ServerName serverName) throws IOException;
230
231  /**
232   * Kills the datanode process if this is a distributed cluster, otherwise,
233   * this causes master to exit doing basic clean up only.
234   * @throws IOException if something goes wrong
235   */
236  public abstract void killDataNode(ServerName serverName) throws IOException;
237
238  /**
239   * Stops the datanode if this is a distributed cluster, otherwise
240   * silently logs warning message.
241   * @throws IOException if something goes wrong
242   */
243  public abstract void stopDataNode(ServerName serverName) throws IOException;
244
245  /**
246   * Wait for the specified datanode to join the cluster
247   * @throws IOException if something goes wrong or timeout occurs
248   */
249  public abstract void waitForDataNodeToStart(ServerName serverName, long timeout)
250    throws IOException;
251
252  /**
253   * Wait for the specified datanode to stop the thread / process.
254   * @throws IOException if something goes wrong or timeout occurs
255   */
256  public abstract void waitForDataNodeToStop(ServerName serverName, long timeout)
257    throws IOException;
258
259  /**
260   * Starts a new namenode on the given hostname or if this is a mini/local cluster, silently logs
261   * warning message.
262   * @throws IOException if something goes wrong
263   */
264  public abstract void startNameNode(ServerName serverName) throws IOException;
265
266  /**
267   * Kills the namenode process if this is a distributed cluster, otherwise, this causes master to
268   * exit doing basic clean up only.
269   * @throws IOException if something goes wrong
270   */
271  public abstract void killNameNode(ServerName serverName) throws IOException;
272
273  /**
274   * Stops the namenode if this is a distributed cluster, otherwise silently logs warning message.
275   * @throws IOException if something goes wrong
276   */
277  public abstract void stopNameNode(ServerName serverName) throws IOException;
278
279  /**
280   * Wait for the specified namenode to join the cluster
281   * @throws IOException if something goes wrong or timeout occurs
282   */
283  public abstract void waitForNameNodeToStart(ServerName serverName, long timeout)
284      throws IOException;
285
286  /**
287   * Wait for the specified namenode to stop
288   * @throws IOException if something goes wrong or timeout occurs
289   */
290  public abstract void waitForNameNodeToStop(ServerName serverName, long timeout)
291      throws IOException;
292
293  /**
294   * Starts a new master on the given hostname or if this is a mini/local cluster,
295   * starts a master locally.
296   * @param hostname the hostname to start the master on
297   * @throws IOException if something goes wrong
298   */
299  public abstract void startMaster(String hostname, int port) throws IOException;
300
301  /**
302   * Kills the master process if this is a distributed cluster, otherwise,
303   * this causes master to exit doing basic clean up only.
304   * @throws IOException if something goes wrong
305   */
306  public abstract void killMaster(ServerName serverName) throws IOException;
307
308  /**
309   * Stops the given master, by attempting a gradual stop.
310   * @throws IOException if something goes wrong
311   */
312  public abstract void stopMaster(ServerName serverName) throws IOException;
313
314  /**
315   * Wait for the specified master to stop the thread / process.
316   * @throws IOException if something goes wrong or timeout occurs
317   */
318  public abstract void waitForMasterToStop(ServerName serverName, long timeout)
319      throws IOException;
320
321  /**
322   * Blocks until there is an active master and that master has completed
323   * initialization.
324   *
325   * @return true if an active master becomes available.  false if there are no
326   *         masters left.
327   * @throws IOException if something goes wrong or timeout occurs
328   */
329  public boolean waitForActiveAndReadyMaster()
330      throws IOException {
331    return waitForActiveAndReadyMaster(Long.MAX_VALUE);
332  }
333
334  /**
335   * Blocks until there is an active master and that master has completed
336   * initialization.
337   * @param timeout the timeout limit in ms
338   * @return true if an active master becomes available.  false if there are no
339   *         masters left.
340   */
341  public abstract boolean waitForActiveAndReadyMaster(long timeout)
342      throws IOException;
343
344  /**
345   * Wait for HBase Cluster to shut down.
346   */
347  public abstract void waitUntilShutDown() throws IOException;
348
349  /**
350   * Shut down the HBase cluster
351   */
352  public abstract void shutdown() throws IOException;
353
354  /**
355   * Restores the cluster to it's initial state if this is a real cluster,
356   * otherwise does nothing.
357   * This is a best effort restore. If the servers are not reachable, or insufficient
358   * permissions, etc. restoration might be partial.
359   * @return whether restoration is complete
360   */
361  public boolean restoreInitialStatus() throws IOException {
362    return restoreClusterMetrics(getInitialClusterMetrics());
363  }
364
365  /**
366   * Restores the cluster to given state if this is a real cluster,
367   * otherwise does nothing.
368   * This is a best effort restore. If the servers are not reachable, or insufficient
369   * permissions, etc. restoration might be partial.
370   * @return whether restoration is complete
371   */
372  public boolean restoreClusterMetrics(ClusterMetrics desiredStatus) throws IOException {
373    return true;
374  }
375
376  /**
377   * Get the ServerName of region server serving the first hbase:meta region
378   */
379  public ServerName getServerHoldingMeta() throws IOException {
380    return getServerHoldingRegion(TableName.META_TABLE_NAME,
381      RegionInfoBuilder.FIRST_META_REGIONINFO.getRegionName());
382  }
383
384  /**
385   * Get the ServerName of region server serving the specified region
386   * @param regionName Name of the region in bytes
387   * @param tn Table name that has the region.
388   * @return ServerName that hosts the region or null
389   */
390  public abstract ServerName getServerHoldingRegion(final TableName tn, byte[] regionName)
391      throws IOException;
392
393  /**
394   * @return whether we are interacting with a distributed cluster as opposed to an
395   * in-process mini/local cluster.
396   */
397  public boolean isDistributedCluster() {
398    return false;
399  }
400
401  /**
402   * Closes all the resources held open for this cluster. Note that this call does not shutdown
403   * the cluster.
404   * @see #shutdown()
405   */
406  @Override
407  public abstract void close() throws IOException;
408
409  /**
410   * Wait for the namenode.
411   *
412   * @throws InterruptedException
413   */
414  public void waitForNamenodeAvailable() throws InterruptedException {
415  }
416
417  public void waitForDatanodesRegistered(int nbDN) throws Exception {
418  }
419}