001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase; 019 020import java.io.Closeable; 021import java.io.IOException; 022 023import org.apache.hadoop.conf.Configurable; 024import org.apache.hadoop.conf.Configuration; 025import org.apache.hadoop.hbase.client.RegionInfoBuilder; 026import org.apache.hadoop.hbase.util.Threads; 027import org.apache.yetus.audience.InterfaceAudience; 028import org.slf4j.Logger; 029import org.slf4j.LoggerFactory; 030 031import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.AdminService; 032import org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos.ClientService; 033import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.MasterService; 034 035/** 036 * This class defines methods that can help with managing HBase clusters 037 * from unit tests and system tests. There are 3 types of cluster deployments: 038 * <ul> 039 * <li><b>MiniHBaseCluster:</b> each server is run in the same JVM in separate threads, 040 * used by unit tests</li> 041 * <li><b>DistributedHBaseCluster:</b> the cluster is pre-deployed, system and integration tests can 042 * interact with the cluster. </li> 043 * <li><b>ProcessBasedLocalHBaseCluster:</b> each server is deployed locally but in separate 044 * JVMs. </li> 045 * </ul> 046 * <p> 047 * HBaseCluster unifies the way tests interact with the cluster, so that the same test can 048 * be run against a mini-cluster during unit test execution, or a distributed cluster having 049 * tens/hundreds of nodes during execution of integration tests. 050 * 051 * <p> 052 * HBaseCluster exposes client-side public interfaces to tests, so that tests does not assume 053 * running in a particular mode. Not all the tests are suitable to be run on an actual cluster, 054 * and some tests will still need to mock stuff and introspect internal state. For those use 055 * cases from unit tests, or if more control is needed, you can use the subclasses directly. 056 * In that sense, this class does not abstract away <strong>every</strong> interface that 057 * MiniHBaseCluster or DistributedHBaseCluster provide. 058 */ 059@InterfaceAudience.Private 060public abstract class HBaseCluster implements Closeable, Configurable { 061 // Log is being used in DistributedHBaseCluster class, hence keeping it as package scope 062 static final Logger LOG = LoggerFactory.getLogger(HBaseCluster.class.getName()); 063 protected Configuration conf; 064 065 /** the status of the cluster before we begin */ 066 protected ClusterMetrics initialClusterStatus; 067 068 /** 069 * Construct an HBaseCluster 070 * @param conf Configuration to be used for cluster 071 */ 072 public HBaseCluster(Configuration conf) { 073 setConf(conf); 074 } 075 076 @Override 077 public void setConf(Configuration conf) { 078 this.conf = conf; 079 } 080 081 @Override 082 public Configuration getConf() { 083 return conf; 084 } 085 086 /** 087 * Returns a ClusterMetrics for this HBase cluster. 088 * @see #getInitialClusterMetrics() 089 */ 090 public abstract ClusterMetrics getClusterMetrics() throws IOException; 091 092 /** 093 * Returns a ClusterStatus for this HBase cluster as observed at the 094 * starting of the HBaseCluster 095 */ 096 public ClusterMetrics getInitialClusterMetrics() throws IOException { 097 return initialClusterStatus; 098 } 099 100 /** 101 * Returns an {@link MasterService.BlockingInterface} to the active master 102 */ 103 public abstract MasterService.BlockingInterface getMasterAdminService() 104 throws IOException; 105 106 /** 107 * Returns an AdminProtocol interface to the regionserver 108 */ 109 public abstract AdminService.BlockingInterface getAdminProtocol(ServerName serverName) 110 throws IOException; 111 112 /** 113 * Returns a ClientProtocol interface to the regionserver 114 */ 115 public abstract ClientService.BlockingInterface getClientProtocol(ServerName serverName) 116 throws IOException; 117 118 /** 119 * Starts a new region server on the given hostname or if this is a mini/local cluster, 120 * starts a region server locally. 121 * @param hostname the hostname to start the regionserver on 122 * @throws IOException if something goes wrong 123 */ 124 public abstract void startRegionServer(String hostname, int port) throws IOException; 125 126 /** 127 * Kills the region server process if this is a distributed cluster, otherwise 128 * this causes the region server to exit doing basic clean up only. 129 * @throws IOException if something goes wrong 130 */ 131 public abstract void killRegionServer(ServerName serverName) throws IOException; 132 133 /** 134 * Keeping track of killed servers and being able to check if a particular server was killed makes 135 * it possible to do fault tolerance testing for dead servers in a deterministic way. A concrete 136 * example of such case is - killing servers and waiting for all regions of a particular table 137 * to be assigned. We can check for server column in META table and that its value is not one 138 * of the killed servers. 139 */ 140 public abstract boolean isKilledRS(ServerName serverName); 141 142 /** 143 * Stops the given region server, by attempting a gradual stop. 144 * @throws IOException if something goes wrong 145 */ 146 public abstract void stopRegionServer(ServerName serverName) throws IOException; 147 148 /** 149 * Wait for the specified region server to join the cluster 150 * @throws IOException if something goes wrong or timeout occurs 151 */ 152 public void waitForRegionServerToStart(String hostname, int port, long timeout) 153 throws IOException { 154 long start = System.currentTimeMillis(); 155 while ((System.currentTimeMillis() - start) < timeout) { 156 for (ServerName server : getClusterMetrics().getLiveServerMetrics().keySet()) { 157 if (server.getHostname().equals(hostname) && server.getPort() == port) { 158 return; 159 } 160 } 161 Threads.sleep(100); 162 } 163 throw new IOException("did timeout " + timeout + "ms waiting for region server to start: " 164 + hostname); 165 } 166 167 /** 168 * Wait for the specified region server to stop the thread / process. 169 * @throws IOException if something goes wrong or timeout occurs 170 */ 171 public abstract void waitForRegionServerToStop(ServerName serverName, long timeout) 172 throws IOException; 173 174 /** 175 * Suspend the region server 176 * @param serverName the hostname to suspend the regionserver on 177 * @throws IOException if something goes wrong 178 */ 179 public abstract void suspendRegionServer(ServerName serverName) throws IOException; 180 181 /** 182 * Resume the region server 183 * @param serverName the hostname to resume the regionserver on 184 * @throws IOException if something goes wrong 185 */ 186 public abstract void resumeRegionServer(ServerName serverName) throws IOException; 187 188 /** 189 * Starts a new zookeeper node on the given hostname or if this is a mini/local cluster, 190 * silently logs warning message. 191 * @param hostname the hostname to start the regionserver on 192 * @throws IOException if something goes wrong 193 */ 194 public abstract void startZkNode(String hostname, int port) throws IOException; 195 196 /** 197 * Kills the zookeeper node process if this is a distributed cluster, otherwise, 198 * this causes master to exit doing basic clean up only. 199 * @throws IOException if something goes wrong 200 */ 201 public abstract void killZkNode(ServerName serverName) throws IOException; 202 203 /** 204 * Stops the region zookeeper if this is a distributed cluster, otherwise 205 * silently logs warning message. 206 * @throws IOException if something goes wrong 207 */ 208 public abstract void stopZkNode(ServerName serverName) throws IOException; 209 210 /** 211 * Wait for the specified zookeeper node to join the cluster 212 * @throws IOException if something goes wrong or timeout occurs 213 */ 214 public abstract void waitForZkNodeToStart(ServerName serverName, long timeout) 215 throws IOException; 216 217 /** 218 * Wait for the specified zookeeper node to stop the thread / process. 219 * @throws IOException if something goes wrong or timeout occurs 220 */ 221 public abstract void waitForZkNodeToStop(ServerName serverName, long timeout) 222 throws IOException; 223 224 /** 225 * Starts a new datanode on the given hostname or if this is a mini/local cluster, 226 * silently logs warning message. 227 * @throws IOException if something goes wrong 228 */ 229 public abstract void startDataNode(ServerName serverName) throws IOException; 230 231 /** 232 * Kills the datanode process if this is a distributed cluster, otherwise, 233 * this causes master to exit doing basic clean up only. 234 * @throws IOException if something goes wrong 235 */ 236 public abstract void killDataNode(ServerName serverName) throws IOException; 237 238 /** 239 * Stops the datanode if this is a distributed cluster, otherwise 240 * silently logs warning message. 241 * @throws IOException if something goes wrong 242 */ 243 public abstract void stopDataNode(ServerName serverName) throws IOException; 244 245 /** 246 * Wait for the specified datanode to join the cluster 247 * @throws IOException if something goes wrong or timeout occurs 248 */ 249 public abstract void waitForDataNodeToStart(ServerName serverName, long timeout) 250 throws IOException; 251 252 /** 253 * Wait for the specified datanode to stop the thread / process. 254 * @throws IOException if something goes wrong or timeout occurs 255 */ 256 public abstract void waitForDataNodeToStop(ServerName serverName, long timeout) 257 throws IOException; 258 259 /** 260 * Starts a new namenode on the given hostname or if this is a mini/local cluster, silently logs 261 * warning message. 262 * @throws IOException if something goes wrong 263 */ 264 public abstract void startNameNode(ServerName serverName) throws IOException; 265 266 /** 267 * Kills the namenode process if this is a distributed cluster, otherwise, this causes master to 268 * exit doing basic clean up only. 269 * @throws IOException if something goes wrong 270 */ 271 public abstract void killNameNode(ServerName serverName) throws IOException; 272 273 /** 274 * Stops the namenode if this is a distributed cluster, otherwise silently logs warning message. 275 * @throws IOException if something goes wrong 276 */ 277 public abstract void stopNameNode(ServerName serverName) throws IOException; 278 279 /** 280 * Wait for the specified namenode to join the cluster 281 * @throws IOException if something goes wrong or timeout occurs 282 */ 283 public abstract void waitForNameNodeToStart(ServerName serverName, long timeout) 284 throws IOException; 285 286 /** 287 * Wait for the specified namenode to stop 288 * @throws IOException if something goes wrong or timeout occurs 289 */ 290 public abstract void waitForNameNodeToStop(ServerName serverName, long timeout) 291 throws IOException; 292 293 /** 294 * Starts a new master on the given hostname or if this is a mini/local cluster, 295 * starts a master locally. 296 * @param hostname the hostname to start the master on 297 * @throws IOException if something goes wrong 298 */ 299 public abstract void startMaster(String hostname, int port) throws IOException; 300 301 /** 302 * Kills the master process if this is a distributed cluster, otherwise, 303 * this causes master to exit doing basic clean up only. 304 * @throws IOException if something goes wrong 305 */ 306 public abstract void killMaster(ServerName serverName) throws IOException; 307 308 /** 309 * Stops the given master, by attempting a gradual stop. 310 * @throws IOException if something goes wrong 311 */ 312 public abstract void stopMaster(ServerName serverName) throws IOException; 313 314 /** 315 * Wait for the specified master to stop the thread / process. 316 * @throws IOException if something goes wrong or timeout occurs 317 */ 318 public abstract void waitForMasterToStop(ServerName serverName, long timeout) 319 throws IOException; 320 321 /** 322 * Blocks until there is an active master and that master has completed 323 * initialization. 324 * 325 * @return true if an active master becomes available. false if there are no 326 * masters left. 327 * @throws IOException if something goes wrong or timeout occurs 328 */ 329 public boolean waitForActiveAndReadyMaster() 330 throws IOException { 331 return waitForActiveAndReadyMaster(Long.MAX_VALUE); 332 } 333 334 /** 335 * Blocks until there is an active master and that master has completed 336 * initialization. 337 * @param timeout the timeout limit in ms 338 * @return true if an active master becomes available. false if there are no 339 * masters left. 340 */ 341 public abstract boolean waitForActiveAndReadyMaster(long timeout) 342 throws IOException; 343 344 /** 345 * Wait for HBase Cluster to shut down. 346 */ 347 public abstract void waitUntilShutDown() throws IOException; 348 349 /** 350 * Shut down the HBase cluster 351 */ 352 public abstract void shutdown() throws IOException; 353 354 /** 355 * Restores the cluster to it's initial state if this is a real cluster, 356 * otherwise does nothing. 357 * This is a best effort restore. If the servers are not reachable, or insufficient 358 * permissions, etc. restoration might be partial. 359 * @return whether restoration is complete 360 */ 361 public boolean restoreInitialStatus() throws IOException { 362 return restoreClusterMetrics(getInitialClusterMetrics()); 363 } 364 365 /** 366 * Restores the cluster to given state if this is a real cluster, 367 * otherwise does nothing. 368 * This is a best effort restore. If the servers are not reachable, or insufficient 369 * permissions, etc. restoration might be partial. 370 * @return whether restoration is complete 371 */ 372 public boolean restoreClusterMetrics(ClusterMetrics desiredStatus) throws IOException { 373 return true; 374 } 375 376 /** 377 * Get the ServerName of region server serving the first hbase:meta region 378 */ 379 public ServerName getServerHoldingMeta() throws IOException { 380 return getServerHoldingRegion(TableName.META_TABLE_NAME, 381 RegionInfoBuilder.FIRST_META_REGIONINFO.getRegionName()); 382 } 383 384 /** 385 * Get the ServerName of region server serving the specified region 386 * @param regionName Name of the region in bytes 387 * @param tn Table name that has the region. 388 * @return ServerName that hosts the region or null 389 */ 390 public abstract ServerName getServerHoldingRegion(final TableName tn, byte[] regionName) 391 throws IOException; 392 393 /** 394 * @return whether we are interacting with a distributed cluster as opposed to an 395 * in-process mini/local cluster. 396 */ 397 public boolean isDistributedCluster() { 398 return false; 399 } 400 401 /** 402 * Closes all the resources held open for this cluster. Note that this call does not shutdown 403 * the cluster. 404 * @see #shutdown() 405 */ 406 @Override 407 public abstract void close() throws IOException; 408 409 /** 410 * Wait for the namenode. 411 * 412 * @throws InterruptedException 413 */ 414 public void waitForNamenodeAvailable() throws InterruptedException { 415 } 416 417 public void waitForDatanodesRegistered(int nbDN) throws Exception { 418 } 419}