/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase;

import java.io.Closeable;
import java.io.IOException;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.client.RegionInfoBuilder;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.AdminService;
import org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos.ClientService;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.MasterService;

/**
 * This class defines methods that can help with managing HBase clusters
 * from unit tests and system tests. There are 3 types of cluster deployments:
 * <ul>
 * <li><b>MiniHBaseCluster:</b> each server is run in the same JVM in separate threads,
 * used by unit tests</li>
 * <li><b>DistributedHBaseCluster:</b> the cluster is pre-deployed, system and integration tests can
 * interact with the cluster. </li>
 * <li><b>ProcessBasedLocalHBaseCluster:</b> each server is deployed locally but in separate
 * JVMs. </li>
 * </ul>
 * <p>
 * HBaseCluster unifies the way tests interact with the cluster, so that the same test can
 * be run against a mini-cluster during unit test execution, or a distributed cluster having
 * tens/hundreds of nodes during execution of integration tests.
 *
 * <p>
 * HBaseCluster exposes client-side public interfaces to tests, so that tests do not assume
 * running in a particular mode. Not all the tests are suitable to be run on an actual cluster,
 * and some tests will still need to mock stuff and introspect internal state. For those use
 * cases from unit tests, or if more control is needed, you can use the subclasses directly.
 * In that sense, this class does not abstract away <strong>every</strong> interface that
 * MiniHBaseCluster or DistributedHBaseCluster provide.
 */
@InterfaceAudience.Private
public abstract class HBaseCluster implements Closeable, Configurable {
  // Log is being used in DistributedHBaseCluster class, hence keeping it as package scope
  static final Logger LOG = LoggerFactory.getLogger(HBaseCluster.class.getName());
  protected Configuration conf;

  /** the status of the cluster before we begin */
  protected ClusterMetrics initialClusterStatus;

  /**
   * Construct an HBaseCluster
   * @param conf Configuration to be used for cluster
   */
  public HBaseCluster(Configuration conf) {
    // Goes through setConf(), so a subclass override of setConf() is invoked during construction.
    setConf(conf);
  }

  @Override
  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  @Override
  public Configuration getConf() {
    return conf;
  }

  /**
   * Returns a ClusterMetrics for this HBase cluster.
   * @see #getInitialClusterMetrics()
   */
  public abstract ClusterMetrics getClusterMetrics() throws IOException;

  /**
   * Returns the ClusterMetrics for this HBase cluster as observed at the
   * start of the HBaseCluster. This base implementation simply returns the
   * {@link #initialClusterStatus} field, which is {@code null} unless something has assigned it.
   */
  public ClusterMetrics getInitialClusterMetrics() throws IOException {
    return initialClusterStatus;
  }

  /**
   * Returns an {@link MasterService.BlockingInterface} to the active master
   */
  public abstract MasterService.BlockingInterface getMasterAdminService()
    throws IOException;

  /**
   * Returns an AdminProtocol interface to the regionserver
   */
  public abstract AdminService.BlockingInterface getAdminProtocol(ServerName serverName)
    throws IOException;

  /**
   * Returns a ClientProtocol interface to the regionserver
   */
  public abstract ClientService.BlockingInterface getClientProtocol(ServerName serverName)
    throws IOException;

  /**
   * Starts a new region server on the given hostname or if this is a mini/local cluster,
   * starts a region server locally.
   * @param hostname the hostname to start the regionserver on
   * @param port the port to start the regionserver on
   * @throws IOException if something goes wrong
   */
  public abstract void startRegionServer(String hostname, int port) throws IOException;

  /**
   * Kills the region server process if this is a distributed cluster, otherwise
   * this causes the region server to exit doing basic clean up only.
   * @throws IOException if something goes wrong
   */
  public abstract void killRegionServer(ServerName serverName) throws IOException;

  /**
   * Keeping track of killed servers and being able to check if a particular server was killed makes
   * it possible to do fault tolerance testing for dead servers in a deterministic way. A concrete
   * example of such case is - killing servers and waiting for all regions of a particular table
   * to be assigned. We can check for server column in META table and that its value is not one
   * of the killed servers.
   */
  public abstract boolean isKilledRS(ServerName serverName);

  /**
   * Stops the given region server, by attempting a gradual stop.
   * @throws IOException if something goes wrong
   */
  public abstract void stopRegionServer(ServerName serverName) throws IOException;

  /**
   * Wait for the specified region server to join the cluster. Repeatedly checks the live servers
   * reported by {@link #getClusterMetrics()} for one whose hostname and port both match, calling
   * {@code Threads.sleep(100)} between checks. If {@code timeout} is zero or negative no check is
   * made and the method fails immediately.
   * @param hostname the hostname of the region server to wait for
   * @param port the port of the region server to wait for
   * @param timeout the maximum time to wait, in milliseconds
   * @throws IOException if something goes wrong or timeout occurs
   */
  public void waitForRegionServerToStart(String hostname, int port, long timeout)
    throws IOException {
    long start = System.currentTimeMillis();
    while ((System.currentTimeMillis() - start) < timeout) {
      for (ServerName server : getClusterMetrics().getLiveServerMetrics().keySet()) {
        if (server.getHostname().equals(hostname) && server.getPort() == port) {
          return;
        }
      }
      Threads.sleep(100);
    }
    // Report the port as well: the match above is on hostname AND port, and several region
    // servers may share one hostname.
    throw new IOException("did timeout " + timeout + "ms waiting for region server to start: "
      + hostname + ":" + port);
  }

  /**
   * Wait for the specified region server to stop the thread / process.
   * @throws IOException if something goes wrong or timeout occurs
   */
  public abstract void waitForRegionServerToStop(ServerName serverName, long timeout)
    throws IOException;

  /**
   * Starts a new zookeeper node on the given hostname or if this is a mini/local cluster,
   * silently logs warning message.
   * @param hostname the hostname to start the zookeeper node on
   * @param port the port to start the zookeeper node on
   * @throws IOException if something goes wrong
   */
  public abstract void startZkNode(String hostname, int port) throws IOException;

  // NOTE(review): "this causes master to exit" below reads like text copied from killMaster();
  // confirm the non-distributed behavior against the subclasses.
  /**
   * Kills the zookeeper node process if this is a distributed cluster, otherwise,
   * this causes master to exit doing basic clean up only.
   * @throws IOException if something goes wrong
   */
  public abstract void killZkNode(ServerName serverName) throws IOException;

  /**
   * Stops the zookeeper node if this is a distributed cluster, otherwise
   * silently logs warning message.
   * @throws IOException if something goes wrong
   */
  public abstract void stopZkNode(ServerName serverName) throws IOException;

  /**
   * Wait for the specified zookeeper node to join the cluster
   * @throws IOException if something goes wrong or timeout occurs
   */
  public abstract void waitForZkNodeToStart(ServerName serverName, long timeout)
    throws IOException;

  /**
   * Wait for the specified zookeeper node to stop the thread / process.
   * @throws IOException if something goes wrong or timeout occurs
   */
  public abstract void waitForZkNodeToStop(ServerName serverName, long timeout)
    throws IOException;

  /**
   * Starts a new datanode on the given hostname or if this is a mini/local cluster,
   * silently logs warning message.
   * @throws IOException if something goes wrong
   */
  public abstract void startDataNode(ServerName serverName) throws IOException;

  // NOTE(review): "this causes master to exit" below reads like text copied from killMaster();
  // confirm the non-distributed behavior against the subclasses.
  /**
   * Kills the datanode process if this is a distributed cluster, otherwise,
   * this causes master to exit doing basic clean up only.
   * @throws IOException if something goes wrong
   */
  public abstract void killDataNode(ServerName serverName) throws IOException;

  /**
   * Stops the datanode if this is a distributed cluster, otherwise
   * silently logs warning message.
   * @throws IOException if something goes wrong
   */
  public abstract void stopDataNode(ServerName serverName) throws IOException;

  /**
   * Wait for the specified datanode to join the cluster
   * @throws IOException if something goes wrong or timeout occurs
   */
  public abstract void waitForDataNodeToStart(ServerName serverName, long timeout)
    throws IOException;

  /**
   * Wait for the specified datanode to stop the thread / process.
   * @throws IOException if something goes wrong or timeout occurs
   */
  public abstract void waitForDataNodeToStop(ServerName serverName, long timeout)
    throws IOException;

  /**
   * Starts a new master on the given hostname or if this is a mini/local cluster,
   * starts a master locally.
   * @param hostname the hostname to start the master on
   * @param port the port to start the master on
   * @throws IOException if something goes wrong
   */
  public abstract void startMaster(String hostname, int port) throws IOException;

  /**
   * Kills the master process if this is a distributed cluster, otherwise,
   * this causes master to exit doing basic clean up only.
   * @throws IOException if something goes wrong
   */
  public abstract void killMaster(ServerName serverName) throws IOException;

  /**
   * Stops the given master, by attempting a gradual stop.
   * @throws IOException if something goes wrong
   */
  public abstract void stopMaster(ServerName serverName) throws IOException;

  /**
   * Wait for the specified master to stop the thread / process.
   * @throws IOException if something goes wrong or timeout occurs
   */
  public abstract void waitForMasterToStop(ServerName serverName, long timeout)
    throws IOException;

  /**
   * Blocks until there is an active master and that master has completed
   * initialization. Delegates to {@link #waitForActiveAndReadyMaster(long)} with
   * {@link Long#MAX_VALUE} as the timeout.
   *
   * @return true if an active master becomes available. false if there are no
   *         masters left.
   * @throws IOException if something goes wrong or timeout occurs
   */
  public boolean waitForActiveAndReadyMaster()
    throws IOException {
    return waitForActiveAndReadyMaster(Long.MAX_VALUE);
  }

  /**
   * Blocks until there is an active master and that master has completed
   * initialization.
   * @param timeout the timeout limit in ms
   * @return true if an active master becomes available. false if there are no
   *         masters left.
   */
  public abstract boolean waitForActiveAndReadyMaster(long timeout)
    throws IOException;

  /**
   * Wait for HBase Cluster to shut down.
   */
  public abstract void waitUntilShutDown() throws IOException;

  /**
   * Shut down the HBase cluster
   */
  public abstract void shutdown() throws IOException;

  /**
   * Restores the cluster to its initial state if this is a real cluster,
   * otherwise does nothing.
   * This is a best effort restore. If the servers are not reachable, or insufficient
   * permissions, etc. restoration might be partial.
   * Delegates to {@link #restoreClusterMetrics(ClusterMetrics)} with the value of
   * {@link #getInitialClusterMetrics()}.
   * @return whether restoration is complete
   */
  public boolean restoreInitialStatus() throws IOException {
    return restoreClusterMetrics(getInitialClusterMetrics());
  }

  /**
   * Restores the cluster to given state if this is a real cluster,
   * otherwise does nothing.
   * This is a best effort restore. If the servers are not reachable, or insufficient
   * permissions, etc. restoration might be partial.
   * This base implementation performs no work and reports success.
   * @param desiredStatus the state to restore the cluster to; ignored by this base implementation
   * @return whether restoration is complete
   */
  public boolean restoreClusterMetrics(ClusterMetrics desiredStatus) throws IOException {
    return true;
  }

  /**
   * Get the ServerName of region server serving the first hbase:meta region
   */
  public ServerName getServerHoldingMeta() throws IOException {
    return getServerHoldingRegion(TableName.META_TABLE_NAME,
      RegionInfoBuilder.FIRST_META_REGIONINFO.getRegionName());
  }

  /**
   * Get the ServerName of region server serving the specified region
   * @param tn Table name that has the region.
   * @param regionName Name of the region in bytes
   * @return ServerName that hosts the region or null
   */
  public abstract ServerName getServerHoldingRegion(final TableName tn, byte[] regionName)
    throws IOException;

  /**
   * @return whether we are interacting with a distributed cluster as opposed to an
   *         in-process mini/local cluster. This base implementation returns {@code false}.
   */
  public boolean isDistributedCluster() {
    return false;
  }

  /**
   * Closes all the resources held open for this cluster. Note that this call does not shutdown
   * the cluster.
   * @see #shutdown()
   */
  @Override
  public abstract void close() throws IOException;

  /**
   * Wait for the namenode. This base implementation does nothing and returns immediately.
   *
   * @throws InterruptedException never thrown by this base implementation; declared so that
   *           overriding implementations may throw it
   */
  public void waitForNamenodeAvailable() throws InterruptedException {
  }

  /**
   * Wait for datanodes to be registered. This base implementation does nothing and returns
   * immediately.
   *
   * @param nbDN presumably the number of datanodes to wait for (confirm against the overriding
   *          implementations); ignored by this base implementation
   * @throws Exception never thrown by this base implementation; declared so that overriding
   *           implementations may throw it
   */
  public void waitForDatanodesRegistered(int nbDN) throws Exception {
  }
}