/**
 * Copyright 2010 The Apache Software Foundation
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.master;

import java.io.IOException;
import java.net.InetAddress;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.ClockOutOfSyncException;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HServerAddress;
import org.apache.hadoop.hbase.HServerLoad;
import org.apache.hadoop.hbase.PleaseHoldException;
import org.apache.hadoop.hbase.Server;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.YouAreDeadException;
import org.apache.hadoop.hbase.ZooKeeperConnectionException;
import org.apache.hadoop.hbase.client.HConnection;
import org.apache.hadoop.hbase.client.HConnectionManager;
import org.apache.hadoop.hbase.client.RetriesExhaustedException;
import org.apache.hadoop.hbase.ipc.HRegionInterface;
import org.apache.hadoop.hbase.master.handler.MetaServerShutdownHandler;
import org.apache.hadoop.hbase.master.handler.ServerShutdownHandler;
import org.apache.hadoop.hbase.monitoring.MonitoredTask;
import org.apache.hadoop.hbase.regionserver.RegionOpeningState;

/**
 * The ServerManager class manages info about region servers.
 * <p>
 * Maintains lists of online and dead servers.  Processes the startups,
 * shutdowns, and deaths of region servers.
 * <p>
 * Servers are distinguished in two different ways.  A given server has a
 * location, specified by hostname and port; only one server can be online at
 * a given location at any given time.  A server instance is specified by the
 * location (hostname and port) as well as the startcode (timestamp from when
 * the server was started).  This is used to differentiate a restarted
 * instance of a given server from the original instance.
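 * <p>
 * For example (an illustrative sketch only; the host name and startcodes
 * below are made up), two {@link ServerName} instances can share a location
 * while naming different server instances:
 * <pre>{@code
 * ServerName original  = new ServerName("rs1.example.com", 60020, 1302296649469L);
 * ServerName restarted = new ServerName("rs1.example.com", 60020, 1302297012345L);
 * // Same hostname and port, different startcodes: the second names a
 * // restarted instance of the same server.
 * }</pre>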
 */
public class ServerManager {
  public static final String WAIT_ON_REGIONSERVERS_MAXTOSTART =
      "hbase.master.wait.on.regionservers.maxtostart";

  public static final String WAIT_ON_REGIONSERVERS_MINTOSTART =
      "hbase.master.wait.on.regionservers.mintostart";

  public static final String WAIT_ON_REGIONSERVERS_TIMEOUT =
      "hbase.master.wait.on.regionservers.timeout";

  public static final String WAIT_ON_REGIONSERVERS_INTERVAL =
      "hbase.master.wait.on.regionservers.interval";

  private static final Log LOG = LogFactory.getLog(ServerManager.class);

  // Set if we are to shutdown the cluster.
  private volatile boolean clusterShutdown = false;

  /** Map of registered servers to their current load */
  private final Map<ServerName, HServerLoad> onlineServers =
    new ConcurrentHashMap<ServerName, HServerLoad>();

  // TODO: This is strange to have two maps but HSI above is used on both sides
  /**
   * Map from full server-instance name to the RPC connection for this server.
   */
  private final Map<ServerName, HRegionInterface> serverConnections =
    new HashMap<ServerName, HRegionInterface>();

  /**
   * List of region servers ({@link ServerName}) that should not get any more
   * new regions.
   */
  private final ArrayList<ServerName> drainingServers =
    new ArrayList<ServerName>();

  private final Server master;
  private final MasterServices services;
  private final HConnection connection;

  private final DeadServer deadservers;

  private final long maxSkew;
  private final long warningSkew;

  /**
   * Set of region servers which are dead but not expired immediately. If a
   * server dies before the master has enabled the ServerShutdownHandler, the
   * server is added to this set and expired later by the master through
   * {@link ServerManager#expireDeadNotExpiredServers()}.
   */
  private Set<ServerName> deadNotExpiredServers = new HashSet<ServerName>();

  /**
   * Flag to enable SSH for ROOT region server. It's used in master
   * initialization to enable SSH for ROOT before META assignment.
   */
  private boolean isSSHForRootEnabled = false;

  /**
   * Constructor.
   * @param master
   * @param services
   * @throws ZooKeeperConnectionException
   */
  public ServerManager(final Server master, final MasterServices services)
      throws ZooKeeperConnectionException {
    this(master, services, true);
  }

  ServerManager(final Server master, final MasterServices services,
      final boolean connect) throws ZooKeeperConnectionException {
    this.master = master;
    this.services = services;
    Configuration c = master.getConfiguration();
    maxSkew = c.getLong("hbase.master.maxclockskew", 30000);
    warningSkew = c.getLong("hbase.master.warningclockskew", 10000);
    this.deadservers = new DeadServer();
    this.connection = connect ? HConnectionManager.getConnection(c) : null;
  }

  /**
   * Let the server manager know a new regionserver has come online.
   * @param ia The remote address
   * @param port The remote port
   * @param serverStartcode
   * @param serverCurrentTime The current time of the region server in ms
   * @return The ServerName we know this server as.
   * @throws IOException
   */
  ServerName regionServerStartup(final InetAddress ia, final int port,
    final long serverStartcode, long serverCurrentTime)
  throws IOException {
    // Test for case where we get a region startup message from a regionserver
    // that has been quickly restarted but whose znode expiration handler has
    // not yet run, or from a server whose failure we are currently processing.
    // Test if its host+port combo is already present in the online servers
    // map.  If it is, reject the server and trigger its expiration. The next
    // time it comes in, it should have been removed from the online servers
    // map and queued for processing by ProcessServerShutdown.
    ServerName sn = new ServerName(ia.getHostName(), port, serverStartcode);
    checkClockSkew(sn, serverCurrentTime);
    checkIsDead(sn, "STARTUP");
    checkAlreadySameHostPort(sn);
    recordNewServer(sn, HServerLoad.EMPTY_HSERVERLOAD);
    return sn;
  }

  /**
   * Let the server manager know a regionserver has reported in with its load.
   * @param sn The reporting server
   * @param hsl The reported load
   * @throws YouAreDeadException
   * @throws PleaseHoldException
   */
  void regionServerReport(ServerName sn, HServerLoad hsl)
  throws YouAreDeadException, PleaseHoldException {
    checkIsDead(sn, "REPORT");
    if (!this.onlineServers.containsKey(sn)) {
      // Already have this host+port combo and it's just a different start code?
      checkAlreadySameHostPort(sn);
      // Just let the server in. Presume master joining a running cluster.
      // recordNewServer is what happens at the end of reportServerStartup.
      // The only thing we are skipping is passing back to the regionserver
      // the ServerName to use. Here we presume a master has already done
      // that so we'll press on with whatever it gave us for ServerName.
      recordNewServer(sn, hsl);
    } else {
      this.onlineServers.put(sn, hsl);
    }
  }

  /**
   * Test to see if we have a server of same host and port already.
   * @param serverName
   * @throws PleaseHoldException
   */
  void checkAlreadySameHostPort(final ServerName serverName)
  throws PleaseHoldException {
    ServerName existingServer =
      ServerName.findServerWithSameHostnamePort(getOnlineServersList(), serverName);
    if (existingServer != null) {
      String message = "Server serverName=" + serverName +
        " rejected; we already have " + existingServer.toString() +
        " registered with same hostname and port";
      LOG.info(message);
      if (existingServer.getStartcode() < serverName.getStartcode()) {
        LOG.info("Triggering server recovery; existingServer " +
          existingServer + " looks stale, new server:" + serverName);
        expireServer(existingServer);
      }
      if (services.isServerShutdownHandlerEnabled()) {
        // master has completed the initialization
        throw new PleaseHoldException(message);
      }
    }
  }

  /**
   * Checks the clock skew between the server and the master. If the skew
   * exceeds the configured maximum, an exception is thrown; if it exceeds the
   * configured warning threshold, a warning is logged but the server is
   * allowed to start normally.
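   * <p>
   * For example, with the default thresholds read in the constructor
   * (30000 ms for "hbase.master.maxclockskew" and 10000 ms for
   * "hbase.master.warningclockskew"), a reported skew of 12 seconds only logs
   * a warning, while a skew of 45 seconds causes the startup to be rejected
   * with a {@link ClockOutOfSyncException}.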
   * @param serverName Incoming server's name
   * @param serverCurrentTime
   * @throws ClockOutOfSyncException if the skew exceeds the configured max value
   */
  private void checkClockSkew(final ServerName serverName, final long serverCurrentTime)
  throws ClockOutOfSyncException {
    long skew = Math.abs(System.currentTimeMillis() - serverCurrentTime);
    if (skew > maxSkew) {
      String message = "Server " + serverName + " has been " +
        "rejected; Reported time is too far out of sync with master.  " +
        "Time difference of " + skew + "ms > max allowed of " + maxSkew + "ms";
      LOG.warn(message);
      throw new ClockOutOfSyncException(message);
    } else if (skew > warningSkew) {
      String message = "Reported time for server " + serverName + " is out of sync with master " +
        "by " + skew + "ms. (Warning threshold is " + warningSkew + "ms; " +
        "error threshold is " + maxSkew + "ms)";
      LOG.warn(message);
    }
  }

  /**
   * If this server is on the dead list, reject it with a YouAreDeadException.
   * If it was dead but came back with a new start code, remove the old entry
   * from the dead list.
   * @param serverName
   * @param what START or REPORT
   * @throws YouAreDeadException
   */
  private void checkIsDead(final ServerName serverName, final String what)
      throws YouAreDeadException {
    if (this.deadservers.isDeadServer(serverName)) {
      // Host name, port and start code all match an existing entry in the
      // dead servers list.  So, this server must be dead.
      String message = "Server " + what + " rejected; currently processing " +
          serverName + " as dead server";
      LOG.debug(message);
      throw new YouAreDeadException(message);
    }

    // Remove a dead server with the same hostname and port as the newly
    // checking-in regionserver after master initialization. See HBASE-5916
    // for more information.
    if ((this.services == null || ((HMaster) this.services).isInitialized())
        && this.deadservers.cleanPreviousInstance(serverName)) {
      // This server has now become alive after we marked it as dead.
      // We removed its previous entry from the dead list to reflect it.
      LOG.debug(what + ":" + " Server " + serverName + " came back up," +
          " removed it from the dead servers list");
    }
  }

  /**
   * Adds the server to the onlineServers list.
   * @param serverName The remote server's name.
   * @param hsl The server's reported load.
   */
  void recordNewServer(final ServerName serverName, final HServerLoad hsl) {
    LOG.info("Registering server=" + serverName);
    this.onlineServers.put(serverName, hsl);
    this.serverConnections.remove(serverName);
  }

  /**
   * @param serverName
   * @return HServerLoad if serverName is known else null
   */
  public HServerLoad getLoad(final ServerName serverName) {
    return this.onlineServers.get(serverName);
  }

  /**
   * @param address
   * @return HServerLoad if the server at the given address is known else null
   * @deprecated Use {@link #getLoad(ServerName)} instead.
   */
  public HServerLoad getLoad(final HServerAddress address) {
    ServerName sn = new ServerName(address.toString(), ServerName.NON_STARTCODE);
    ServerName actual =
      ServerName.findServerWithSameHostnamePort(this.getOnlineServersList(), sn);
    return actual == null? null: getLoad(actual);
  }

  /**
   * Compute the average load across all region servers.
   * Currently, this uses a very naive computation - just uses the number of
   * regions being served, ignoring stats about number of requests.
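   * <p>
   * For example, if three servers are online hosting 10, 20 and 30 regions
   * respectively, the average load is (10 + 20 + 30) / 3 = 20.0.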
   * @return the average load
   */
  public double getAverageLoad() {
    int totalLoad = 0;
    int numServers = 0;
    double averageLoad = 0.0;
    for (HServerLoad hsl: this.onlineServers.values()) {
      numServers++;
      totalLoad += hsl.getNumberOfRegions();
    }
    // Guard against divide-by-zero (which would yield NaN) when no servers
    // have checked in yet.
    if (numServers == 0) {
      return averageLoad;
    }
    averageLoad = (double)totalLoad / (double)numServers;
    return averageLoad;
  }

  /** @return the count of active regionservers */
  int countOfRegionServers() {
    // Presumes onlineServers is a concurrent map
    return this.onlineServers.size();
  }

  /**
   * @return Read-only map of servers to their current load
   */
  public Map<ServerName, HServerLoad> getOnlineServers() {
    // Presumption is that iterating the returned Map is OK.
    synchronized (this.onlineServers) {
      return Collections.unmodifiableMap(this.onlineServers);
    }
  }

  public Set<ServerName> getDeadServers() {
    return this.deadservers.clone();
  }

  /**
   * Checks if any dead servers are currently being processed.
   * @return true if any RS are being processed as dead, false if not
   */
  public boolean areDeadServersInProgress() {
    return this.deadservers.areDeadServersInProgress();
  }

  void letRegionServersShutdown() {
    long previousLogTime = 0;
    while (!onlineServers.isEmpty()) {

      if (System.currentTimeMillis() > (previousLogTime + 1000)) {
        StringBuilder sb = new StringBuilder();
        for (ServerName key : this.onlineServers.keySet()) {
          if (sb.length() > 0) {
            sb.append(", ");
          }
          sb.append(key);
        }
        LOG.info("Waiting on regionserver(s) to go down " + sb.toString());
        previousLogTime = System.currentTimeMillis();
      }

      synchronized (onlineServers) {
        try {
          onlineServers.wait(100);
        } catch (InterruptedException ignored) {
          // continue
        }
      }
    }
  }

  /*
   * Expire the passed server.  Add it to list of deadservers and queue a
   * shutdown processing.
   */
  public synchronized void expireServer(final ServerName serverName) {
    boolean carryingRoot = services.getAssignmentManager().isCarryingRoot(serverName);
    if (!services.isServerShutdownHandlerEnabled() && (!carryingRoot || !this.isSSHForRootEnabled)) {
      LOG.info("Master doesn't enable ServerShutdownHandler during initialization, "
          + "delay expiring server " + serverName);
      this.deadNotExpiredServers.add(serverName);
      return;
    }
    if (!this.onlineServers.containsKey(serverName)) {
      LOG.warn("Received expiration of " + serverName +
        " but server is not currently online");
    }
    if (this.deadservers.contains(serverName)) {
      // TODO: Can this happen?  It shouldn't be online in this case?
      LOG.warn("Received expiration of " + serverName +
          " but server shutdown is already in progress");
      return;
    }
    // Remove the server from the known servers lists and update load info BUT
    // add to deadservers first; do this so it'll show in dead servers list if
    // not in online servers list.
    this.deadservers.add(serverName);
    this.onlineServers.remove(serverName);
    synchronized (onlineServers) {
      onlineServers.notifyAll();
    }
    this.serverConnections.remove(serverName);
    // If cluster is going down, yes, servers are going to be expiring; don't
    // process as a dead server
    if (this.clusterShutdown) {
      LOG.info("Cluster shutdown set; " + serverName +
        " expired; onlineServers=" + this.onlineServers.size());
      if (this.onlineServers.isEmpty()) {
        master.stop("Cluster shutdown set; onlineServer=0");
      }
      return;
    }

    boolean carryingMeta = services.getAssignmentManager().isCarryingMeta(serverName);
    if (carryingRoot || carryingMeta) {
      this.services.getExecutorService().submit(new MetaServerShutdownHandler(this.master,
        this.services, this.deadservers, serverName, carryingRoot, carryingMeta));
    } else {
      this.services.getExecutorService().submit(new ServerShutdownHandler(this.master,
        this.services, this.deadservers, serverName, true));
    }
    LOG.debug("Added=" + serverName +
      " to dead servers, submitted shutdown handler to be executed, root=" +
        carryingRoot + ", meta=" + carryingMeta);
  }

  /**
   * Expire the servers which died during master's initialization. It will be
   * called after HMaster#assignRootAndMeta.
   * @throws IOException
   */
  synchronized void expireDeadNotExpiredServers() throws IOException {
    if (!services.isServerShutdownHandlerEnabled()) {
      throw new IOException("Master hasn't enabled ServerShutdownHandler ");
    }
    Iterator<ServerName> serverIterator = deadNotExpiredServers.iterator();
    while (serverIterator.hasNext()) {
      expireServer(serverIterator.next());
      serverIterator.remove();
    }
  }

  /**
   * Enable SSH for the ROOT region server, and expire the server carrying
   * ROOT if it died during master's initialization. It will be called before
   * META assignment.
   * @throws IOException
   */
  void enableSSHForRoot() throws IOException {
    if (this.isSSHForRootEnabled) {
      return;
    }
    this.isSSHForRootEnabled = true;
    Iterator<ServerName> serverIterator = deadNotExpiredServers.iterator();
    while (serverIterator.hasNext()) {
      ServerName curServerName = serverIterator.next();
      if (services.getAssignmentManager().isCarryingRoot(curServerName)) {
        expireServer(curServerName);
        serverIterator.remove();
      }
    }
  }

  /**
   * Reset flag isSSHForRootEnabled to false
   */
  void disableSSHForRoot() {
    this.isSSHForRootEnabled = false;
  }

  /*
   * Remove the server from the drain list.
   */
  public boolean removeServerFromDrainList(final ServerName sn) {
    // Warn if the server (sn) is not online.  ServerName is of the form:
    // <hostname> , <port> , <startcode>

    if (!this.isServerOnline(sn)) {
      LOG.warn("Server " + sn + " is not currently online. " +
               "Removing from draining list anyway, as requested.");
    }
    // Remove the server from the draining servers lists.
    return this.drainingServers.remove(sn);
  }

  /*
   * Add the server to the drain list.
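   *
   * A hedged usage sketch (the serverManager and sn names below are
   * illustrative, not defined in this file): drain a server ahead of
   * maintenance, then remove it from the list once it is back.
   *
   *   serverManager.addServerToDrainList(sn);      // stop assigning new regions to sn
   *   // ... move regions off and restart the server ...
   *   serverManager.removeServerFromDrainList(sn); // allow assignments again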
   */
  public boolean addServerToDrainList(final ServerName sn) {
    // Warn if the server (sn) is not online.  ServerName is of the form:
    // <hostname> , <port> , <startcode>

    if (!this.isServerOnline(sn)) {
      LOG.warn("Server " + sn + " is not currently online. " +
               "Ignoring request to add it to draining list.");
      return false;
    }
    // Add the server to the draining servers lists, if it's not already in
    // it.
    if (this.drainingServers.contains(sn)) {
      LOG.warn("Server " + sn + " is already in the draining server list. " +
               "Ignoring request to add it again.");
      return false;
    }
    return this.drainingServers.add(sn);
  }

  // RPC methods to region servers

  /**
   * Sends an OPEN RPC to the specified server to open the specified region.
   * <p>
   * Open should not fail but can if server just crashed.
   * <p>
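   * A hedged usage sketch (the serverManager, destination, regionInfo and
   * versionOfOfflineNode names are illustrative, not defined in this file):
   * <pre>{@code
   * RegionOpeningState state =
   *     serverManager.sendRegionOpen(destination, regionInfo, versionOfOfflineNode);
   * if (state == RegionOpeningState.FAILED_OPENING) {
   *   // no RPC connection could be found; pick another server and reassign
   * }
   * }</pre>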
   * @param server server to open a region
   * @param region region to open
   * @param versionOfOfflineNode that needs to be present in the offline node
   * when RS tries to change the state from OFFLINE to other states.
   * @return the {@link RegionOpeningState} returned by the region server, or
   * {@link RegionOpeningState#FAILED_OPENING} if no RPC connection to the
   * server could be found.
   */
  public RegionOpeningState sendRegionOpen(final ServerName server,
      HRegionInfo region, int versionOfOfflineNode)
  throws IOException {
    HRegionInterface hri = getServerConnection(server);
    if (hri == null) {
      LOG.warn("Attempting to send OPEN RPC to server " + server.toString() +
        " failed because no RPC connection found to this server");
      return RegionOpeningState.FAILED_OPENING;
    }
    return (versionOfOfflineNode == -1) ? hri.openRegion(region) : hri
        .openRegion(region, versionOfOfflineNode);
  }

  /**
   * Sends an OPEN RPC to the specified server to open the specified regions.
   * <p>
   * Open should not fail but can if server just crashed.
   * <p>
   * @param server server to open regions on
   * @param regions regions to open
   */
  public void sendRegionOpen(ServerName server, List<HRegionInfo> regions)
  throws IOException {
    HRegionInterface hri = getServerConnection(server);
    if (hri == null) {
      LOG.warn("Attempting to send OPEN RPC to server " + server.toString() +
        " failed because no RPC connection found to this server");
      return;
    }
    hri.openRegions(regions);
  }

  /**
   * Sends a CLOSE RPC to the specified server to close the specified region.
   * <p>
   * A region server could reject the close request because it either does not
   * have the specified region or the region is being split.
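   * <p>
   * A hedged usage sketch (the serverManager, sn, regionInfo and
   * versionOfClosingNode names are illustrative, not defined in this file):
   * <pre>{@code
   * boolean acked = serverManager.sendRegionClose(sn, regionInfo, versionOfClosingNode);
   * if (!acked) {
   *   // the region server declined the close, e.g. it no longer hosts the
   *   // region or the region is splitting
   * }
   * }</pre>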
   * @param server server to close a region on
   * @param region region to close
   * @param versionOfClosingNode
   *   the version of znode to compare when RS transitions the znode from
   *   CLOSING state.
   * @return true if server acknowledged close, false if not
   * @throws IOException
   */
  public boolean sendRegionClose(ServerName server, HRegionInfo region,
    int versionOfClosingNode) throws IOException {
    if (server == null) throw new NullPointerException("Passed server is null");
    HRegionInterface hri = getServerConnection(server);
    if (hri == null) {
      throw new IOException("Attempting to send CLOSE RPC to server " +
        server.toString() + " for region " +
        region.getRegionNameAsString() +
        " failed because no RPC connection found to this server");
    }
    return hri.closeRegion(region, versionOfClosingNode);
  }

  /**
   * @param sn The server to connect to
   * @return The cached RPC connection to the passed server, creating a new
   * one if none exists yet
   * @throws IOException
   * @throws RetriesExhaustedException wrapping a ConnectException if failed
   * putting up proxy.
   */
  private HRegionInterface getServerConnection(final ServerName sn)
  throws IOException {
    HRegionInterface hri = this.serverConnections.get(sn);
    if (hri == null) {
      LOG.debug("New connection to " + sn.toString());
      hri = this.connection.getHRegionConnection(sn.getHostname(), sn.getPort());
      this.serverConnections.put(sn, hri);
    }
    return hri;
  }

  /**
   * Wait for the region servers to report in.
   * We will wait until one of these conditions is met:
   *  - the master is stopped
   *  - the 'hbase.master.wait.on.regionservers.maxtostart' number of
   *    region servers is reached
   *  - the 'hbase.master.wait.on.regionservers.mintostart' is reached AND
   *    there have been no new region servers checking in for
   *    'hbase.master.wait.on.regionservers.interval' time AND
   *    the 'hbase.master.wait.on.regionservers.timeout' is reached
   *    (see the configuration sketch below)
   *
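   * An illustrative sketch only (the values below are the defaults read in
   * this method, not recommendations): the wait behaviour is tuned through
   * the master's {@link Configuration}.
   * <pre>{@code
   * Configuration conf = master.getConfiguration();
   * conf.setInt(WAIT_ON_REGIONSERVERS_MINTOSTART, 1);                 // default minimum
   * conf.setInt(WAIT_ON_REGIONSERVERS_MAXTOSTART, Integer.MAX_VALUE); // default maximum
   * conf.setLong(WAIT_ON_REGIONSERVERS_TIMEOUT, 4500);                // default timeout, ms
   * conf.setLong(WAIT_ON_REGIONSERVERS_INTERVAL, 1500);               // default interval, ms
   * }</pre>
   *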
   * @throws InterruptedException
   */
  public void waitForRegionServers(MonitoredTask status)
  throws InterruptedException {
    final long interval = this.master.getConfiguration().
      getLong(WAIT_ON_REGIONSERVERS_INTERVAL, 1500);
    final long timeout = this.master.getConfiguration().
      getLong(WAIT_ON_REGIONSERVERS_TIMEOUT, 4500);
    int minToStart = this.master.getConfiguration().
      getInt(WAIT_ON_REGIONSERVERS_MINTOSTART, 1);
    if (minToStart < 1) {
      LOG.warn(String.format(
        "The value of '%s' (%d) can not be less than 1, ignoring.",
        WAIT_ON_REGIONSERVERS_MINTOSTART, minToStart));
      minToStart = 1;
    }
    int maxToStart = this.master.getConfiguration().
      getInt(WAIT_ON_REGIONSERVERS_MAXTOSTART, Integer.MAX_VALUE);
    if (maxToStart < minToStart) {
      LOG.warn(String.format(
          "The value of '%s' (%d) is set less than '%s' (%d), ignoring.",
          WAIT_ON_REGIONSERVERS_MAXTOSTART, maxToStart,
          WAIT_ON_REGIONSERVERS_MINTOSTART, minToStart));
      maxToStart = Integer.MAX_VALUE;
    }

    long now = System.currentTimeMillis();
    final long startTime = now;
    long slept = 0;
    long lastLogTime = 0;
    long lastCountChange = startTime;
    int count = countOfRegionServers();
    int oldCount = 0;
    while (
      !this.master.isStopped() &&
        count < maxToStart &&
        (lastCountChange + interval > now || timeout > slept || count < minToStart)
      ) {

      // Log some info at every interval time or if there is a change
      if (oldCount != count || lastLogTime + interval < now) {
        lastLogTime = now;
        String msg =
          "Waiting for region servers count to settle; currently" +
            " checked in " + count + ", slept for " + slept + " ms," +
            " expecting minimum of " + minToStart + ", maximum of " + maxToStart +
            ", timeout of " + timeout + " ms, interval of " + interval + " ms.";
        LOG.info(msg);
        status.setStatus(msg);
      }

      // We sleep for some time
      final long sleepTime = 50;
      Thread.sleep(sleepTime);
      now = System.currentTimeMillis();
      slept = now - startTime;

      oldCount = count;
      count = countOfRegionServers();
      if (count != oldCount) {
        lastCountChange = now;
      }
    }

    LOG.info("Finished waiting for region servers count to settle;" +
      " checked in " + count + ", slept for " + slept + " ms," +
      " expecting minimum of " + minToStart + ", maximum of " + maxToStart + "," +
      " master is " + (this.master.isStopped() ? "stopped." : "running.")
    );
  }

  /**
   * @return A copy of the internal list of online servers.
   */
  public List<ServerName> getOnlineServersList() {
    // TODO: optimize the load balancer call so we don't need to make a new list
    // TODO: FIX. THIS IS POPULAR CALL.
    return new ArrayList<ServerName>(this.onlineServers.keySet());
  }

  /**
   * @return A copy of the internal list of draining servers.
   */
  public List<ServerName> getDrainingServersList() {
    return new ArrayList<ServerName>(this.drainingServers);
  }

  /**
   * @return A copy of the internal set of deadNotExpired servers.
   */
  Set<ServerName> getDeadNotExpiredServers() {
    return new HashSet<ServerName>(this.deadNotExpiredServers);
  }

  public boolean isServerOnline(ServerName serverName) {
    return onlineServers.containsKey(serverName);
  }

  public void shutdownCluster() {
    this.clusterShutdown = true;
    this.master.stop("Cluster shutdown requested");
  }

  public boolean isClusterShutdown() {
    return this.clusterShutdown;
  }

  /**
   * Stop the ServerManager.  Currently closes the connection to the master.
   */
  public void stop() {
    if (connection != null) {
      try {
        connection.close();
      } catch (IOException e) {
        LOG.error("Attempt to close connection to master failed", e);
      }
    }
  }

  /**
   * Clear from the dead servers list any dead server that has the same host
   * name and port as an online server.
   */
  void clearDeadServersWithSameHostNameAndPortOfOnlineServer() {
    ServerName sn = null;
    for (ServerName serverName : getOnlineServersList()) {
      while ((sn = ServerName.
          findServerWithSameHostnamePort(this.deadservers, serverName)) != null) {
        this.deadservers.remove(sn);
      }
    }
  }

}