/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.master;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Class to hold the list of dead servers and utilities for querying it. Servers are added when
 * they expire or when we find them in the filesystem on startup. When a server crash procedure is
 * queued, the server is added to the processing list, and it is removed from that list when the
 * procedure is done. Servers are removed from the dead server list when a new instance is started
 * over the old one on the same hostname and port, or when a new Master comes online and tidies up
 * after initialization. The processing list and the dead server list are not tied together (a
 * server does not have to be on the dead server list to be processing, and vice versa).
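 * <p>
 * A minimal usage sketch (the {@code ServerName} values below are illustrative only; in a running
 * Master this class is typically driven by the ServerManager and the server crash procedure):
 *
 * <pre>{@code
 * DeadServer deadServers = new DeadServer();
 * ServerName crashed = ServerName.valueOf("example-host,16020,1600000000000");
 * deadServers.putIfAbsent(crashed);             // records the crash with the current time
 * assert deadServers.isDeadServer(crashed);
 * // A new instance started on the same host and port clears the old entry:
 * ServerName restarted = ServerName.valueOf("example-host,16020,1600000001000");
 * deadServers.cleanPreviousInstance(restarted); // returns true and removes 'crashed'
 * }</pre>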
 */
@InterfaceAudience.Private
public class DeadServer {
  private static final Logger LOG = LoggerFactory.getLogger(DeadServer.class);

  /**
   * Set of known dead servers, mapped to the time they were added to this list. On znode
   * expiration, servers are added here. This is needed in case of a network partition where the
   * server's lease expires but the server is still running. After the network is healed and its
   * server logs are recovered, the server will be told to call server startup because, by then,
   * its regions have probably been reassigned.
   */
  private final Map<ServerName, Long> deadServers = new HashMap<>();

  /**
   * @param serverName the server name to check
   * @return true if this server is on the dead server list, false otherwise
   */
  public synchronized boolean isDeadServer(final ServerName serverName) {
    return deadServers.containsKey(serverName);
  }

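  /**
   * @return a copy of the current set of dead server names
   */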
  public synchronized Set<ServerName> copyServerNames() {
    Set<ServerName> clone = new HashSet<>(deadServers.size());
    clone.addAll(deadServers.keySet());
    return clone;
  }

  /**
   * Adds the server to the dead server list, with the current time as its crash time, if it's not
   * there already.
   */
  synchronized void putIfAbsent(ServerName sn) {
    this.deadServers.putIfAbsent(sn, EnvironmentEdgeManager.currentTime());
  }

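  /**
   * Adds the server to the dead server list with the given crash time (in milliseconds) if it's
   * not there already.
   */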
  synchronized void putIfAbsent(ServerName sn, long crashedTime) {
    this.deadServers.putIfAbsent(sn, crashedTime);
  }

  public synchronized int size() {
    return deadServers.size();
  }

  synchronized boolean isEmpty() {
    return deadServers.isEmpty();
  }

  /**
   * Handles restart of a server. The new server instance has a different start code. The new start
   * code should be greater than the old one, but we don't check that here. Removes the old server
   * from the dead server list.
   * @param newServerName server name as either <code>host:port</code> or
   *                      <code>host,port,startcode</code>
   * @return true if this server was dead before and is coming back alive again
   */
  synchronized boolean cleanPreviousInstance(final ServerName newServerName) {
    Iterator<ServerName> it = deadServers.keySet().iterator();
    while (it.hasNext()) {
      if (cleanOldServerName(newServerName, it)) {
        return true;
      }
    }
    return false;
  }

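  /**
   * Removes every dead server entry that has the same host and port as <code>newServerName</code>.
   */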
  synchronized void cleanAllPreviousInstances(final ServerName newServerName) {
    Iterator<ServerName> it = deadServers.keySet().iterator();
    while (it.hasNext()) {
      cleanOldServerName(newServerName, it);
    }
  }

  /**
   * @param newServerName      server to match host and port against
   * @param deadServerIterator iterator primed so 'next' can be called on it
   * @return true if <code>newServerName</code> and the ServerName currently under the iterator
   *         have the same host and port, in which case the old server was removed from the
   *         iterator (and therefore from the dead server list)
   */
  private boolean cleanOldServerName(ServerName newServerName,
    Iterator<ServerName> deadServerIterator) {
    ServerName sn = deadServerIterator.next();
    if (ServerName.isSameAddress(sn, newServerName)) {
      // Remove from the dead server list only. Don't touch the processing list --
      // let the ServerCrashProcedure (SCP) clean that up when it is done.
      deadServerIterator.remove();
      return true;
    }
    return false;
  }

  @Override
  public synchronized String toString() {
    StringBuilder sb = new StringBuilder();
    for (ServerName sn : deadServers.keySet()) {
      if (sb.length() > 0) {
        sb.append(", ");
      }
      sb.append(sn.toString());
    }
    return sb.toString();
  }

  /**
   * Extract all the servers that have died since a given time, and sort them by death time.
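   * <p>
   * A minimal sketch of how the result might be consumed (the {@code deadServer} variable and the
   * one-hour window are illustrative only):
   *
   * <pre>{@code
   * long oneHourAgo = EnvironmentEdgeManager.currentTime() - 3600 * 1000L;
   * for (Pair<ServerName, Long> dead : deadServer.copyDeadServersSince(oneHourAgo)) {
   *   System.out.println(dead.getFirst() + " died at " + new Date(dead.getSecond()));
   * }
   * }</pre>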
   * @param ts the timestamp in milliseconds; pass 0 to get all dead servers
   * @return a list of (server, death time) pairs sorted by death time, oldest first
   */
  synchronized List<Pair<ServerName, Long>> copyDeadServersSince(long ts) {
    List<Pair<ServerName, Long>> res = new ArrayList<>(size());

    for (Map.Entry<ServerName, Long> entry : deadServers.entrySet()) {
      if (entry.getValue() >= ts) {
        res.add(new Pair<>(entry.getKey(), entry.getValue()));
      }
    }

    Collections.sort(res, (o1, o2) -> o1.getSecond().compareTo(o2.getSecond()));
    return res;
  }

  /**
   * Get the time when a server died.
   * @param deadServerName the dead server name
   * @return the date when the server died, or null if the server is not on the dead server list
   */
  public synchronized Date getTimeOfDeath(final ServerName deadServerName) {
    Long time = deadServers.get(deadServerName);
    return time == null ? null : new Date(time);
  }

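  /**
   * Get the time when a server died, as a timestamp.
   * @param deadServerName the dead server name
   * @return the death time in milliseconds, or 0 if the server is not on the dead server list
   */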
  public synchronized long getDeathTimestamp(final ServerName deadServerName) {
    Long time = deadServers.get(deadServerName);
    return time == null ? 0 : time;
  }

  /**
   * Called over RPC by an operator cleaning up the dead server list.
   * @param deadServerName the dead server name
   * @return true if this server was removed
   */
  public synchronized boolean removeDeadServer(final ServerName deadServerName) {
    return this.deadServers.remove(deadServerName) != null;
  }
}