001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.master;
019
020import java.util.ArrayList;
021import java.util.Collections;
022import java.util.Date;
023import java.util.HashMap;
024import java.util.HashSet;
025import java.util.Iterator;
026import java.util.List;
027import java.util.Map;
028import java.util.Set;
029import org.apache.hadoop.hbase.ServerName;
030import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
031import org.apache.hadoop.hbase.util.Pair;
032import org.apache.yetus.audience.InterfaceAudience;
033import org.slf4j.Logger;
034import org.slf4j.LoggerFactory;
035
036/**
037 * Class to hold dead servers list and utility querying dead server list. Servers are added when
038 * they expire or when we find them in filesystem on startup. When a server crash procedure is
039 * queued, it will populate the processing list and then remove the server from processing list when
040 * done. Servers are removed from dead server list when a new instance is started over the old on
041 * same hostname and port or when new Master comes online tidying up after all initialization.
042 * Processing list and deadserver list are not tied together (you don't have to be in deadservers
043 * list to be processing and vice versa).
044 */
045@InterfaceAudience.Private
046public class DeadServer {
047  private static final Logger LOG = LoggerFactory.getLogger(DeadServer.class);
048
049  /**
050   * Set of known dead servers. On znode expiration, servers are added here. This is needed in case
051   * of a network partitioning where the server's lease expires, but the server is still running.
052   * After the network is healed, and it's server logs are recovered, it will be told to call server
053   * startup because by then, its regions have probably been reassigned.
054   */
055  private final Map<ServerName, Long> deadServers = new HashMap<>();
056
057  /**
058   * @param serverName server name.
059   * @return true if this server is on the dead servers list false otherwise
060   */
061  public synchronized boolean isDeadServer(final ServerName serverName) {
062    return deadServers.containsKey(serverName);
063  }
064
065  public synchronized Set<ServerName> copyServerNames() {
066    Set<ServerName> clone = new HashSet<>(deadServers.size());
067    clone.addAll(deadServers.keySet());
068    return clone;
069  }
070
071  /**
072   * Adds the server to the dead server list if it's not there already.
073   */
074  synchronized void putIfAbsent(ServerName sn) {
075    this.deadServers.putIfAbsent(sn, EnvironmentEdgeManager.currentTime());
076  }
077
078  public synchronized int size() {
079    return deadServers.size();
080  }
081
082  synchronized boolean isEmpty() {
083    return deadServers.isEmpty();
084  }
085
086  /**
087   * Handles restart of a server. The new server instance has a different start code. The new start
088   * code should be greater than the old one. We don't check that here. Removes the old server from
089   * deadserver list.
090   * @param newServerName Servername as either <code>host:port</code> or
091   *                      <code>host,port,startcode</code>.
092   * @return true if this server was dead before and coming back alive again
093   */
094  synchronized boolean cleanPreviousInstance(final ServerName newServerName) {
095    Iterator<ServerName> it = deadServers.keySet().iterator();
096    while (it.hasNext()) {
097      if (cleanOldServerName(newServerName, it)) {
098        return true;
099      }
100    }
101    return false;
102  }
103
104  synchronized void cleanAllPreviousInstances(final ServerName newServerName) {
105    Iterator<ServerName> it = deadServers.keySet().iterator();
106    while (it.hasNext()) {
107      cleanOldServerName(newServerName, it);
108    }
109  }
110
111  /**
112   * @param newServerName      Server to match port and hostname against.
113   * @param deadServerIterator Iterator primed so can call 'next' on it.
114   * @return True if <code>newServerName</code> and current primed iterator ServerName have same
115   *         host and port and we removed old server from iterator and from processing list.
116   */
117  private boolean cleanOldServerName(ServerName newServerName,
118    Iterator<ServerName> deadServerIterator) {
119    ServerName sn = deadServerIterator.next();
120    if (ServerName.isSameAddress(sn, newServerName)) {
121      // Remove from dead servers list. Don't remove from the processing list --
122      // let the SCP do it when it is done.
123      deadServerIterator.remove();
124      return true;
125    }
126    return false;
127  }
128
129  @Override
130  public synchronized String toString() {
131    // Display unified set of servers from both maps
132    Set<ServerName> servers = new HashSet<>();
133    servers.addAll(deadServers.keySet());
134    StringBuilder sb = new StringBuilder();
135    for (ServerName sn : servers) {
136      if (sb.length() > 0) {
137        sb.append(", ");
138      }
139      sb.append(sn.toString());
140    }
141    return sb.toString();
142  }
143
144  /**
145   * Extract all the servers dead since a given time, and sort them.
146   * @param ts the time, 0 for all
147   * @return a sorted array list, by death time, lowest values first.
148   */
149  synchronized List<Pair<ServerName, Long>> copyDeadServersSince(long ts) {
150    List<Pair<ServerName, Long>> res = new ArrayList<>(size());
151
152    for (Map.Entry<ServerName, Long> entry : deadServers.entrySet()) {
153      if (entry.getValue() >= ts) {
154        res.add(new Pair<>(entry.getKey(), entry.getValue()));
155      }
156    }
157
158    Collections.sort(res, (o1, o2) -> o1.getSecond().compareTo(o2.getSecond()));
159    return res;
160  }
161
162  /**
163   * Get the time when a server died
164   * @param deadServerName the dead server name
165   * @return the date when the server died
166   */
167  public synchronized Date getTimeOfDeath(final ServerName deadServerName) {
168    Long time = deadServers.get(deadServerName);
169    return time == null ? null : new Date(time);
170  }
171
172  /**
173   * Called from rpc by operator cleaning up deadserver list.
174   * @param deadServerName the dead server name
175   * @return true if this server was removed
176   */
177  public synchronized boolean removeDeadServer(final ServerName deadServerName) {
178    return this.deadServers.remove(deadServerName) != null;
179  }
180}