001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.master; 019 020import java.util.ArrayList; 021import java.util.Collections; 022import java.util.Date; 023import java.util.HashMap; 024import java.util.HashSet; 025import java.util.Iterator; 026import java.util.List; 027import java.util.Map; 028import java.util.Set; 029import org.apache.hadoop.hbase.ServerName; 030import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; 031import org.apache.hadoop.hbase.util.Pair; 032import org.apache.yetus.audience.InterfaceAudience; 033import org.slf4j.Logger; 034import org.slf4j.LoggerFactory; 035 036/** 037 * Class to hold dead servers list and utility querying dead server list. Servers are added when 038 * they expire or when we find them in filesystem on startup. When a server crash procedure is 039 * queued, it will populate the processing list and then remove the server from processing list when 040 * done. Servers are removed from dead server list when a new instance is started over the old on 041 * same hostname and port or when new Master comes online tidying up after all initialization. 042 * Processing list and deadserver list are not tied together (you don't have to be in deadservers 043 * list to be processing and vice versa). 044 */ 045@InterfaceAudience.Private 046public class DeadServer { 047 private static final Logger LOG = LoggerFactory.getLogger(DeadServer.class); 048 049 /** 050 * Set of known dead servers. On znode expiration, servers are added here. This is needed in case 051 * of a network partitioning where the server's lease expires, but the server is still running. 052 * After the network is healed, and it's server logs are recovered, it will be told to call server 053 * startup because by then, its regions have probably been reassigned. 054 */ 055 private final Map<ServerName, Long> deadServers = new HashMap<>(); 056 057 /** 058 * @param serverName server name. 059 * @return true if this server is on the dead servers list false otherwise 060 */ 061 public synchronized boolean isDeadServer(final ServerName serverName) { 062 return deadServers.containsKey(serverName); 063 } 064 065 public synchronized Set<ServerName> copyServerNames() { 066 Set<ServerName> clone = new HashSet<>(deadServers.size()); 067 clone.addAll(deadServers.keySet()); 068 return clone; 069 } 070 071 /** 072 * Adds the server to the dead server list if it's not there already. 073 */ 074 synchronized void putIfAbsent(ServerName sn) { 075 this.deadServers.putIfAbsent(sn, EnvironmentEdgeManager.currentTime()); 076 } 077 078 synchronized void putIfAbsent(ServerName sn, long crashedTime) { 079 this.deadServers.putIfAbsent(sn, crashedTime); 080 } 081 082 public synchronized int size() { 083 return deadServers.size(); 084 } 085 086 synchronized boolean isEmpty() { 087 return deadServers.isEmpty(); 088 } 089 090 /** 091 * Handles restart of a server. The new server instance has a different start code. The new start 092 * code should be greater than the old one. We don't check that here. Removes the old server from 093 * deadserver list. 094 * @param newServerName Servername as either <code>host:port</code> or 095 * <code>host,port,startcode</code>. 096 * @return true if this server was dead before and coming back alive again 097 */ 098 synchronized boolean cleanPreviousInstance(final ServerName newServerName) { 099 Iterator<ServerName> it = deadServers.keySet().iterator(); 100 while (it.hasNext()) { 101 if (cleanOldServerName(newServerName, it)) { 102 return true; 103 } 104 } 105 return false; 106 } 107 108 synchronized void cleanAllPreviousInstances(final ServerName newServerName) { 109 Iterator<ServerName> it = deadServers.keySet().iterator(); 110 while (it.hasNext()) { 111 cleanOldServerName(newServerName, it); 112 } 113 } 114 115 /** 116 * @param newServerName Server to match port and hostname against. 117 * @param deadServerIterator Iterator primed so can call 'next' on it. 118 * @return True if <code>newServerName</code> and current primed iterator ServerName have same 119 * host and port and we removed old server from iterator and from processing list. 120 */ 121 private boolean cleanOldServerName(ServerName newServerName, 122 Iterator<ServerName> deadServerIterator) { 123 ServerName sn = deadServerIterator.next(); 124 if (ServerName.isSameAddress(sn, newServerName)) { 125 // Remove from dead servers list. Don't remove from the processing list -- 126 // let the SCP do it when it is done. 127 deadServerIterator.remove(); 128 return true; 129 } 130 return false; 131 } 132 133 @Override 134 public synchronized String toString() { 135 // Display unified set of servers from both maps 136 Set<ServerName> servers = new HashSet<>(); 137 servers.addAll(deadServers.keySet()); 138 StringBuilder sb = new StringBuilder(); 139 for (ServerName sn : servers) { 140 if (sb.length() > 0) { 141 sb.append(", "); 142 } 143 sb.append(sn.toString()); 144 } 145 return sb.toString(); 146 } 147 148 /** 149 * Extract all the servers dead since a given time, and sort them. 150 * @param ts the time, 0 for all 151 * @return a sorted array list, by death time, lowest values first. 152 */ 153 synchronized List<Pair<ServerName, Long>> copyDeadServersSince(long ts) { 154 List<Pair<ServerName, Long>> res = new ArrayList<>(size()); 155 156 for (Map.Entry<ServerName, Long> entry : deadServers.entrySet()) { 157 if (entry.getValue() >= ts) { 158 res.add(new Pair<>(entry.getKey(), entry.getValue())); 159 } 160 } 161 162 Collections.sort(res, (o1, o2) -> o1.getSecond().compareTo(o2.getSecond())); 163 return res; 164 } 165 166 /** 167 * Get the time when a server died 168 * @param deadServerName the dead server name 169 * @return the date when the server died 170 */ 171 public synchronized Date getTimeOfDeath(final ServerName deadServerName) { 172 Long time = deadServers.get(deadServerName); 173 return time == null ? null : new Date(time); 174 } 175 176 public synchronized long getDeathTimestamp(final ServerName deadServerName) { 177 Long time = deadServers.get(deadServerName); 178 return time == null ? 0 : time; 179 } 180 181 /** 182 * Called from rpc by operator cleaning up deadserver list. 183 * @param deadServerName the dead server name 184 * @return true if this server was removed 185 */ 186 public synchronized boolean removeDeadServer(final ServerName deadServerName) { 187 return this.deadServers.remove(deadServerName) != null; 188 } 189}