/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.master;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Class to hold dead servers list and utility querying dead server list. Servers are added when
 * they expire or when we find them in filesystem on startup. Servers are removed from dead server
 * list when a new instance is started over the old on same hostname and port or when new Master
 * comes online tidying up after all initialization.
 * <p>
 * This class only tracks the dead servers map; it does not itself keep any record of servers whose
 * crash is still being processed.
 * <p>
 * All accessors are <code>synchronized</code> on this instance.
 */
@InterfaceAudience.Private
public class DeadServer {
  private static final Logger LOG = LoggerFactory.getLogger(DeadServer.class);

  /**
   * Set of known dead servers, mapped to the time at which each was added. On znode expiration,
   * servers are added here. This is needed in case of a network partitioning where the server's
   * lease expires, but the server is still running. After the network is healed, and its server
   * logs are recovered, it will be told to call server startup because by then, its regions have
   * probably been reassigned.
   */
  private final Map<ServerName, Long> deadServers = new HashMap<>();

  /**
   * @param serverName server name.
   * @return true if this server is on the dead servers list false otherwise
   */
  public synchronized boolean isDeadServer(final ServerName serverName) {
    return deadServers.containsKey(serverName);
  }

  /**
   * @return a new, independent set holding the names of all servers currently on the dead servers
   *         list; changes to the returned set do not affect this instance.
   */
  public synchronized Set<ServerName> copyServerNames() {
    // The copy constructor sizes the set for the source collection, avoiding a rehash.
    return new HashSet<>(deadServers.keySet());
  }

  /**
   * Adds the server to the dead server list if it's not there already. The time of death recorded
   * is the current time as reported by {@link EnvironmentEdgeManager}; an existing entry keeps its
   * original time.
   */
  synchronized void putIfAbsent(ServerName sn) {
    this.deadServers.putIfAbsent(sn, EnvironmentEdgeManager.currentTime());
  }

  /**
   * @return the number of servers on the dead servers list
   */
  public synchronized int size() {
    return deadServers.size();
  }

  /**
   * @return true if the dead servers list is empty
   */
  synchronized boolean isEmpty() {
    return deadServers.isEmpty();
  }

  /**
   * Handles restart of a server. The new server instance has a different start code. The new start
   * code should be greater than the old one. We don't check that here. Removes the old server from
   * deadserver list. Only the first matching entry encountered is removed; use
   * {@link #cleanAllPreviousInstances(ServerName)} to remove every match.
   * @param newServerName Servername as either <code>host:port</code> or
   *                      <code>host,port,startcode</code>.
   * @return true if this server was dead before and coming back alive again
   */
  synchronized boolean cleanPreviousInstance(final ServerName newServerName) {
    Iterator<ServerName> it = deadServers.keySet().iterator();
    while (it.hasNext()) {
      if (cleanOldServerName(newServerName, it)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Removes from the dead servers list every entry that has the same address as the passed server
   * name.
   * @param newServerName Server to match address against.
   */
  synchronized void cleanAllPreviousInstances(final ServerName newServerName) {
    deadServers.keySet().removeIf(sn -> ServerName.isSameAddress(sn, newServerName));
  }

  /**
   * Advances the passed iterator by one element and removes that element from the dead servers
   * list if it has the same address as <code>newServerName</code>.
   * @param newServerName      Server to match address against.
   * @param deadServerIterator Iterator primed so can call 'next' on it.
   * @return True if <code>newServerName</code> and current primed iterator ServerName have the
   *         same address and we removed old server from iterator.
   */
  private boolean cleanOldServerName(ServerName newServerName,
    Iterator<ServerName> deadServerIterator) {
    ServerName sn = deadServerIterator.next();
    if (ServerName.isSameAddress(sn, newServerName)) {
      // Remove from the dead servers list only; nothing else is touched here.
      deadServerIterator.remove();
      return true;
    }
    return false;
  }

  /**
   * @return the names of all servers on the dead servers list, separated by <code>", "</code>; the
   *         empty string when the list is empty. Order follows the backing map's iteration order.
   */
  @Override
  public synchronized String toString() {
    StringBuilder sb = new StringBuilder();
    // Safe to iterate the live key set directly: we hold the monitor for the whole method.
    for (ServerName sn : deadServers.keySet()) {
      if (sb.length() > 0) {
        sb.append(", ");
      }
      sb.append(sn);
    }
    return sb.toString();
  }

  /**
   * Extract all the servers dead since a given time, and sort them.
   * @param ts the time, 0 for all
   * @return a sorted array list, by death time, lowest values first.
   */
  synchronized List<Pair<ServerName, Long>> copyDeadServersSince(long ts) {
    List<Pair<ServerName, Long>> res = new ArrayList<>(deadServers.size());

    for (Map.Entry<ServerName, Long> entry : deadServers.entrySet()) {
      // Inclusive bound: a server that died exactly at ts is reported.
      if (entry.getValue() >= ts) {
        res.add(new Pair<>(entry.getKey(), entry.getValue()));
      }
    }

    Collections.sort(res, (o1, o2) -> o1.getSecond().compareTo(o2.getSecond()));
    return res;
  }

  /**
   * Get the time when a server died
   * @param deadServerName the dead server name
   * @return the date when the server died, or <code>null</code> if the server is not on the dead
   *         servers list
   */
  public synchronized Date getTimeOfDeath(final ServerName deadServerName) {
    Long time = deadServers.get(deadServerName);
    return time == null ? null : new Date(time);
  }

  /**
   * Called from rpc by operator cleaning up deadserver list.
   * @param deadServerName the dead server name
   * @return true if this server was removed
   */
  public synchronized boolean removeDeadServer(final ServerName deadServerName) {
    return this.deadServers.remove(deadServerName) != null;
  }
}