1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.master;
19  
20  import java.io.IOException;
21  import java.util.ArrayList;
22  import java.util.Collection;
23  import java.util.Collections;
24  import java.util.HashMap;
25  import java.util.HashSet;
26  import java.util.Iterator;
27  import java.util.List;
28  import java.util.Map;
29  import java.util.Set;
30  import java.util.TreeMap;
31  
32  import org.apache.commons.logging.Log;
33  import org.apache.commons.logging.LogFactory;
34  import org.apache.hadoop.conf.Configuration;
35  import org.apache.hadoop.hbase.HConstants;
36  import org.apache.hadoop.hbase.HRegionInfo;
37  import org.apache.hadoop.hbase.HTableDescriptor;
38  import org.apache.hadoop.hbase.MetaTableAccessor;
39  import org.apache.hadoop.hbase.RegionTransition;
40  import org.apache.hadoop.hbase.Server;
41  import org.apache.hadoop.hbase.ServerLoad;
42  import org.apache.hadoop.hbase.ServerName;
43  import org.apache.hadoop.hbase.TableName;
44  import org.apache.hadoop.hbase.TableStateManager;
45  import org.apache.hadoop.hbase.classification.InterfaceAudience;
46  import org.apache.hadoop.hbase.client.RegionReplicaUtil;
47  import org.apache.hadoop.hbase.master.RegionState.State;
48  import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos;
49  import org.apache.hadoop.hbase.util.Bytes;
50  import org.apache.hadoop.hbase.util.FSUtils;
51  import org.apache.hadoop.hbase.util.Pair;
52  import org.apache.hadoop.hbase.zookeeper.ZKAssign;
53  import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
54  import org.apache.zookeeper.KeeperException;
55  
56  import com.google.common.annotations.VisibleForTesting;
57  import com.google.common.base.Preconditions;
58  
59  /**
60   * Region state accountant. It holds the states of all regions in memory.
61   * In a normal scenario, it should match the meta table and the true region states.
62   *
63   * This map is used by AssignmentManager to track region states.
64   */
65  @InterfaceAudience.Private
66  public class RegionStates {
67    private static final Log LOG = LogFactory.getLog(RegionStates.class);
68  
69    /**
70     * Regions currently in transition.
71     */
72    final HashMap<String, RegionState> regionsInTransition =
73      new HashMap<String, RegionState>();
74  
75    /**
76     * Region encoded name to state map.
77     * All the regions should be in this map.
78     */
79    private final Map<String, RegionState> regionStates =
80      new HashMap<String, RegionState>();
81  
82    /**
83     * Holds mapping of table -> region state
84     */
85    private final Map<TableName, Map<String, RegionState>> regionStatesTableIndex =
86        new HashMap<TableName, Map<String, RegionState>>();
87  
88    /**
89     * Server to regions assignment map.
90     * Contains the set of regions currently assigned to a given server.
91     */
92    private final Map<ServerName, Set<HRegionInfo>> serverHoldings =
93      new HashMap<ServerName, Set<HRegionInfo>>();
94  
95    /**
96     * Maintains the mapping from the default region to the replica regions.
97     */
98    private final Map<HRegionInfo, Set<HRegionInfo>> defaultReplicaToOtherReplicas =
99      new HashMap<HRegionInfo, Set<HRegionInfo>>();
100 
101   /**
102    * Region to server assignment map.
103    * Contains the server a given region is currently assigned to.
104    */
105   private final TreeMap<HRegionInfo, ServerName> regionAssignments =
106     new TreeMap<HRegionInfo, ServerName>();
107 
108   /**
109    * Encoded region name to server assignment map for re-assignment
110    * purposes. Contains the server a given region was last known to be
111    * assigned to but which has not completed log splitting, so the region is not assignable.
112    * If a region is currently assigned, this server info in this
113    * map should be the same as that in regionAssignments.
114    * However, the info in regionAssignments is cleared when the region
115    * is offline, while the info in lastAssignments is cleared when
116    * the region is closed or the server is dead and processed.
117    */
118   private final HashMap<String, ServerName> lastAssignments =
119     new HashMap<String, ServerName>();
120 
121   /**
122    * Encoded region name to server assignment map for the
123    * purpose of cleaning up serverHoldings when a region comes online
124    * on a new server. When the region is offlined from the previous
125    * server, regionAssignments is cleaned up so that it has the
126    * latest assignment map, but serverHoldings is not cleaned up
127    * to match the meta. We need this map to find out the old server
128    * whose serverHoldings needs cleanup, given a moved region.
129    */
130   private final HashMap<String, ServerName> oldAssignments =
131     new HashMap<String, ServerName>();
132 
133   /**
134    * Maps a host and port pair string to the latest start code
135    * of a region server which is known to be dead. It is dead
136    * to us, but the server manager may not know it yet.
137    */
138   private final HashMap<String, Long> deadServers =
139     new HashMap<String, Long>();
140 
141   /**
142    * Maps a dead server to the time when its log split is done.
143    * Since log splitting is not ordered, we have to remember
144    * all processed instances. The map is cleaned up based
145    * on a configured time. By default, we assume a dead
146    * server should be done with log splitting in two hours.
147    */
148   private final HashMap<ServerName, Long> processedServers =
149     new HashMap<ServerName, Long>();
150   private long lastProcessedServerCleanTime;
151 
152   private final TableStateManager tableStateManager;
153   private final RegionStateStore regionStateStore;
154   private final ServerManager serverManager;
155   private final Server server;
156 
157   // The maximum time to keep a log split info in region states map
158   static final String LOG_SPLIT_TIME = "hbase.master.maximum.logsplit.keeptime";
159   static final long DEFAULT_LOG_SPLIT_TIME = 7200000L; // 2 hours
160 
161   RegionStates(final Server master, final TableStateManager tableStateManager,
162       final ServerManager serverManager, final RegionStateStore regionStateStore) {
163     this.tableStateManager = tableStateManager;
164     this.regionStateStore = regionStateStore;
165     this.serverManager = serverManager;
166     this.server = master;
167   }
168 
169   /**
170    * @return a copy of the region assignment map
171    */
172   public synchronized Map<HRegionInfo, ServerName> getRegionAssignments() {
173     return new TreeMap<HRegionInfo, ServerName>(regionAssignments);
174   }
175 
176   /**
177    * Return the replicas (including default) for the regions grouped by ServerName
178    * @param regions
179    * @return a map from each ServerName to the replica regions it hosts
180    */
181   synchronized Map<ServerName, List<HRegionInfo>> getRegionAssignments(
182     Collection<HRegionInfo> regions) {
183     Map<ServerName, List<HRegionInfo>> map = new HashMap<ServerName, List<HRegionInfo>>();
184     for (HRegionInfo region : regions) {
185       HRegionInfo defaultReplica = RegionReplicaUtil.getRegionInfoForDefaultReplica(region);
186       Set<HRegionInfo> allReplicas = defaultReplicaToOtherReplicas.get(defaultReplica);
187       if (allReplicas != null) {
188         for (HRegionInfo hri : allReplicas) {
189           ServerName server = regionAssignments.get(hri);
190           if (server != null) {
191             List<HRegionInfo> regionsOnServer = map.get(server);
192             if (regionsOnServer == null) {
193               regionsOnServer = new ArrayList<HRegionInfo>(1);
194               map.put(server, regionsOnServer);
195             }
196             regionsOnServer.add(hri);
197           }
198         }
199       }
200     }
201     return map;
202   }
203 
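      /**
       * @return the server the given region is currently assigned to, or null if it is not assigned
       */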
204   public synchronized ServerName getRegionServerOfRegion(HRegionInfo hri) {
205     return regionAssignments.get(hri);
206   }
207 
208   /**
209    * Get regions in transition and their states
210    */
211   @SuppressWarnings("unchecked")
212   public synchronized Map<String, RegionState> getRegionsInTransition() {
213     return (Map<String, RegionState>)regionsInTransition.clone();
214   }
215 
216   /**
217    * @return True if specified region in transition.
218    */
219   public synchronized boolean isRegionInTransition(final HRegionInfo hri) {
220     return regionsInTransition.containsKey(hri.getEncodedName());
221   }
222 
223   /**
224    * @return True if specified region in transition.
225    */
226   public synchronized boolean isRegionInTransition(final String encodedName) {
227     return regionsInTransition.containsKey(encodedName);
228   }
229 
230   /**
231    * @return True if any region in transition.
232    */
233   public synchronized boolean isRegionsInTransition() {
234     return !regionsInTransition.isEmpty();
235   }
236 
237   /**
238    * @return True if specified region assigned, and not in transition.
239    */
240   public synchronized boolean isRegionOnline(final HRegionInfo hri) {
241     return !isRegionInTransition(hri) && regionAssignments.containsKey(hri);
242   }
243 
244   /**
245    * @return True if specified region offline/closed, but not in transition.
246    * If the region is not in the map, it is offline to us too.
247    */
248   public synchronized boolean isRegionOffline(final HRegionInfo hri) {
249     return getRegionState(hri) == null || (!isRegionInTransition(hri)
250       && isRegionInState(hri, State.OFFLINE, State.CLOSED));
251   }
252 
253   /**
254    * @return True if specified region is in one of the specified states.
255    */
256   public boolean isRegionInState(
257       final HRegionInfo hri, final State... states) {
258     return isRegionInState(hri.getEncodedName(), states);
259   }
260 
261   /**
262    * @return True if specified region is in one of the specified states.
263    */
264   public boolean isRegionInState(
265       final String encodedName, final State... states) {
266     RegionState regionState = getRegionState(encodedName);
267     return isOneOfStates(regionState, states);
268   }
269 
270   /**
271    * Wait for the state map to be updated by assignment manager.
272    */
273   public synchronized void waitForUpdate(
274       final long timeout) throws InterruptedException {
275     this.wait(timeout);
276   }
277 
278   /**
279    * Get region transition state
280    */
281   public RegionState getRegionTransitionState(final HRegionInfo hri) {
282     return getRegionTransitionState(hri.getEncodedName());
283   }
284 
285   /**
286    * Get region transition state
287    */
288   public synchronized RegionState
289       getRegionTransitionState(final String encodedName) {
290     return regionsInTransition.get(encodedName);
291   }
292 
293   /**
294    * Add a list of regions to RegionStates. If a region is split
295    * and offline, its state will be SPLIT. Otherwise, its state will
296   * be OFFLINE. Regions already in RegionStates will be skipped.
297    */
298   public void createRegionStates(
299       final List<HRegionInfo> hris) {
300     for (HRegionInfo hri: hris) {
301       createRegionState(hri);
302     }
303   }
304 
305   /**
306    * Add a region to RegionStates. If the region is split
307    * and offline, its state will be SPLIT. Otherwise, its state will
308    * be OFFLINE. If it is already in RegionStates, this call has
309    * no effect, and the original state is returned.
310    */
311   public RegionState createRegionState(final HRegionInfo hri) {
312     return createRegionState(hri, null, null, null);
313   }
314 
315   /**
316    * Add a region to RegionStates with the specified state.
317    * If the region is already in RegionStates, this call has
318    * no effect, and the original state is returned.
319    *
320    * @param hri the region info to create a state for
321    * @param newState the state to set the region to
322    * @param serverName the server the region is transitioning on
323    * @param lastHost the last server that hosts the region
324    * @return the current state
325    */
326   public synchronized RegionState createRegionState(final HRegionInfo hri,
327       State newState, ServerName serverName, ServerName lastHost) {
328     if (newState == null || (newState == State.OPEN && serverName == null)) {
329       newState =  State.OFFLINE;
330     }
331     if (hri.isOffline() && hri.isSplit()) {
332       newState = State.SPLIT;
333       serverName = null;
334     }
335     String encodedName = hri.getEncodedName();
336     RegionState regionState = regionStates.get(encodedName);
337     if (regionState != null) {
338       LOG.warn("Tried to create a state for a region already in RegionStates, "
339         + "used existing: " + regionState + ", ignored new: " + newState);
340     } else {
341       regionState = new RegionState(hri, newState, serverName);
342       putRegionState(regionState);
343       if (newState == State.OPEN) {
344         if (!serverName.equals(lastHost)) {
345           LOG.warn("Open region's last host " + lastHost
346             + " should be the same as the current one " + serverName
347             + ", ignored the last and used the current one");
348           lastHost = serverName;
349         }
350         lastAssignments.put(encodedName, lastHost);
351         regionAssignments.put(hri, lastHost);
352       } else if (!regionState.isUnassignable()) {
353         regionsInTransition.put(encodedName, regionState);
354       }
355       if (lastHost != null && newState != State.SPLIT) {
356         addToServerHoldings(lastHost, hri);
357         if (newState != State.OPEN) {
358           oldAssignments.put(encodedName, lastHost);
359         }
360       }
361     }
362     return regionState;
363   }
364 
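      /**
       * Store the state in the main region state map and in the per-table index.
       * @return the previous state of the region, or null if there was none
       */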
365   private RegionState putRegionState(RegionState regionState) {
366     HRegionInfo hri = regionState.getRegion();
367     String encodedName = hri.getEncodedName();
368     TableName table = hri.getTable();
369     RegionState oldState = regionStates.put(encodedName, regionState);
370     Map<String, RegionState> map = regionStatesTableIndex.get(table);
371     if (map == null) {
372       map = new HashMap<String, RegionState>();
373       regionStatesTableIndex.put(table, map);
374     }
375     map.put(encodedName, regionState);
376     return oldState;
377   }
378 
379   /**
380    * Set the region state to CLOSED
381    */
382   public RegionState setRegionStateTOCLOSED(
383       final byte[] regionName,
384       final ServerName serverName) {
385     HRegionInfo regionInfo = getRegionInfo(regionName);
386     return setRegionStateTOCLOSED(regionInfo, serverName);
387   }
388 
389   /**
390    * Set the region state to CLOSED
391    */
392   public RegionState setRegionStateTOCLOSED(
393       final HRegionInfo regionInfo,
394       final ServerName serverName) {
395     ServerName sn = serverName;
396     if (sn == null) {
397       RegionState regionState = getRegionState(regionInfo.getEncodedName());
398       if (regionState != null) {
399         sn = regionState.getServerName();
400       }
401       // TODO: if sn is null, should we dig into
402     // lastAssignments.get(regionInfo.getEncodedName()) to get the server name?
403     // For now, I just keep the same logic that has worked in the past
404     }
405     // We have to make sure that the last region server is set to be the same as the
406     // current RS.  If we don't do that, we could run into situation that both AM and SSH
407     // current RS.  If we don't do that, we could run into a situation where both AM and SSH
408     // think the other would do the assignment work; in the end, neither does the work and
409     // the region remains RIT.
410     setLastRegionServerOfRegion(sn, regionInfo.getEncodedName());
411     return updateRegionState(regionInfo, State.CLOSED, sn);
412   }
413 
414   /**
415    * Update a region state. It will be put in transition if not already there.
416    */
417   public RegionState updateRegionState(
418       final HRegionInfo hri, final State state) {
419     RegionState regionState = getRegionState(hri.getEncodedName());
420     return updateRegionState(hri, state,
421       regionState == null ? null : regionState.getServerName());
422   }
423 
424   /**
425    * Update a region state. It will be put in transition if not already there.
426    *
427    * If we can't find the region info based on the region name in
428    * the transition, log a warning and return null.
429    */
430   public RegionState updateRegionState(
431       final RegionTransition transition, final State state) {
432     byte [] regionName = transition.getRegionName();
433     HRegionInfo regionInfo = getRegionInfo(regionName);
434     if (regionInfo == null) {
435       String prettyRegionName = HRegionInfo.prettyPrint(
436         HRegionInfo.encodeRegionName(regionName));
437       LOG.warn("Failed to find region " + prettyRegionName
438         + " in updating its state to " + state
439         + " based on region transition " + transition);
440       return null;
441     }
442     return updateRegionState(regionInfo, state,
443       transition.getServerName());
444   }
445 
446   /**
447    * Transition a region state to OPEN from OPENING/PENDING_OPEN
448    */
449   public synchronized RegionState transitionOpenFromPendingOpenOrOpeningOnServer(
450       final RegionTransition transition, final RegionState fromState, final ServerName sn) {
451     if (fromState.isPendingOpenOrOpeningOnServer(sn)) {
452       return updateRegionState(transition, State.OPEN);
453     }
454     return null;
455   }
456 
457   /**
458    * Update a region state. It will be put in transition if not already there.
459    */
460   public RegionState updateRegionState(
461       final HRegionInfo hri, final State state, final ServerName serverName) {
462     return updateRegionState(hri, state, serverName, HConstants.NO_SEQNUM);
463   }
464 
465   public void regionOnline(final HRegionInfo hri, final ServerName serverName) {
466     regionOnline(hri, serverName, HConstants.NO_SEQNUM);
467   }
468 
469   /**
470    * A region is online, won't be in transition any more.
471    * We can't confirm it is really online on the specified region server
472    * because it hasn't been put in the region server's online region list yet.
473    */
474   public void regionOnline(final HRegionInfo hri, final ServerName serverName, long openSeqNum) {
475     String encodedName = hri.getEncodedName();
476     if (!serverManager.isServerOnline(serverName)) {
477       // This is possible if the region server dies before the master gets a
478       // chance to handle the ZK event in time. At this time, if the dead server
479       // is already processed by SSH, we should ignore this event.
480       // If not processed yet, ignore and let SSH deal with it.
481       LOG.warn("Ignored, " + encodedName + " was opened on a dead server: " + serverName);
482       return;
483     }
484     updateRegionState(hri, State.OPEN, serverName, openSeqNum);
485 
486     synchronized (this) {
487       regionsInTransition.remove(encodedName);
488       ServerName oldServerName = regionAssignments.put(hri, serverName);
489       if (!serverName.equals(oldServerName)) {
490         if (LOG.isDebugEnabled()) {
491           LOG.debug("Onlined " + hri.getShortNameToLog() + " on " + serverName);
492         }
493         addToServerHoldings(serverName, hri);
494         addToReplicaMapping(hri);
495         if (oldServerName == null) {
496           oldServerName = oldAssignments.remove(encodedName);
497         }
498         if (oldServerName != null
499             && !oldServerName.equals(serverName)
500             && serverHoldings.containsKey(oldServerName)) {
501           LOG.info("Offlined " + hri.getShortNameToLog() + " from " + oldServerName);
502           removeFromServerHoldings(oldServerName, hri);
503         }
504       }
505     }
506   }
507 
508   private void addToServerHoldings(ServerName serverName, HRegionInfo hri) {
509     Set<HRegionInfo> regions = serverHoldings.get(serverName);
510     if (regions == null) {
511       regions = new HashSet<HRegionInfo>();
512       serverHoldings.put(serverName, regions);
513     }
514     regions.add(hri);
515   }
516 
517   private void addToReplicaMapping(HRegionInfo hri) {
518     HRegionInfo defaultReplica = RegionReplicaUtil.getRegionInfoForDefaultReplica(hri);
519     Set<HRegionInfo> replicas =
520         defaultReplicaToOtherReplicas.get(defaultReplica);
521     if (replicas == null) {
522       replicas = new HashSet<HRegionInfo>();
523       defaultReplicaToOtherReplicas.put(defaultReplica, replicas);
524     }
525     replicas.add(hri);
526   }
527 
528   private void removeFromServerHoldings(ServerName serverName, HRegionInfo hri) {
529     Set<HRegionInfo> oldRegions = serverHoldings.get(serverName);
530     oldRegions.remove(hri);
531     if (oldRegions.isEmpty()) {
532       serverHoldings.remove(serverName);
533     }
534   }
535 
536   private void removeFromReplicaMapping(HRegionInfo hri) {
537     HRegionInfo defaultReplica = RegionReplicaUtil.getRegionInfoForDefaultReplica(hri);
538     Set<HRegionInfo> replicas = defaultReplicaToOtherReplicas.get(defaultReplica);
539     if (replicas != null) {
540       replicas.remove(hri);
541       if (replicas.isEmpty()) {
542         defaultReplicaToOtherReplicas.remove(defaultReplica);
543       }
544     }
545   }
546 
547   /**
548    * A dead server's WALs have been split so that all the regions
549    * that used to be open on it can now be safely assigned. Mark them assignable.
550    */
551   public synchronized void logSplit(final ServerName serverName) {
552     for (Iterator<Map.Entry<String, ServerName>> it
553         = lastAssignments.entrySet().iterator(); it.hasNext();) {
554       Map.Entry<String, ServerName> e = it.next();
555       if (e.getValue().equals(serverName)) {
556         it.remove();
557       }
558     }
559     long now = System.currentTimeMillis();
560     if (LOG.isDebugEnabled()) {
561       LOG.debug("Adding to log splitting servers " + serverName);
562     }
563     processedServers.put(serverName, Long.valueOf(now));
564     Configuration conf = server.getConfiguration();
565     long obsoleteTime = conf.getLong(LOG_SPLIT_TIME, DEFAULT_LOG_SPLIT_TIME);
566     // Doesn't have to be very accurate about the clean up time
567     if (now > lastProcessedServerCleanTime + obsoleteTime) {
568       lastProcessedServerCleanTime = now;
569       long cutoff = now - obsoleteTime;
570       for (Iterator<Map.Entry<ServerName, Long>> it
571           = processedServers.entrySet().iterator(); it.hasNext();) {
572         Map.Entry<ServerName, Long> e = it.next();
573         if (e.getValue().longValue() < cutoff) {
574           if (LOG.isDebugEnabled()) {
575             LOG.debug("Removed from log splitting servers " + e.getKey());
576           }
577           it.remove();
578         }
579       }
580     }
581   }
582 
583   /**
584    * Log split is done for a given region, so it is assignable now.
585    */
586   public void logSplit(final HRegionInfo region) {
587     clearLastAssignment(region);
588   }
589 
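      /**
       * Forget the last known assignment of the given region so that it is considered assignable.
       */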
590   public synchronized void clearLastAssignment(final HRegionInfo region) {
591     lastAssignments.remove(region.getEncodedName());
592   }
593 
594   /**
595    * A region is offline, won't be in transition any more.
596    */
597   public void regionOffline(final HRegionInfo hri) {
598     regionOffline(hri, null);
599   }
600 
601   /**
602    * A region is offline, won't be in transition any more. Its state
603    * should be the specified expected state, which can only be
604    * Split/Merged/Offline/null(=Offline)/SplittingNew/MergingNew.
605    */
606   public void regionOffline(
607       final HRegionInfo hri, final State expectedState) {
608     Preconditions.checkArgument(expectedState == null
609       || RegionState.isUnassignable(expectedState),
610         "Offlined region should not be " + expectedState);
611     if (isRegionInState(hri, State.SPLITTING_NEW, State.MERGING_NEW)) {
612       // Remove it from all region maps
613       deleteRegion(hri);
614       return;
615     }
616     State newState =
617       expectedState == null ? State.OFFLINE : expectedState;
618     updateRegionState(hri, newState);
619     String encodedName = hri.getEncodedName();
620     synchronized (this) {
621       regionsInTransition.remove(encodedName);
622       ServerName oldServerName = regionAssignments.remove(hri);
623       if (oldServerName != null && serverHoldings.containsKey(oldServerName)) {
624         if (newState == State.MERGED || newState == State.SPLIT
625             || hri.isMetaRegion() || tableStateManager.isTableState(hri.getTable(),
626               ZooKeeperProtos.Table.State.DISABLED, ZooKeeperProtos.Table.State.DISABLING)) {
627           // Offline the region only if it's merged/split, or the table is disabled/disabling.
628           // Otherwise, offline it from this server only when it is online on a different server.
629           LOG.info("Offlined " + hri.getShortNameToLog() + " from " + oldServerName);
630           removeFromServerHoldings(oldServerName, hri);
631           removeFromReplicaMapping(hri);
632         } else {
633           // Need to remember it so that we can offline it from this
634           // server when it is online on a different server.
635           oldAssignments.put(encodedName, oldServerName);
636         }
637       }
638     }
639   }
640 
641   /**
642    * A server is offline, all regions on it are dead.
643    */
644   public List<HRegionInfo> serverOffline(final ZooKeeperWatcher watcher, final ServerName sn) {
645     // Offline all regions on this server not already in transition.
646     List<HRegionInfo> rits = new ArrayList<HRegionInfo>();
647     Set<HRegionInfo> regionsToCleanIfNoMetaEntry = new HashSet<HRegionInfo>();
648     // Offline regions outside the loop and synchronized block to avoid
649     // ConcurrentModificationException and deadlock in case meta is unassigned,
650     // but RegionStates is blocked.
651     Set<HRegionInfo> regionsToOffline = new HashSet<HRegionInfo>();
652     synchronized (this) {
653       Set<HRegionInfo> assignedRegions = serverHoldings.get(sn);
654       if (assignedRegions == null) {
655         assignedRegions = new HashSet<HRegionInfo>();
656       }
657 
658       for (HRegionInfo region : assignedRegions) {
659         // Offline open regions, no need to offline if SPLIT/MERGED/OFFLINE
660         if (isRegionOnline(region)) {
661           regionsToOffline.add(region);
662         } else if (isRegionInState(region, State.SPLITTING, State.MERGING)) {
663           LOG.debug("Offline splitting/merging region " + getRegionState(region));
664           try {
665             // Delete the ZNode if exists
666             ZKAssign.deleteNodeFailSilent(watcher, region);
667             regionsToOffline.add(region);
668           } catch (KeeperException ke) {
669             server.abort("Unexpected ZK exception deleting node " + region, ke);
670           }
671         }
672       }
673 
674       for (RegionState state : regionsInTransition.values()) {
675         HRegionInfo hri = state.getRegion();
676         if (assignedRegions.contains(hri)) {
677           // Region is open on this region server, but in transition.
678           // This region must be moving away from this server, or splitting/merging.
679           // SSH will handle it, either skip assigning, or re-assign.
680           LOG.info("Transitioning " + state + " will be handled by ServerCrashProcedure for " + sn);
681         } else if (sn.equals(state.getServerName())) {
682           // Region is in transition on this region server, and this
683           // region is not open on this server. So the region must be
684           // moving to this server from another one (i.e. opening or
685           // pending open on this server, while it was open on another one).
686           // Offline state is also kind of pending open if the region is in
687           // transition. The region could be in failed_close state too if we have
688           // tried several times to open it while this region server is not reachable.
689           if (state.isPendingOpenOrOpening() || state.isFailedClose() || state.isOffline()) {
690             LOG.info("Found region in " + state +
691               " to be reassigned by ServerCrashProcedure for " + sn);
692             rits.add(hri);
693           } else if(state.isSplittingNew() || state.isMergingNew()) {
694             regionsToCleanIfNoMetaEntry.add(state.getRegion());
695           } else {
696             LOG.warn("THIS SHOULD NOT HAPPEN: unexpected " + state);
697           }
698         }
699       }
700       this.notifyAll();
701     }
702 
703     for (HRegionInfo hri : regionsToOffline) {
704       regionOffline(hri);
705     }
706 
707     cleanIfNoMetaEntry(regionsToCleanIfNoMetaEntry);
708     return rits;
709   }
710 
711   /**
712    * This method does an RPC to hbase:meta. Do not call this method with a lock/synchronize held.
713    * @param hris The hris to check if empty in hbase:meta and if so, clean them up.
714    */
715   private void cleanIfNoMetaEntry(Set<HRegionInfo> hris) {
716     if (hris.isEmpty()) return;
717     for (HRegionInfo hri: hris) {
718       try {
719         // This is an RPC to the meta table. It is done without holding a synchronize on
720         // regionstates. No progress would be made if meta is not available at this time.
721         // This is a cleanup task. Not critical.
722         if (MetaTableAccessor.getRegion(server.getConnection(), hri.getEncodedNameAsBytes()) ==
723             null) {
724           regionOffline(hri);
725           FSUtils.deleteRegionDir(server.getConfiguration(), hri);
726         }
727       } catch (IOException e) {
728         LOG.warn("Got exception while deleting " + hri + " directories from file system.", e);
729       }
730     }
731   }
732 
733   /**
734    * Gets the online regions of the specified table.
735    * This method looks at the in-memory state.  It does not go to <code>hbase:meta</code>.
736    * Only returns <em>online</em> regions.  If a region on this table has been
737    * closed during a disable, etc., it will not be included in the returned list.
738    * So, the returned list may not necessarily be ALL regions in this table, it's
739    * all the ONLINE regions in the table.
740    * @param tableName
741    * @return Online regions from <code>tableName</code>
742    */
743   public synchronized List<HRegionInfo> getRegionsOfTable(TableName tableName) {
744     List<HRegionInfo> tableRegions = new ArrayList<HRegionInfo>();
745     // boundary needs to have table's name but regionID 0 so that it is sorted
746     // before all table's regions.
747     HRegionInfo boundary = new HRegionInfo(tableName, null, null, false, 0L);
748     for (HRegionInfo hri: regionAssignments.tailMap(boundary).keySet()) {
749       if(!hri.getTable().equals(tableName)) break;
750       tableRegions.add(hri);
751     }
752     return tableRegions;
753   }
754 
755   /**
756    * Gets current state of all regions of the table.
757    * This method looks at the in-memory state.  It does not go to <code>hbase:meta</code>.
758    * Method guaranteed to return keys for all states
759    * in {@link org.apache.hadoop.hbase.master.RegionState.State}
760    *
761    * @param tableName
762    * @return Regions of <code>tableName</code> grouped by their current state
763    */
764   public synchronized Map<RegionState.State, List<HRegionInfo>>
765   getRegionByStateOfTable(TableName tableName) {
766     Map<RegionState.State, List<HRegionInfo>> tableRegions =
767         new HashMap<State, List<HRegionInfo>>();
768     for (State state : State.values()) {
769       tableRegions.put(state, new ArrayList<HRegionInfo>());
770     }
771     Map<String, RegionState> indexMap = regionStatesTableIndex.get(tableName);
772     if (indexMap == null)
773       return tableRegions;
774     for (RegionState regionState : indexMap.values()) {
775       tableRegions.get(regionState.getState()).add(regionState.getRegion());
776     }
777     return tableRegions;
778   }
779 
780   /**
781    * Wait on region to clear regions-in-transition.
782    * <p>
783    * If the region isn't in transition, returns immediately.  Otherwise, method
784    * blocks until the region is out of transition.
785    */
786   public synchronized void waitOnRegionToClearRegionsInTransition(
787       final HRegionInfo hri) throws InterruptedException {
788     if (!isRegionInTransition(hri)) return;
789 
790     while(!server.isStopped() && isRegionInTransition(hri)) {
791       RegionState rs = getRegionState(hri);
792       LOG.info("Waiting on " + rs + " to clear regions-in-transition");
793       waitForUpdate(100);
794     }
795 
796     if (server.isStopped()) {
797       LOG.info("Giving up wait on region in " +
798         "transition because stoppable.isStopped is set");
799     }
800   }
801 
802   /**
803    * A table is deleted. Remove its regions from all internal maps.
804    * We loop through all regions assuming we don't delete tables too often.
805    */
806   public void tableDeleted(final TableName tableName) {
807     Set<HRegionInfo> regionsToDelete = new HashSet<HRegionInfo>();
808     synchronized (this) {
809       for (RegionState state: regionStates.values()) {
810         HRegionInfo region = state.getRegion();
811         if (region.getTable().equals(tableName)) {
812           regionsToDelete.add(region);
813         }
814       }
815     }
816     for (HRegionInfo region: regionsToDelete) {
817       deleteRegion(region);
818     }
819   }
820 
821   /**
822    * Get a copy of all regions assigned to a server
823    */
824   public synchronized Set<HRegionInfo> getServerRegions(ServerName serverName) {
825     Set<HRegionInfo> regions = serverHoldings.get(serverName);
826     if (regions == null) return null;
827     return new HashSet<HRegionInfo>(regions);
828   }
829 
830   /**
831    * Remove a region from all state maps.
832    */
833   @VisibleForTesting
834   public synchronized void deleteRegion(final HRegionInfo hri) {
835     String encodedName = hri.getEncodedName();
836     regionsInTransition.remove(encodedName);
837     regionStates.remove(encodedName);
838     TableName table = hri.getTable();
839     Map<String, RegionState> indexMap = regionStatesTableIndex.get(table);
840     indexMap.remove(encodedName);
841     if (indexMap.size() == 0)
842       regionStatesTableIndex.remove(table);
843     lastAssignments.remove(encodedName);
844     ServerName sn = regionAssignments.remove(hri);
845     if (sn != null) {
846       Set<HRegionInfo> regions = serverHoldings.get(sn);
847       regions.remove(hri);
848     }
849   }
850 
851   /**
852    * Checks if a region was assigned to a server which is not online now.
853    * If so, we should hold off re-assigning this region till SSH has split its WALs.
854    * Once logs are split, the last assignment of this region will be reset,
855    * which means a null last assignment server is ok for re-assigning.
856    *
857    * A region server could be dead but we don't know it yet. We may
858    * falsely think it's online. Therefore, if a server is online, we still
859    * need to confirm it is reachable and has the expected start code.
860    */
861   synchronized boolean wasRegionOnDeadServer(final String encodedName) {
862     ServerName server = lastAssignments.get(encodedName);
863     return isServerDeadAndNotProcessed(server);
864   }
865 
866   synchronized boolean isServerDeadAndNotProcessed(ServerName server) {
867     if (server == null) return false;
868     if (serverManager.isServerOnline(server)) {
869       String hostAndPort = server.getHostAndPort();
870       long startCode = server.getStartcode();
871       Long deadCode = deadServers.get(hostAndPort);
872       if (deadCode == null || startCode > deadCode.longValue()) {
873         if (serverManager.isServerReachable(server)) {
874           return false;
875         }
876         // The size of deadServers won't grow unbounded.
877         deadServers.put(hostAndPort, Long.valueOf(startCode));
878       }
879       // Watch out! If the server is not dead, the region could
880       // remain unassigned. That's why ServerManager#isServerReachable
881       // should use some retry.
882       //
883       // We cache this info since it is very unlikely for that
884       // instance to come back up later on. We don't want to expire
885       // the server since we prefer to let it die naturally.
886       LOG.warn("Couldn't reach online server " + server);
887     }
888     // Now, we know it's dead. Check if it's processed
889     return !processedServers.containsKey(server);
890   }
891 
892   /**
893    * Get the last region server a region was on, for the purpose of re-assignment,
894    * i.e. should the re-assignment be held back till log split is done?
895    */
896   synchronized ServerName getLastRegionServerOfRegion(final String encodedName) {
897     return lastAssignments.get(encodedName);
898   }
899 
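      /**
       * Record the given server as the last known assignment for each of the given regions.
       */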
900   synchronized void setLastRegionServerOfRegions(
901       final ServerName serverName, final List<HRegionInfo> regionInfos) {
902     for (HRegionInfo hri: regionInfos) {
903       setLastRegionServerOfRegion(serverName, hri.getEncodedName());
904     }
905   }
906 
907   synchronized void setLastRegionServerOfRegion(
908       final ServerName serverName, final String encodedName) {
909     lastAssignments.put(encodedName, serverName);
910   }
911 
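      /**
       * A split has passed the point of no return: persist it via the region state store
       * and update serverHoldings so the parent region is replaced by the two daughters
       * on the hosting server.
       */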
912   void splitRegion(HRegionInfo p,
913       HRegionInfo a, HRegionInfo b, ServerName sn) throws IOException {
914 
915     regionStateStore.splitRegion(p, a, b, sn, getRegionReplication(p));
916     synchronized (this) {
917       // After PONR, split is considered to be done.
918       // Update server holdings to be aligned with the meta.
919       Set<HRegionInfo> regions = serverHoldings.get(sn);
920       if (regions == null) {
921         throw new IllegalStateException(sn + " should host some regions");
922       }
923       regions.remove(p);
924       regions.add(a);
925       regions.add(b);
926     }
927   }
928 
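      /**
       * A merge has passed the point of no return: persist it via the region state store
       * and update serverHoldings so the two merging regions are replaced by the merged
       * region on the hosting server.
       */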
929   void mergeRegions(HRegionInfo p,
930       HRegionInfo a, HRegionInfo b, ServerName sn) throws IOException {
931     regionStateStore.mergeRegions(p, a, b, sn, getRegionReplication(a));
932     synchronized (this) {
933       // After PONR, merge is considered to be done.
934       // Update server holdings to be aligned with the meta.
935       Set<HRegionInfo> regions = serverHoldings.get(sn);
936       if (regions == null) {
937         throw new IllegalStateException(sn + " should host some regions");
938       }
939       regions.remove(a);
940       regions.remove(b);
941       regions.add(p);
942     }
943   }
944 
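      /**
       * @return the region replication count from the table's descriptor, or 1 if it is unavailable
       */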
945   private int getRegionReplication(HRegionInfo r) throws IOException {
946     if (tableStateManager != null) {
947       HTableDescriptor htd = ((MasterServices)server).getTableDescriptors().get(r.getTable());
948       if (htd != null) {
949         return htd.getRegionReplication();
950       }
951     }
952     return 1;
953   }
954 
955   /**
956    * At cluster clean re/start, mark all user regions closed except those of tables
957    * that are excluded, such as disabled/disabling/enabling tables. All user regions
958    * and their previous locations are returned.
959    */
960   synchronized Map<HRegionInfo, ServerName> closeAllUserRegions(Set<TableName> excludedTables) {
961     boolean noExcludeTables = excludedTables == null || excludedTables.isEmpty();
962     Set<HRegionInfo> toBeClosed = new HashSet<HRegionInfo>(regionStates.size());
963     for (RegionState state: regionStates.values()) {
964       HRegionInfo hri = state.getRegion();
965       if (state.isSplit() || hri.isSplit()) {
966         continue;
967       }
968       TableName tableName = hri.getTable();
969       if (!TableName.META_TABLE_NAME.equals(tableName)
970           && (noExcludeTables || !excludedTables.contains(tableName))) {
971         toBeClosed.add(hri);
972       }
973     }
974     Map<HRegionInfo, ServerName> allUserRegions =
975       new HashMap<HRegionInfo, ServerName>(toBeClosed.size());
976     for (HRegionInfo hri: toBeClosed) {
977       RegionState regionState = updateRegionState(hri, State.CLOSED);
978       allUserRegions.put(hri, regionState.getServerName());
979     }
980     return allUserRegions;
981   }
982 
983   /**
984    * Compute the average load across all region servers.
985    * Currently, this uses a very naive computation - just uses the number of
986    * regions being served, ignoring stats about number of requests.
987    * @return the average load
988    */
989   protected synchronized double getAverageLoad() {
990     int numServers = 0, totalLoad = 0;
991     for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
992       Set<HRegionInfo> regions = e.getValue();
993       ServerName serverName = e.getKey();
994       int regionCount = regions.size();
995       if (serverManager.isServerOnline(serverName)) {
996         totalLoad += regionCount;
997         numServers++;
998       }
999     }
1000     if (numServers > 1) {
1001       // The master region server holds only a couple regions.
1002       // Don't consider this server in calculating the average load
1003       // if there are other region servers to avoid possible confusion.
1004       Set<HRegionInfo> hris = serverHoldings.get(server.getServerName());
1005       if (hris != null) {
1006         totalLoad -= hris.size();
1007         numServers--;
1008       }
1009     }
1010     return numServers == 0 ? 0.0 :
1011       (double)totalLoad / (double)numServers;
1012   }
1013 
1014   /**
1015    * This is an EXPENSIVE clone.  Cloning though is the safest thing to do.
1016    * Can't let out the original since it can change and at least the load balancer
1017    * wants to iterate this exported list.  We need to synchronize on this instance
1018    * since all access to serverHoldings is done under a lock on it.
1019    *
1020    * @return A clone of current assignments by table.
1021    */
1022   protected Map<TableName, Map<ServerName, List<HRegionInfo>>>
1023       getAssignmentsByTable() {
1024     Map<TableName, Map<ServerName, List<HRegionInfo>>> result =
1025       new HashMap<TableName, Map<ServerName,List<HRegionInfo>>>();
1026     synchronized (this) {
1027       if (!server.getConfiguration().getBoolean("hbase.master.loadbalance.bytable", false)) {
1028         Map<ServerName, List<HRegionInfo>> svrToRegions =
1029           new HashMap<ServerName, List<HRegionInfo>>(serverHoldings.size());
1030         for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
1031           svrToRegions.put(e.getKey(), new ArrayList<HRegionInfo>(e.getValue()));
1032         }
1033         result.put(TableName.valueOf("ensemble"), svrToRegions);
1034       } else {
1035         for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
1036           for (HRegionInfo hri: e.getValue()) {
1037             if (hri.isMetaRegion()) continue;
1038             TableName tablename = hri.getTable();
1039             Map<ServerName, List<HRegionInfo>> svrToRegions = result.get(tablename);
1040             if (svrToRegions == null) {
1041               svrToRegions = new HashMap<ServerName, List<HRegionInfo>>(serverHoldings.size());
1042               result.put(tablename, svrToRegions);
1043             }
1044             List<HRegionInfo> regions = svrToRegions.get(e.getKey());
1045             if (regions == null) {
1046               regions = new ArrayList<HRegionInfo>();
1047               svrToRegions.put(e.getKey(), regions);
1048             }
1049             regions.add(hri);
1050           }
1051         }
1052       }
1053     }
1054 
1055     Map<ServerName, ServerLoad>
1056       onlineSvrs = serverManager.getOnlineServers();
1057     // Take care of servers w/o assignments, and remove servers in draining mode
1058     List<ServerName> drainingServers = this.serverManager.getDrainingServersList();
1059     for (Map<ServerName, List<HRegionInfo>> map: result.values()) {
1060       for (ServerName svr: onlineSvrs.keySet()) {
1061         if (!map.containsKey(svr)) {
1062           map.put(svr, new ArrayList<HRegionInfo>());
1063         }
1064       }
1065       map.keySet().removeAll(drainingServers);
1066     }
1067     return result;
1068   }
1069 
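       /**
        * @return the current in-memory state of the given region, or null if it is not tracked
        */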
1070   protected RegionState getRegionState(final HRegionInfo hri) {
1071     return getRegionState(hri.getEncodedName());
1072   }
1073 
1074   /**
1075    * Returns a clone of region assignments per server
1076    * @return a Map of ServerName to a List of HRegionInfo's
1077    */
1078   protected synchronized Map<ServerName, List<HRegionInfo>> getRegionAssignmentsByServer() {
1079     Map<ServerName, List<HRegionInfo>> regionsByServer =
1080         new HashMap<ServerName, List<HRegionInfo>>(serverHoldings.size());
1081     for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
1082       regionsByServer.put(e.getKey(), new ArrayList<HRegionInfo>(e.getValue()));
1083     }
1084     return regionsByServer;
1085   }
1086 
1087   protected synchronized RegionState getRegionState(final String encodedName) {
1088     return regionStates.get(encodedName);
1089   }
1090 
1091   /**
1092    * Get the HRegionInfo from cache, if not there, from the hbase:meta table
1093    * @param  regionName
1094    * @return HRegionInfo for the region
1095    */
1096   @SuppressWarnings("deprecation")
1097   protected HRegionInfo getRegionInfo(final byte [] regionName) {
1098     String encodedName = HRegionInfo.encodeRegionName(regionName);
1099     RegionState regionState = getRegionState(encodedName);
1100     if (regionState != null) {
1101       return regionState.getRegion();
1102     }
1103 
1104     try {
1105       Pair<HRegionInfo, ServerName> p =
1106         MetaTableAccessor.getRegion(server.getConnection(), regionName);
1107       HRegionInfo hri = p == null ? null : p.getFirst();
1108       if (hri != null) {
1109         createRegionState(hri);
1110       }
1111       return hri;
1112     } catch (IOException e) {
1113       server.abort("Aborting because error occurred while reading "
1114         + Bytes.toStringBinary(regionName) + " from hbase:meta", e);
1115       return null;
1116     }
1117   }
1118 
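       /**
        * @return true if the given region state is non-null and its state is one of the specified states
        */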
1119   static boolean isOneOfStates(RegionState regionState, State... states) {
1120     State s = regionState != null ? regionState.getState() : null;
1121     for (State state: states) {
1122       if (s == state) return true;
1123     }
1124     return false;
1125   }
1126 
1127   /**
1128    * Update a region state. It will be put in transition if not already there.
1129    */
1130   private RegionState updateRegionState(final HRegionInfo hri,
1131       final State state, final ServerName serverName, long openSeqNum) {
1132     if (state == State.FAILED_CLOSE || state == State.FAILED_OPEN) {
1133       LOG.warn("Failed to open/close " + hri.getShortNameToLog()
1134         + " on " + serverName + ", set to " + state);
1135     }
1136 
1137     String encodedName = hri.getEncodedName();
1138     RegionState regionState = new RegionState(
1139       hri, state, System.currentTimeMillis(), serverName);
1140     RegionState oldState = getRegionState(encodedName);
1141     if (!regionState.equals(oldState)) {
1142       LOG.info("Transition " + oldState + " to " + regionState);
1143       // Persist region state before updating in-memory info, if needed
1144       regionStateStore.updateRegionState(openSeqNum, regionState, oldState);
1145     }
1146 
1147     synchronized (this) {
1148       regionsInTransition.put(encodedName, regionState);
1149       putRegionState(regionState);
1150 
1151       // For these states, region should be properly closed.
1152       // There should be no log splitting issue.
1153       if ((state == State.CLOSED || state == State.MERGED
1154           || state == State.SPLIT) && lastAssignments.containsKey(encodedName)) {
1155         ServerName last = lastAssignments.get(encodedName);
1156         if (last.equals(serverName)) {
1157           lastAssignments.remove(encodedName);
1158         } else {
1159           LOG.warn(encodedName + " moved to " + state + " on "
1160             + serverName + ", expected " + last);
1161         }
1162       }
1163 
1164       // Once a region is opened, record its last assignment right away.
1165       if (serverName != null && state == State.OPEN) {
1166         ServerName last = lastAssignments.get(encodedName);
1167         if (!serverName.equals(last)) {
1168           lastAssignments.put(encodedName, serverName);
1169           if (last != null && isServerDeadAndNotProcessed(last)) {
1170             LOG.warn(encodedName + " moved to " + serverName
1171               + ", while its previous host " + last
1172               + " is dead but not processed yet");
1173           }
1174         }
1175       }
1176 
1177       // notify the change
1178       this.notifyAll();
1179     }
1180     return regionState;
1181   }
1182 }