View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.master;
19  
20  import java.io.IOException;
21  import java.util.ArrayList;
22  import java.util.Collection;
23  import java.util.Collections;
24  import java.util.HashMap;
25  import java.util.HashSet;
26  import java.util.Iterator;
27  import java.util.List;
28  import java.util.Map;
29  import java.util.Set;
30  import java.util.TreeMap;
31  
32  import org.apache.commons.logging.Log;
33  import org.apache.commons.logging.LogFactory;
34  import org.apache.hadoop.classification.InterfaceAudience;
35  import org.apache.hadoop.conf.Configuration;
36  import org.apache.hadoop.hbase.HConstants;
37  import org.apache.hadoop.hbase.HRegionInfo;
38  import org.apache.hadoop.hbase.MetaTableAccessor;
39  import org.apache.hadoop.hbase.Server;
40  import org.apache.hadoop.hbase.ServerLoad;
41  import org.apache.hadoop.hbase.ServerName;
42  import org.apache.hadoop.hbase.TableName;
43  import org.apache.hadoop.hbase.TableStateManager;
44  import org.apache.hadoop.hbase.client.RegionReplicaUtil;
45  import org.apache.hadoop.hbase.master.RegionState.State;
46  import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos;
47  import org.apache.hadoop.hbase.util.Bytes;
48  import org.apache.hadoop.hbase.util.Pair;
49  
50  import com.google.common.annotations.VisibleForTesting;
51  import com.google.common.base.Preconditions;
52  
53  /**
54   * Region state accountant. It holds the states of all regions in memory.
55   * Under normal conditions, it should match the meta table and the true region states.
56   *
57   * This map is used by AssignmentManager to track region states.
58   */
59  @InterfaceAudience.Private
60  public class RegionStates {
61    private static final Log LOG = LogFactory.getLog(RegionStates.class);
62  
63    /**
64     * Regions currently in transition.
65     */
66    final HashMap<String, RegionState> regionsInTransition;
67  
68    /**
69     * Region encoded name to state map.
70     * All the regions should be in this map.
71     */
72    private final Map<String, RegionState> regionStates;
73  
74    /**
75     * Server to regions assignment map.
76     * Contains the set of regions currently assigned to a given server.
77     */
78    private final Map<ServerName, Set<HRegionInfo>> serverHoldings;
79  
80    /**
81     * Maintains the mapping from the default region to the replica regions.
82     */
83    private final Map<HRegionInfo, Set<HRegionInfo>> defaultReplicaToOtherReplicas;
84  
85    /**
86     * Region to server assignment map.
87     * Contains the server a given region is currently assigned to.
88     */
89    private final TreeMap<HRegionInfo, ServerName> regionAssignments;
90  
91    /**
92     * Encoded region name to server assignment map for re-assignment
93     * purpose. Contains the server a given region is last known assigned
94     * to, which has not completed log splitting, so not assignable.
95     * If a region is currently assigned, this server info in this
96     * map should be the same as that in regionAssignments.
97     * However the info in regionAssignments is cleared when the region
98     * is offline while the info in lastAssignments is cleared when
99     * the region is closed or the server is dead and processed.
100    */
101   private final HashMap<String, ServerName> lastAssignments;
102 
103   /**
104    * Map a host port pair string to the latest start code
105    * of a region server which is known to be dead. It is dead
106    * to us, but server manager may not know it yet.
107    */
108   private final HashMap<String, Long> deadServers;
109 
110   /**
111    * Maps a dead server to the time when its log split was done.
112    * Since log splitting is not ordered, we have to remember
113    * all processed instances. The map is cleaned up based
114    * on a configured time. By default, we assume a dead
115    * server should be done with log splitting in two hours.
116    */
117   private final HashMap<ServerName, Long> processedServers;
118   private long lastProcessedServerCleanTime;
119 
120   private final TableStateManager tableStateManager;
121   private final RegionStateStore regionStateStore;
122   private final ServerManager serverManager;
123   private final Server server;
124 
125   // The maximum time to keep a log split info in region states map
126   static final String LOG_SPLIT_TIME = "hbase.master.maximum.logsplit.keeptime";
127   static final long DEFAULT_LOG_SPLIT_TIME = 7200000L; // 2 hours
128 
/**
 * Creates a tracker with every bookkeeping map empty.
 *
 * @param master the hosting server; its configuration, stopped flag and
 *   server name are read by other methods of this class
 * @param tableStateManager queried for table state when a region goes offline
 * @param serverManager queried for region server liveness and reachability
 * @param regionStateStore invoked when a region split or merge is recorded
 */
RegionStates(final Server master, final TableStateManager tableStateManager,
    final ServerManager serverManager, final RegionStateStore regionStateStore) {
  // Collaborators handed in by the caller.
  this.server = master;
  this.serverManager = serverManager;
  this.tableStateManager = tableStateManager;
  this.regionStateStore = regionStateStore;

  // Per-region bookkeeping.
  this.regionStates = new HashMap<String, RegionState>();
  this.regionsInTransition = new HashMap<String, RegionState>();
  this.regionAssignments = new TreeMap<HRegionInfo, ServerName>();
  this.lastAssignments = new HashMap<String, ServerName>();
  this.defaultReplicaToOtherReplicas = new HashMap<HRegionInfo, Set<HRegionInfo>>();

  // Per-server bookkeeping.
  this.serverHoldings = new HashMap<ServerName, Set<HRegionInfo>>();
  this.deadServers = new HashMap<String, Long>();
  this.processedServers = new HashMap<ServerName, Long>();
}
144 
145   /**
146    * @return an unmodifiable the region assignment map
147    */
148   public synchronized Map<HRegionInfo, ServerName> getRegionAssignments() {
149     return Collections.unmodifiableMap(regionAssignments);
150   }
151 
152   /**
153    * Return the replicas (including default) for the regions grouped by ServerName
154    * @param regions
155    * @return a pair containing the groupings as a map
156    */
157   synchronized Map<ServerName, List<HRegionInfo>> getRegionAssignments(
158     Collection<HRegionInfo> regions) {
159     Map<ServerName, List<HRegionInfo>> map = new HashMap<ServerName, List<HRegionInfo>>();
160     for (HRegionInfo region : regions) {
161       HRegionInfo defaultReplica = RegionReplicaUtil.getRegionInfoForDefaultReplica(region);
162       Set<HRegionInfo> allReplicas = defaultReplicaToOtherReplicas.get(defaultReplica);
163       if (allReplicas != null) {
164         for (HRegionInfo hri : allReplicas) {
165           ServerName server = regionAssignments.get(hri);
166           if (server != null) {
167             List<HRegionInfo> regionsOnServer = map.get(server);
168             if (regionsOnServer == null) {
169               regionsOnServer = new ArrayList<HRegionInfo>(1);
170               map.put(server, regionsOnServer);
171             }
172             regionsOnServer.add(hri);
173           }
174         }
175       }
176     }
177     return map;
178   }
179 
180   public synchronized ServerName getRegionServerOfRegion(HRegionInfo hri) {
181     return regionAssignments.get(hri);
182   }
183 
184   /**
185    * Get regions in transition and their states
186    */
187   @SuppressWarnings("unchecked")
188   public synchronized Map<String, RegionState> getRegionsInTransition() {
189     return (Map<String, RegionState>)regionsInTransition.clone();
190   }
191 
192   /**
193    * @return True if specified region in transition.
194    */
195   public synchronized boolean isRegionInTransition(final HRegionInfo hri) {
196     return regionsInTransition.containsKey(hri.getEncodedName());
197   }
198 
199   /**
200    * @return True if specified region in transition.
201    */
202   public synchronized boolean isRegionInTransition(final String encodedName) {
203     return regionsInTransition.containsKey(encodedName);
204   }
205 
206   /**
207    * @return True if any region in transition.
208    */
209   public synchronized boolean isRegionsInTransition() {
210     return !regionsInTransition.isEmpty();
211   }
212 
213   /**
214    * @return True if specified region assigned, and not in transition.
215    */
216   public synchronized boolean isRegionOnline(final HRegionInfo hri) {
217     return !isRegionInTransition(hri) && regionAssignments.containsKey(hri);
218   }
219 
220   /**
221    * @return True if specified region offline/closed, but not in transition.
222    * If the region is not in the map, it is offline to us too.
223    */
224   public synchronized boolean isRegionOffline(final HRegionInfo hri) {
225     return getRegionState(hri) == null || (!isRegionInTransition(hri)
226       && isRegionInState(hri, State.OFFLINE, State.CLOSED));
227   }
228 
229   /**
230    * @return True if specified region is in one of the specified states.
231    */
232   public boolean isRegionInState(
233       final HRegionInfo hri, final State... states) {
234     return isRegionInState(hri.getEncodedName(), states);
235   }
236 
237   /**
238    * @return True if specified region is in one of the specified states.
239    */
240   public boolean isRegionInState(
241       final String encodedName, final State... states) {
242     RegionState regionState = getRegionState(encodedName);
243     return isOneOfStates(regionState, states);
244   }
245 
246   /**
247    * Wait for the state map to be updated by assignment manager.
248    */
249   public synchronized void waitForUpdate(
250       final long timeout) throws InterruptedException {
251     this.wait(timeout);
252   }
253 
254   /**
255    * Get region transition state
256    */
257   public RegionState getRegionTransitionState(final HRegionInfo hri) {
258     return getRegionTransitionState(hri.getEncodedName());
259   }
260 
261   /**
262    * Get region transition state
263    */
264   public synchronized RegionState
265       getRegionTransitionState(final String encodedName) {
266     return regionsInTransition.get(encodedName);
267   }
268 
269   /**
270    * Add a list of regions to RegionStates. If a region is split
271    * and offline, its state will be SPLIT. Otherwise, its state will
272    * be OFFLINE. Region already in RegionStates will be skipped.
273    */
274   public void createRegionStates(
275       final List<HRegionInfo> hris) {
276     for (HRegionInfo hri: hris) {
277       createRegionState(hri);
278     }
279   }
280 
281   /**
282    * Add a region to RegionStates. If the region is split
283    * and offline, its state will be SPLIT. Otherwise, its state will
284    * be OFFLINE. If it is already in RegionStates, this call has
285    * no effect, and the original state is returned.
286    */
287   public RegionState createRegionState(final HRegionInfo hri) {
288     return createRegionState(hri, null, null, null);
289   }
290 
291   /**
292    * Add a region to RegionStates with the specified state.
293    * If the region is already in RegionStates, this call has
294    * no effect, and the original state is returned.
295    *
296    * @param hri the region info to create a state for
297    * @param newState the state to the region in set to
298    * @param serverName the server the region is transitioning on
299    * @param lastHost the last server that hosts the region
300    * @return the current state
301    */
302   public synchronized RegionState createRegionState(final HRegionInfo hri,
303       State newState, ServerName serverName, ServerName lastHost) {
304     if (newState == null || (newState == State.OPEN && serverName == null)) {
305       newState =  State.OFFLINE;
306     }
307     if (hri.isOffline() && hri.isSplit()) {
308       newState = State.SPLIT;
309       serverName = null;
310     }
311     String encodedName = hri.getEncodedName();
312     RegionState regionState = regionStates.get(encodedName);
313     if (regionState != null) {
314       LOG.warn("Tried to create a state for a region already in RegionStates, "
315         + "used existing: " + regionState + ", ignored new: " + newState);
316     } else {
317       regionState = new RegionState(hri, newState, serverName);
318       regionStates.put(encodedName, regionState);
319       if (newState == State.OPEN) {
320         if (!serverName.equals(lastHost)) {
321           LOG.warn("Open region's last host " + lastHost
322             + " should be the same as the current one " + serverName
323             + ", ignored the last and used the current one");
324           lastHost = serverName;
325         }
326         lastAssignments.put(encodedName, lastHost);
327         regionAssignments.put(hri, lastHost);
328       } else if (!regionState.isUnassignable()) {
329         regionsInTransition.put(encodedName, regionState);
330       }
331       if (lastHost != null && newState != State.SPLIT) {
332         addToServerHoldings(lastHost, hri);
333       }
334     }
335     return regionState;
336   }
337 
338   /**
339    * Update a region state. It will be put in transition if not already there.
340    */
341   public RegionState updateRegionState(
342       final HRegionInfo hri, final State state) {
343     RegionState regionState = getRegionState(hri.getEncodedName());
344     return updateRegionState(hri, state,
345       regionState == null ? null : regionState.getServerName());
346   }
347 
348   /**
349    * Update a region state. It will be put in transition if not already there.
350    */
351   public RegionState updateRegionState(
352       final HRegionInfo hri, final State state, final ServerName serverName) {
353     return updateRegionState(hri, state, serverName, HConstants.NO_SEQNUM);
354   }
355 
356   public void regionOnline(
357       final HRegionInfo hri, final ServerName serverName) {
358     regionOnline(hri, serverName, HConstants.NO_SEQNUM);
359   }
360 
361   /**
362    * A region is online, won't be in transition any more.
363    * We can't confirm it is really online on specified region server
364    * because it hasn't been put in region server's online region list yet.
365    */
366   public void regionOnline(final HRegionInfo hri,
367       final ServerName serverName, long openSeqNum) {
368     if (!serverManager.isServerOnline(serverName)) {
369       // This is possible if the region server dies before master gets a
370       // chance to handle ZK event in time. At this time, if the dead server
371       // is already processed by SSH, we should ignore this event.
372       // If not processed yet, ignore and let SSH deal with it.
373       LOG.warn("Ignored, " + hri.getEncodedName()
374         + " was opened on a dead server: " + serverName);
375       return;
376     }
377     updateRegionState(hri, State.OPEN, serverName, openSeqNum);
378 
379     synchronized (this) {
380       regionsInTransition.remove(hri.getEncodedName());
381       ServerName oldServerName = regionAssignments.put(hri, serverName);
382       if (!serverName.equals(oldServerName)) {
383         LOG.info("Onlined " + hri.getShortNameToLog() + " on " + serverName);
384         addToServerHoldings(serverName, hri);
385         addToReplicaMapping(hri);
386         if (oldServerName != null && serverHoldings.containsKey(oldServerName)) {
387           LOG.info("Offlined " + hri.getShortNameToLog() + " from " + oldServerName);
388           removeFromServerHoldings(oldServerName, hri);
389         }
390       }
391     }
392   }
393 
394   private void addToServerHoldings(ServerName serverName, HRegionInfo hri) {
395     Set<HRegionInfo> regions = serverHoldings.get(serverName);
396     if (regions == null) {
397       regions = new HashSet<HRegionInfo>();
398       serverHoldings.put(serverName, regions);
399     }
400     regions.add(hri);
401   }
402 
403   private void addToReplicaMapping(HRegionInfo hri) {
404     HRegionInfo defaultReplica = RegionReplicaUtil.getRegionInfoForDefaultReplica(hri);
405     Set<HRegionInfo> replicas =
406         defaultReplicaToOtherReplicas.get(defaultReplica);
407     if (replicas == null) {
408       replicas = new HashSet<HRegionInfo>();
409       defaultReplicaToOtherReplicas.put(defaultReplica, replicas);
410     }
411     replicas.add(hri);
412   }
413 
414   private void removeFromServerHoldings(ServerName serverName, HRegionInfo hri) {
415     Set<HRegionInfo> oldRegions = serverHoldings.get(serverName);
416     oldRegions.remove(hri);
417     if (oldRegions.isEmpty()) {
418       serverHoldings.remove(serverName);
419     }
420   }
421 
422   private void removeFromReplicaMapping(HRegionInfo hri) {
423     HRegionInfo defaultReplica = RegionReplicaUtil.getRegionInfoForDefaultReplica(hri);
424     Set<HRegionInfo> replicas = defaultReplicaToOtherReplicas.get(defaultReplica);
425     if (replicas != null) {
426       replicas.remove(hri);
427       if (replicas.isEmpty()) {
428         defaultReplicaToOtherReplicas.remove(defaultReplica);
429       }
430     }
431   }
432 
433   /**
434    * A dead server's hlogs have been split so that all the regions
435    * used to be open on it can be safely assigned now. Mark them assignable.
436    */
437   public synchronized void logSplit(final ServerName serverName) {
438     for (Iterator<Map.Entry<String, ServerName>> it
439         = lastAssignments.entrySet().iterator(); it.hasNext();) {
440       Map.Entry<String, ServerName> e = it.next();
441       if (e.getValue().equals(serverName)) {
442         it.remove();
443       }
444     }
445     long now = System.currentTimeMillis();
446     if (LOG.isDebugEnabled()) {
447       LOG.debug("Adding to processed servers " + serverName);
448     }
449     processedServers.put(serverName, Long.valueOf(now));
450     Configuration conf = server.getConfiguration();
451     long obsoleteTime = conf.getLong(LOG_SPLIT_TIME, DEFAULT_LOG_SPLIT_TIME);
452     // Doesn't have to be very accurate about the clean up time
453     if (now > lastProcessedServerCleanTime + obsoleteTime) {
454       lastProcessedServerCleanTime = now;
455       long cutoff = now - obsoleteTime;
456       for (Iterator<Map.Entry<ServerName, Long>> it
457           = processedServers.entrySet().iterator(); it.hasNext();) {
458         Map.Entry<ServerName, Long> e = it.next();
459         if (e.getValue().longValue() < cutoff) {
460           if (LOG.isDebugEnabled()) {
461             LOG.debug("Removed from processed servers " + e.getKey());
462           }
463           it.remove();
464         }
465       }
466     }
467   }
468 
469   /**
470    * Log split is done for a given region, so it is assignable now.
471    */
472   public void logSplit(final HRegionInfo region) {
473     clearLastAssignment(region);
474   }
475 
476   public synchronized void clearLastAssignment(final HRegionInfo region) {
477     lastAssignments.remove(region.getEncodedName());
478   }
479 
480   /**
481    * A region is offline, won't be in transition any more.
482    */
483   public void regionOffline(final HRegionInfo hri) {
484     regionOffline(hri, null);
485   }
486 
487   /**
488    * A region is offline, won't be in transition any more. Its state
489    * should be the specified expected state, which can only be
490    * Split/Merged/Offline/null(=Offline)/SplittingNew/MergingNew.
491    */
492   public void regionOffline(
493       final HRegionInfo hri, final State expectedState) {
494     Preconditions.checkArgument(expectedState == null
495       || RegionState.isUnassignable(expectedState),
496         "Offlined region should not be " + expectedState);
497     if (isRegionInState(hri, State.SPLITTING_NEW, State.MERGING_NEW)) {
498       // Remove it from all region maps
499       deleteRegion(hri);
500       return;
501     }
502     State newState =
503       expectedState == null ? State.OFFLINE : expectedState;
504     updateRegionState(hri, newState);
505 
506     synchronized (this) {
507       regionsInTransition.remove(hri.getEncodedName());
508       ServerName oldServerName = regionAssignments.remove(hri);
509       if (oldServerName != null && serverHoldings.containsKey(oldServerName)
510           && (newState == State.MERGED || newState == State.SPLIT
511             || hri.isMetaRegion() || tableStateManager.isTableState(hri.getTable(),
512               ZooKeeperProtos.Table.State.DISABLED, ZooKeeperProtos.Table.State.DISABLING))) {
513         // Offline the region only if it's merged/split, or the table is disabled/disabling.
514         // Otherwise, offline it from this server only when it is online on a different server.
515         LOG.info("Offlined " + hri.getShortNameToLog() + " from " + oldServerName);
516         removeFromServerHoldings(oldServerName, hri);
517         removeFromReplicaMapping(hri);
518       }
519     }
520   }
521 
522   /**
523    * A server is offline, all regions on it are dead.
524    */
525   public synchronized List<HRegionInfo> serverOffline(final ServerName sn) {
526     // Offline all regions on this server not already in transition.
527     List<HRegionInfo> rits = new ArrayList<HRegionInfo>();
528     Set<HRegionInfo> assignedRegions = serverHoldings.get(sn);
529     if (assignedRegions == null) {
530       assignedRegions = new HashSet<HRegionInfo>();
531     }
532 
533     // Offline regions outside the loop to avoid ConcurrentModificationException
534     Set<HRegionInfo> regionsToOffline = new HashSet<HRegionInfo>();
535     for (HRegionInfo region : assignedRegions) {
536       // Offline open regions, no need to offline if SPLIT/MERGED/OFFLINE
537       if (isRegionOnline(region)) {
538         regionsToOffline.add(region);
539       } else if (isRegionInState(region, State.SPLITTING, State.MERGING)) {
540         LOG.debug("Offline splitting/merging region " + getRegionState(region));
541         regionsToOffline.add(region);
542       }
543     }
544 
545     for (HRegionInfo hri : regionsToOffline) {
546       regionOffline(hri);
547     }
548 
549     for (RegionState state : regionsInTransition.values()) {
550       HRegionInfo hri = state.getRegion();
551       if (assignedRegions.contains(hri)) {
552         // Region is open on this region server, but in transition.
553         // This region must be moving away from this server, or splitting/merging.
554         // SSH will handle it, either skip assigning, or re-assign.
555         LOG.info("Transitioning " + state + " will be handled by SSH for " + sn);
556       } else if (sn.equals(state.getServerName())) {
557         // Region is in transition on this region server, and this
558         // region is not open on this server. So the region must be
559         // moving to this server from another one (i.e. opening or
560         // pending open on this server, was open on another one.
561         // Offline state is also kind of pending open if the region is in
562         // transition. The region could be in failed_close state too if we have
563         // tried several times to open it while this region server is not reachable)
564         if (state.isPendingOpenOrOpening() || state.isFailedClose() || state.isOffline()) {
565           LOG.info("Found region in " + state + " to be reassigned by SSH for " + sn);
566           rits.add(hri);
567         } else {
568           LOG.warn("THIS SHOULD NOT HAPPEN: unexpected " + state);
569         }
570       }
571     }
572 
573     this.notifyAll();
574     return rits;
575   }
576 
577   /**
578    * Gets the online regions of the specified table.
579    * This method looks at the in-memory state.  It does not go to <code>hbase:meta</code>.
580    * Only returns <em>online</em> regions.  If a region on this table has been
581    * closed during a disable, etc., it will be included in the returned list.
582    * So, the returned list may not necessarily be ALL regions in this table, its
583    * all the ONLINE regions in the table.
584    * @param tableName
585    * @return Online regions from <code>tableName</code>
586    */
587   public synchronized List<HRegionInfo> getRegionsOfTable(TableName tableName) {
588     List<HRegionInfo> tableRegions = new ArrayList<HRegionInfo>();
589     // boundary needs to have table's name but regionID 0 so that it is sorted
590     // before all table's regions.
591     HRegionInfo boundary = new HRegionInfo(tableName, null, null, false, 0L);
592     for (HRegionInfo hri: regionAssignments.tailMap(boundary).keySet()) {
593       if(!hri.getTable().equals(tableName)) break;
594       tableRegions.add(hri);
595     }
596     return tableRegions;
597   }
598 
599 
600   /**
601    * Wait on region to clear regions-in-transition.
602    * <p>
603    * If the region isn't in transition, returns immediately.  Otherwise, method
604    * blocks until the region is out of transition.
605    */
606   public synchronized void waitOnRegionToClearRegionsInTransition(
607       final HRegionInfo hri) throws InterruptedException {
608     if (!isRegionInTransition(hri)) return;
609 
610     while(!server.isStopped() && isRegionInTransition(hri)) {
611       RegionState rs = getRegionState(hri);
612       LOG.info("Waiting on " + rs + " to clear regions-in-transition");
613       waitForUpdate(100);
614     }
615 
616     if (server.isStopped()) {
617       LOG.info("Giving up wait on region in " +
618         "transition because stoppable.isStopped is set");
619     }
620   }
621 
622   /**
623    * A table is deleted. Remove its regions from all internal maps.
624    * We loop through all regions assuming we don't delete tables too much.
625    */
626   public void tableDeleted(final TableName tableName) {
627     Set<HRegionInfo> regionsToDelete = new HashSet<HRegionInfo>();
628     synchronized (this) {
629       for (RegionState state: regionStates.values()) {
630         HRegionInfo region = state.getRegion();
631         if (region.getTable().equals(tableName)) {
632           regionsToDelete.add(region);
633         }
634       }
635     }
636     for (HRegionInfo region: regionsToDelete) {
637       deleteRegion(region);
638     }
639   }
640 
641   /**
642    * Get a copy of all regions assigned to a server
643    */
644   public synchronized Set<HRegionInfo> getServerRegions(ServerName serverName) {
645     Set<HRegionInfo> regions = serverHoldings.get(serverName);
646     if (regions == null) return null;
647     return new HashSet<HRegionInfo>(regions);
648   }
649 
650   /**
651    * Remove a region from all state maps.
652    */
653   @VisibleForTesting
654   public synchronized void deleteRegion(final HRegionInfo hri) {
655     String encodedName = hri.getEncodedName();
656     regionsInTransition.remove(encodedName);
657     regionStates.remove(encodedName);
658     lastAssignments.remove(encodedName);
659     ServerName sn = regionAssignments.remove(hri);
660     if (sn != null) {
661       Set<HRegionInfo> regions = serverHoldings.get(sn);
662       regions.remove(hri);
663     }
664   }
665 
666   /**
667    * Checking if a region was assigned to a server which is not online now.
668    * If so, we should hold re-assign this region till SSH has split its hlogs.
669    * Once logs are split, the last assignment of this region will be reset,
670    * which means a null last assignment server is ok for re-assigning.
671    *
672    * A region server could be dead but we don't know it yet. We may
673    * think it's online falsely. Therefore if a server is online, we still
674    * need to confirm it reachable and having the expected start code.
675    */
676   synchronized boolean wasRegionOnDeadServer(final String encodedName) {
677     ServerName server = lastAssignments.get(encodedName);
678     return isServerDeadAndNotProcessed(server);
679   }
680 
681   synchronized boolean isServerDeadAndNotProcessed(ServerName server) {
682     if (server == null) return false;
683     if (serverManager.isServerOnline(server)) {
684       String hostAndPort = server.getHostAndPort();
685       long startCode = server.getStartcode();
686       Long deadCode = deadServers.get(hostAndPort);
687       if (deadCode == null || startCode > deadCode.longValue()) {
688         if (serverManager.isServerReachable(server)) {
689           return false;
690         }
691         // The size of deadServers won't grow unbounded.
692         deadServers.put(hostAndPort, Long.valueOf(startCode));
693       }
694       // Watch out! If the server is not dead, the region could
695       // remain unassigned. That's why ServerManager#isServerReachable
696       // should use some retry.
697       //
698       // We cache this info since it is very unlikely for that
699       // instance to come back up later on. We don't want to expire
700       // the server since we prefer to let it die naturally.
701       LOG.warn("Couldn't reach online server " + server);
702     }
703     // Now, we know it's dead. Check if it's processed
704     return !processedServers.containsKey(server);
705   }
706 
707  /**
708    * Get the last region server a region was on for purpose of re-assignment,
709    * i.e. should the re-assignment be held back till log split is done?
710    */
711   synchronized ServerName getLastRegionServerOfRegion(final String encodedName) {
712     return lastAssignments.get(encodedName);
713   }
714 
715   synchronized void setLastRegionServerOfRegions(
716       final ServerName serverName, final List<HRegionInfo> regionInfos) {
717     for (HRegionInfo hri: regionInfos) {
718       setLastRegionServerOfRegion(serverName, hri.getEncodedName());
719     }
720   }
721 
722   synchronized void setLastRegionServerOfRegion(
723       final ServerName serverName, final String encodedName) {
724     lastAssignments.put(encodedName, serverName);
725   }
726 
727   void splitRegion(HRegionInfo p,
728       HRegionInfo a, HRegionInfo b, ServerName sn) throws IOException {
729     regionStateStore.splitRegion(p, a, b, sn);
730     synchronized (this) {
731       // After PONR, split is considered to be done.
732       // Update server holdings to be aligned with the meta.
733       Set<HRegionInfo> regions = serverHoldings.get(sn);
734       if (regions == null) {
735         throw new IllegalStateException(sn + " should host some regions");
736       }
737       regions.remove(p);
738       regions.add(a);
739       regions.add(b);
740     }
741   }
742 
743   void mergeRegions(HRegionInfo p,
744       HRegionInfo a, HRegionInfo b, ServerName sn) throws IOException {
745     regionStateStore.mergeRegions(p, a, b, sn);
746     synchronized (this) {
747       // After PONR, merge is considered to be done.
748       // Update server holdings to be aligned with the meta.
749       Set<HRegionInfo> regions = serverHoldings.get(sn);
750       if (regions == null) {
751         throw new IllegalStateException(sn + " should host some regions");
752       }
753       regions.remove(a);
754       regions.remove(b);
755       regions.add(p);
756     }
757   }
758 
759   /**
760    * At cluster clean re/start, mark all user regions closed except those of tables
761    * that are excluded, such as disabled/disabling/enabling tables. All user regions
762    * and their previous locations are returned.
763    */
764   synchronized Map<HRegionInfo, ServerName> closeAllUserRegions(Set<TableName> excludedTables) {
765     boolean noExcludeTables = excludedTables == null || excludedTables.isEmpty();
766     Set<HRegionInfo> toBeClosed = new HashSet<HRegionInfo>(regionStates.size());
767     for(RegionState state: regionStates.values()) {
768       HRegionInfo hri = state.getRegion();
769       if (state.isSplit() || hri.isSplit()) {
770         continue;
771       }
772       TableName tableName = hri.getTable();
773       if (!TableName.META_TABLE_NAME.equals(tableName)
774           && (noExcludeTables || !excludedTables.contains(tableName))) {
775         toBeClosed.add(hri);
776       }
777     }
778     Map<HRegionInfo, ServerName> allUserRegions =
779       new HashMap<HRegionInfo, ServerName>(toBeClosed.size());
780     for (HRegionInfo hri: toBeClosed) {
781       RegionState regionState = updateRegionState(hri, State.CLOSED);
782       allUserRegions.put(hri, regionState.getServerName());
783     }
784     return allUserRegions;
785   }
786 
787   /**
788    * Compute the average load across all region servers.
789    * Currently, this uses a very naive computation - just uses the number of
790    * regions being served, ignoring stats about number of requests.
791    * @return the average load
792    */
793   protected synchronized double getAverageLoad() {
794     int numServers = 0, totalLoad = 0;
795     for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
796       Set<HRegionInfo> regions = e.getValue();
797       ServerName serverName = e.getKey();
798       int regionCount = regions.size();
799       if (regionCount > 0 || serverManager.isServerOnline(serverName)) {
800         totalLoad += regionCount;
801         numServers++;
802       }
803     }
804     if (numServers > 1) {
805       // The master region server holds only a couple regions.
806       // Don't consider this server in calculating the average load
807       // if there are other region servers to avoid possible confusion.
808       Set<HRegionInfo> hris = serverHoldings.get(server.getServerName());
809       if (hris != null) {
810         totalLoad -= hris.size();
811         numServers--;
812       }
813     }
814     return numServers == 0 ? 0.0 :
815       (double)totalLoad / (double)numServers;
816   }
817 
818   /**
819    * This is an EXPENSIVE clone.  Cloning though is the safest thing to do.
820    * Can't let out original since it can change and at least the load balancer
821    * wants to iterate this exported list.  We need to synchronize on regions
822    * since all access to this.servers is under a lock on this.regions.
823    *
824    * @return A clone of current assignments by table.
825    */
826   protected Map<TableName, Map<ServerName, List<HRegionInfo>>>
827       getAssignmentsByTable() {
828     Map<TableName, Map<ServerName, List<HRegionInfo>>> result =
829       new HashMap<TableName, Map<ServerName,List<HRegionInfo>>>();
830     synchronized (this) {
831       if (!server.getConfiguration().getBoolean("hbase.master.loadbalance.bytable", false)) {
832         Map<ServerName, List<HRegionInfo>> svrToRegions =
833           new HashMap<ServerName, List<HRegionInfo>>(serverHoldings.size());
834         for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
835           svrToRegions.put(e.getKey(), new ArrayList<HRegionInfo>(e.getValue()));
836         }
837         result.put(TableName.valueOf("ensemble"), svrToRegions);
838       } else {
839         for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
840           for (HRegionInfo hri: e.getValue()) {
841             if (hri.isMetaRegion()) continue;
842             TableName tablename = hri.getTable();
843             Map<ServerName, List<HRegionInfo>> svrToRegions = result.get(tablename);
844             if (svrToRegions == null) {
845               svrToRegions = new HashMap<ServerName, List<HRegionInfo>>(serverHoldings.size());
846               result.put(tablename, svrToRegions);
847             }
848             List<HRegionInfo> regions = svrToRegions.get(e.getKey());
849             if (regions == null) {
850               regions = new ArrayList<HRegionInfo>();
851               svrToRegions.put(e.getKey(), regions);
852             }
853             regions.add(hri);
854           }
855         }
856       }
857     }
858 
859     Map<ServerName, ServerLoad>
860       onlineSvrs = serverManager.getOnlineServers();
861     // Take care of servers w/o assignments.
862     for (Map<ServerName, List<HRegionInfo>> map: result.values()) {
863       for (ServerName svr: onlineSvrs.keySet()) {
864         if (!map.containsKey(svr)) {
865           map.put(svr, new ArrayList<HRegionInfo>());
866         }
867       }
868     }
869     return result;
870   }
871 
872   protected RegionState getRegionState(final HRegionInfo hri) {
873     return getRegionState(hri.getEncodedName());
874   }
875 
876   /**
877    * Returns a clone of region assignments per server
878    * @return a Map of ServerName to a List of HRegionInfo's
879    */
880   protected synchronized Map<ServerName, List<HRegionInfo>> getRegionAssignmentsByServer() {
881     Map<ServerName, List<HRegionInfo>> regionsByServer =
882         new HashMap<ServerName, List<HRegionInfo>>(serverHoldings.size());
883     for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
884       regionsByServer.put(e.getKey(), new ArrayList<HRegionInfo>(e.getValue()));
885     }
886     return regionsByServer;
887   }
888 
889   protected synchronized RegionState getRegionState(final String encodedName) {
890     return regionStates.get(encodedName);
891   }
892 
893   /**
894    * Get the HRegionInfo from cache, if not there, from the hbase:meta table
895    * @param  regionName
896    * @return HRegionInfo for the region
897    */
898   @SuppressWarnings("deprecation")
899   protected HRegionInfo getRegionInfo(final byte [] regionName) {
900     String encodedName = HRegionInfo.encodeRegionName(regionName);
901     RegionState regionState = getRegionState(encodedName);
902     if (regionState != null) {
903       return regionState.getRegion();
904     }
905 
906     try {
907       Pair<HRegionInfo, ServerName> p =
908         MetaTableAccessor.getRegion(server.getShortCircuitConnection(), regionName);
909       HRegionInfo hri = p == null ? null : p.getFirst();
910       if (hri != null) {
911         createRegionState(hri);
912       }
913       return hri;
914     } catch (IOException e) {
915       server.abort("Aborting because error occoured while reading "
916         + Bytes.toStringBinary(regionName) + " from hbase:meta", e);
917       return null;
918     }
919   }
920 
921   static boolean isOneOfStates(RegionState regionState, State... states) {
922     State s = regionState != null ? regionState.getState() : null;
923     for (State state: states) {
924       if (s == state) return true;
925     }
926     return false;
927   }
928 
929   /**
930    * Update a region state. It will be put in transition if not already there.
931    */
932   private RegionState updateRegionState(final HRegionInfo hri,
933       final State state, final ServerName serverName, long openSeqNum) {
934     if (state == State.FAILED_CLOSE || state == State.FAILED_OPEN) {
935       LOG.warn("Failed to open/close " + hri.getShortNameToLog()
936         + " on " + serverName + ", set to " + state);
937     }
938 
939     String encodedName = hri.getEncodedName();
940     RegionState regionState = new RegionState(
941       hri, state, System.currentTimeMillis(), serverName);
942     RegionState oldState = getRegionState(encodedName);
943     if (!regionState.equals(oldState)) {
944       LOG.info("Transition " + oldState + " to " + regionState);
945       // Persist region state before updating in-memory info, if needed
946       regionStateStore.updateRegionState(openSeqNum, regionState, oldState);
947     }
948 
949     synchronized (this) {
950       regionsInTransition.put(encodedName, regionState);
951       regionStates.put(encodedName, regionState);
952 
953       // For these states, region should be properly closed.
954       // There should be no log splitting issue.
955       if ((state == State.CLOSED || state == State.MERGED
956           || state == State.SPLIT) && lastAssignments.containsKey(encodedName)) {
957         ServerName last = lastAssignments.get(encodedName);
958         if (last.equals(serverName)) {
959           lastAssignments.remove(encodedName);
960         } else {
961           LOG.warn(encodedName + " moved to " + state + " on "
962             + serverName + ", expected " + last);
963         }
964       }
965 
966       // Once a region is opened, record its last assignment right away.
967       if (serverName != null && state == State.OPEN) {
968         ServerName last = lastAssignments.get(encodedName);
969         if (!serverName.equals(last)) {
970           lastAssignments.put(encodedName, serverName);
971           if (last != null && isServerDeadAndNotProcessed(last)) {
972             LOG.warn(encodedName + " moved to " + serverName
973               + ", while it's previous host " + last
974               + " is dead but not processed yet");
975           }
976         }
977       }
978 
979       // notify the change
980       this.notifyAll();
981     }
982     return regionState;
983   }
984 }