View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.master;
19  
20  import java.io.IOException;
21  import java.util.ArrayList;
22  import java.util.Collection;
23  import java.util.Collections;
24  import java.util.HashMap;
25  import java.util.HashSet;
26  import java.util.Iterator;
27  import java.util.List;
28  import java.util.Map;
29  import java.util.Set;
30  import java.util.TreeMap;
31  
32  import com.google.common.annotations.VisibleForTesting;
33  import com.google.common.base.Preconditions;
34  
35  import org.apache.commons.logging.Log;
36  import org.apache.commons.logging.LogFactory;
37  import org.apache.hadoop.hbase.classification.InterfaceAudience;
38  import org.apache.hadoop.conf.Configuration;
39  import org.apache.hadoop.hbase.HConstants;
40  import org.apache.hadoop.hbase.HRegionInfo;
41  import org.apache.hadoop.hbase.HTableDescriptor;
42  import org.apache.hadoop.hbase.MetaTableAccessor;
43  import org.apache.hadoop.hbase.ServerLoad;
44  import org.apache.hadoop.hbase.ServerName;
45  import org.apache.hadoop.hbase.TableName;
46  import org.apache.hadoop.hbase.client.RegionReplicaUtil;
47  import org.apache.hadoop.hbase.master.RegionState.State;
48  import org.apache.hadoop.hbase.client.TableState;
49  import org.apache.hadoop.hbase.util.Bytes;
50  import org.apache.hadoop.hbase.util.FSUtils;
51  import org.apache.hadoop.hbase.util.Pair;
52  
53  /**
54   * Region state accountant. It holds the states of all regions in the memory.
55   * In normal scenario, it should match the meta table and the true region states.
56   *
57   * This map is used by AssignmentManager to track region states.
58   */
59  @InterfaceAudience.Private
60  public class RegionStates {
  private static final Log LOG = LogFactory.getLog(RegionStates.class);

  /**
   * Regions currently in transition, keyed by encoded region name.
   */
  final HashMap<String, RegionState> regionsInTransition =
    new HashMap<String, RegionState>();

  /**
   * Region encoded name to state map.
   * All the regions should be in this map.
   */
  private final Map<String, RegionState> regionStates =
    new HashMap<String, RegionState>();

  /**
   * Per-table index of region states: table -> (encoded region name -> region state).
   * Maintained alongside {@link #regionStates}.
   */
  private final Map<TableName, Map<String, RegionState>> regionStatesTableIndex =
      new HashMap<TableName, Map<String, RegionState>>();

  /**
   * Server to regions assignment map.
   * Contains the set of regions currently assigned to a given server.
   */
  private final Map<ServerName, Set<HRegionInfo>> serverHoldings =
    new HashMap<ServerName, Set<HRegionInfo>>();

  /**
   * Maintains the mapping from the default region replica to the set of
   * replicas registered for it.
   */
  private final Map<HRegionInfo, Set<HRegionInfo>> defaultReplicaToOtherReplicas =
    new HashMap<HRegionInfo, Set<HRegionInfo>>();

  /**
   * Region to server assignment map.
   * Contains the server a given region is currently assigned to.
   */
  private final TreeMap<HRegionInfo, ServerName> regionAssignments =
    new TreeMap<HRegionInfo, ServerName>();

  /**
   * Encoded region name to server assignment map for re-assignment
   * purposes. Contains the server a given region was last known to be
   * assigned to, which has not completed log splitting, so the region is
   * not assignable. If a region is currently assigned, the server info in
   * this map should be the same as that in regionAssignments.
   * However, the info in regionAssignments is cleared when the region
   * is offline, while the info in lastAssignments is cleared when
   * the region is closed or the server is dead and processed.
   */
  private final HashMap<String, ServerName> lastAssignments =
    new HashMap<String, ServerName>();

  /**
   * Encoded region name to server assignment map, used to clean up
   * serverHoldings when a region comes online on a new server.
   * When the region goes offline from the previous server, we clean up
   * regionAssignments so that it has the latest assignment map, but we
   * do not clean up serverHoldings to match the meta. We need this map
   * to find the old server whose serverHoldings needs cleanup, given a
   * moved region.
   */
  private final HashMap<String, ServerName> oldAssignments =
    new HashMap<String, ServerName>();

  /**
   * Maps a host and port pair string to the latest start code
   * of a region server which is known to be dead. It is dead
   * to us, but server manager may not know it yet.
   */
  private final HashMap<String, Long> deadServers =
    new HashMap<String, Long>();

  /**
   * Maps dead servers to the time when their log split was done.
   * Since log splitting is not ordered, we have to remember
   * all processed instances. The map is cleaned up based
   * on a configured time. By default, we assume a dead
   * server should be done with log splitting in two hours.
   */
  private final HashMap<ServerName, Long> processedServers =
    new HashMap<ServerName, Long>();
  // Time (System.currentTimeMillis) of the last purge of processedServers.
  private long lastProcessedServerCleanTime;

  private final TableStateManager tableStateManager;
  private final RegionStateStore regionStateStore;
  private final ServerManager serverManager;
  private final MasterServices server;

  // Configuration key for the maximum time (in milliseconds) to keep
  // log split info in the processed servers map.
  static final String LOG_SPLIT_TIME = "hbase.master.maximum.logsplit.keeptime";
  static final long DEFAULT_LOG_SPLIT_TIME = 7200000L; // 2 hours
154 
155   RegionStates(final MasterServices master, final TableStateManager tableStateManager,
156       final ServerManager serverManager, final RegionStateStore regionStateStore) {
157     this.tableStateManager = tableStateManager;
158     this.regionStateStore = regionStateStore;
159     this.serverManager = serverManager;
160     this.server = master;
161   }
162 
163   /**
164    * @return an unmodifiable the region assignment map
165    */
166   public synchronized Map<HRegionInfo, ServerName> getRegionAssignments() {
167     return Collections.unmodifiableMap(regionAssignments);
168   }
169 
170   /**
171    * Return the replicas (including default) for the regions grouped by ServerName
172    * @param regions
173    * @return a pair containing the groupings as a map
174    */
175   synchronized Map<ServerName, List<HRegionInfo>> getRegionAssignments(
176     Collection<HRegionInfo> regions) {
177     Map<ServerName, List<HRegionInfo>> map = new HashMap<ServerName, List<HRegionInfo>>();
178     for (HRegionInfo region : regions) {
179       HRegionInfo defaultReplica = RegionReplicaUtil.getRegionInfoForDefaultReplica(region);
180       Set<HRegionInfo> allReplicas = defaultReplicaToOtherReplicas.get(defaultReplica);
181       if (allReplicas != null) {
182         for (HRegionInfo hri : allReplicas) {
183           ServerName server = regionAssignments.get(hri);
184           if (server != null) {
185             List<HRegionInfo> regionsOnServer = map.get(server);
186             if (regionsOnServer == null) {
187               regionsOnServer = new ArrayList<HRegionInfo>(1);
188               map.put(server, regionsOnServer);
189             }
190             regionsOnServer.add(hri);
191           }
192         }
193       }
194     }
195     return map;
196   }
197 
198   public synchronized ServerName getRegionServerOfRegion(HRegionInfo hri) {
199     return regionAssignments.get(hri);
200   }
201 
202   /**
203    * Get regions in transition and their states
204    */
205   @SuppressWarnings("unchecked")
206   public synchronized Map<String, RegionState> getRegionsInTransition() {
207     return (Map<String, RegionState>)regionsInTransition.clone();
208   }
209 
210   /**
211    * @return True if specified region in transition.
212    */
213   public synchronized boolean isRegionInTransition(final HRegionInfo hri) {
214     return regionsInTransition.containsKey(hri.getEncodedName());
215   }
216 
217   /**
218    * @return True if specified region in transition.
219    */
220   public synchronized boolean isRegionInTransition(final String encodedName) {
221     return regionsInTransition.containsKey(encodedName);
222   }
223 
224   /**
225    * @return True if any region in transition.
226    */
227   public synchronized boolean isRegionsInTransition() {
228     return !regionsInTransition.isEmpty();
229   }
230 
231   /**
232    * @return True if specified region assigned, and not in transition.
233    */
234   public synchronized boolean isRegionOnline(final HRegionInfo hri) {
235     return !isRegionInTransition(hri) && regionAssignments.containsKey(hri);
236   }
237 
238   /**
239    * @return True if specified region offline/closed, but not in transition.
240    * If the region is not in the map, it is offline to us too.
241    */
242   public synchronized boolean isRegionOffline(final HRegionInfo hri) {
243     return getRegionState(hri) == null || (!isRegionInTransition(hri)
244       && isRegionInState(hri, State.OFFLINE, State.CLOSED));
245   }
246 
247   /**
248    * @return True if specified region is in one of the specified states.
249    */
250   public boolean isRegionInState(
251       final HRegionInfo hri, final State... states) {
252     return isRegionInState(hri.getEncodedName(), states);
253   }
254 
255   /**
256    * @return True if specified region is in one of the specified states.
257    */
258   public boolean isRegionInState(
259       final String encodedName, final State... states) {
260     RegionState regionState = getRegionState(encodedName);
261     return isOneOfStates(regionState, states);
262   }
263 
264   /**
265    * Wait for the state map to be updated by assignment manager.
266    */
267   public synchronized void waitForUpdate(
268       final long timeout) throws InterruptedException {
269     this.wait(timeout);
270   }
271 
272   /**
273    * Get region transition state
274    */
275   public RegionState getRegionTransitionState(final HRegionInfo hri) {
276     return getRegionTransitionState(hri.getEncodedName());
277   }
278 
279   /**
280    * Get region transition state
281    */
282   public synchronized RegionState
283       getRegionTransitionState(final String encodedName) {
284     return regionsInTransition.get(encodedName);
285   }
286 
287   /**
288    * Add a list of regions to RegionStates. If a region is split
289    * and offline, its state will be SPLIT. Otherwise, its state will
290    * be OFFLINE. Region already in RegionStates will be skipped.
291    */
292   public void createRegionStates(
293       final List<HRegionInfo> hris) {
294     for (HRegionInfo hri: hris) {
295       createRegionState(hri);
296     }
297   }
298 
299   /**
300    * Add a region to RegionStates. If the region is split
301    * and offline, its state will be SPLIT. Otherwise, its state will
302    * be OFFLINE. If it is already in RegionStates, this call has
303    * no effect, and the original state is returned.
304    */
305   public RegionState createRegionState(final HRegionInfo hri) {
306     return createRegionState(hri, null, null, null);
307   }
308 
309   /**
310    * Add a region to RegionStates with the specified state.
311    * If the region is already in RegionStates, this call has
312    * no effect, and the original state is returned.
313    *
314    * @param hri the region info to create a state for
315    * @param newState the state to the region in set to
316    * @param serverName the server the region is transitioning on
317    * @param lastHost the last server that hosts the region
318    * @return the current state
319    */
320   public synchronized RegionState createRegionState(final HRegionInfo hri,
321       State newState, ServerName serverName, ServerName lastHost) {
322     if (newState == null || (newState == State.OPEN && serverName == null)) {
323       newState =  State.OFFLINE;
324     }
325     if (hri.isOffline() && hri.isSplit()) {
326       newState = State.SPLIT;
327       serverName = null;
328     }
329     String encodedName = hri.getEncodedName();
330     RegionState regionState = regionStates.get(encodedName);
331     if (regionState != null) {
332       LOG.warn("Tried to create a state for a region already in RegionStates, "
333         + "used existing: " + regionState + ", ignored new: " + newState);
334     } else {
335       regionState = new RegionState(hri, newState, serverName);
336       putRegionState(regionState);
337       if (newState == State.OPEN) {
338         if (!serverName.equals(lastHost)) {
339           LOG.warn("Open region's last host " + lastHost
340             + " should be the same as the current one " + serverName
341             + ", ignored the last and used the current one");
342           lastHost = serverName;
343         }
344         lastAssignments.put(encodedName, lastHost);
345         regionAssignments.put(hri, lastHost);
346       } else if (!isOneOfStates(regionState, State.MERGED, State.SPLIT, State.OFFLINE)) {
347         regionsInTransition.put(encodedName, regionState);
348       }
349       if (lastHost != null && newState != State.SPLIT) {
350         addToServerHoldings(lastHost, hri);
351         if (newState != State.OPEN) {
352           oldAssignments.put(encodedName, lastHost);
353         }
354       }
355     }
356     return regionState;
357   }
358 
359   private RegionState putRegionState(RegionState regionState) {
360     HRegionInfo hri = regionState.getRegion();
361     String encodedName = hri.getEncodedName();
362     TableName table = hri.getTable();
363     RegionState oldState = regionStates.put(encodedName, regionState);
364     Map<String, RegionState> map = regionStatesTableIndex.get(table);
365     if (map == null) {
366       map = new HashMap<String, RegionState>();
367       regionStatesTableIndex.put(table, map);
368     }
369     map.put(encodedName, regionState);
370     return oldState;
371   }
372 
373   /**
374    * Update a region state. It will be put in transition if not already there.
375    */
376   public RegionState updateRegionState(
377       final HRegionInfo hri, final State state) {
378     RegionState regionState = getRegionState(hri.getEncodedName());
379     return updateRegionState(hri, state,
380       regionState == null ? null : regionState.getServerName());
381   }
382 
383   /**
384    * Update a region state. It will be put in transition if not already there.
385    */
386   public RegionState updateRegionState(
387       final HRegionInfo hri, final State state, final ServerName serverName) {
388     return updateRegionState(hri, state, serverName, HConstants.NO_SEQNUM);
389   }
390 
391   public void regionOnline(final HRegionInfo hri, final ServerName serverName) {
392     regionOnline(hri, serverName, HConstants.NO_SEQNUM);
393   }
394 
395   /**
396    * A region is online, won't be in transition any more.
397    * We can't confirm it is really online on specified region server
398    * because it hasn't been put in region server's online region list yet.
399    */
400   public void regionOnline(final HRegionInfo hri, final ServerName serverName, long openSeqNum) {
401     String encodedName = hri.getEncodedName();
402     if (!serverManager.isServerOnline(serverName)) {
403       // This is possible if the region server dies before master gets a
404       // chance to handle ZK event in time. At this time, if the dead server
405       // is already processed by SSH, we should ignore this event.
406       // If not processed yet, ignore and let SSH deal with it.
407       LOG.warn("Ignored, " + encodedName + " was opened on a dead server: " + serverName);
408       return;
409     }
410     updateRegionState(hri, State.OPEN, serverName, openSeqNum);
411 
412     synchronized (this) {
413       regionsInTransition.remove(encodedName);
414       ServerName oldServerName = regionAssignments.put(hri, serverName);
415       if (!serverName.equals(oldServerName)) {
416         if (LOG.isDebugEnabled()) {
417           LOG.debug("Onlined " + hri.getShortNameToLog() + " on " + serverName);
418         } else {
419           LOG.debug("Onlined " + hri.getShortNameToLog() + " on " + serverName);
420         }
421         addToServerHoldings(serverName, hri);
422         addToReplicaMapping(hri);
423         if (oldServerName == null) {
424           oldServerName = oldAssignments.remove(encodedName);
425         }
426         if (oldServerName != null
427             && !oldServerName.equals(serverName)
428             && serverHoldings.containsKey(oldServerName)) {
429           LOG.info("Offlined " + hri.getShortNameToLog() + " from " + oldServerName);
430           removeFromServerHoldings(oldServerName, hri);
431         }
432       }
433     }
434   }
435 
436   private void addToServerHoldings(ServerName serverName, HRegionInfo hri) {
437     Set<HRegionInfo> regions = serverHoldings.get(serverName);
438     if (regions == null) {
439       regions = new HashSet<HRegionInfo>();
440       serverHoldings.put(serverName, regions);
441     }
442     regions.add(hri);
443   }
444 
445   private void addToReplicaMapping(HRegionInfo hri) {
446     HRegionInfo defaultReplica = RegionReplicaUtil.getRegionInfoForDefaultReplica(hri);
447     Set<HRegionInfo> replicas =
448         defaultReplicaToOtherReplicas.get(defaultReplica);
449     if (replicas == null) {
450       replicas = new HashSet<HRegionInfo>();
451       defaultReplicaToOtherReplicas.put(defaultReplica, replicas);
452     }
453     replicas.add(hri);
454   }
455 
456   private void removeFromServerHoldings(ServerName serverName, HRegionInfo hri) {
457     Set<HRegionInfo> oldRegions = serverHoldings.get(serverName);
458     oldRegions.remove(hri);
459     if (oldRegions.isEmpty()) {
460       serverHoldings.remove(serverName);
461     }
462   }
463 
464   private void removeFromReplicaMapping(HRegionInfo hri) {
465     HRegionInfo defaultReplica = RegionReplicaUtil.getRegionInfoForDefaultReplica(hri);
466     Set<HRegionInfo> replicas = defaultReplicaToOtherReplicas.get(defaultReplica);
467     if (replicas != null) {
468       replicas.remove(hri);
469       if (replicas.isEmpty()) {
470         defaultReplicaToOtherReplicas.remove(defaultReplica);
471       }
472     }
473   }
474 
475   /**
476    * A dead server's wals have been split so that all the regions
477    * used to be open on it can be safely assigned now. Mark them assignable.
478    */
479   public synchronized void logSplit(final ServerName serverName) {
480     for (Iterator<Map.Entry<String, ServerName>> it
481         = lastAssignments.entrySet().iterator(); it.hasNext();) {
482       Map.Entry<String, ServerName> e = it.next();
483       if (e.getValue().equals(serverName)) {
484         it.remove();
485       }
486     }
487     long now = System.currentTimeMillis();
488     if (LOG.isDebugEnabled()) {
489       LOG.debug("Adding to log splitting servers " + serverName);
490     }
491     processedServers.put(serverName, Long.valueOf(now));
492     Configuration conf = server.getConfiguration();
493     long obsoleteTime = conf.getLong(LOG_SPLIT_TIME, DEFAULT_LOG_SPLIT_TIME);
494     // Doesn't have to be very accurate about the clean up time
495     if (now > lastProcessedServerCleanTime + obsoleteTime) {
496       lastProcessedServerCleanTime = now;
497       long cutoff = now - obsoleteTime;
498       for (Iterator<Map.Entry<ServerName, Long>> it
499           = processedServers.entrySet().iterator(); it.hasNext();) {
500         Map.Entry<ServerName, Long> e = it.next();
501         if (e.getValue().longValue() < cutoff) {
502           if (LOG.isDebugEnabled()) {
503             LOG.debug("Removed from log splitting servers " + e.getKey());
504           }
505           it.remove();
506         }
507       }
508     }
509   }
510 
511   /**
512    * Log split is done for a given region, so it is assignable now.
513    */
514   public void logSplit(final HRegionInfo region) {
515     clearLastAssignment(region);
516   }
517 
518   public synchronized void clearLastAssignment(final HRegionInfo region) {
519     lastAssignments.remove(region.getEncodedName());
520   }
521 
522   /**
523    * A region is offline, won't be in transition any more.
524    */
525   public void regionOffline(final HRegionInfo hri) {
526     regionOffline(hri, null);
527   }
528 
529   /**
530    * A region is offline, won't be in transition any more. Its state
531    * should be the specified expected state, which can only be
532    * Split/Merged/Offline/null(=Offline)/SplittingNew/MergingNew.
533    */
534   public void regionOffline(
535       final HRegionInfo hri, final State expectedState) {
536     Preconditions.checkArgument(expectedState == null
537       || RegionState.isUnassignable(expectedState),
538         "Offlined region should not be " + expectedState);
539     if (isRegionInState(hri, State.SPLITTING_NEW, State.MERGING_NEW)) {
540       // Remove it from all region maps
541       deleteRegion(hri);
542       return;
543     }
544     State newState =
545       expectedState == null ? State.OFFLINE : expectedState;
546     updateRegionState(hri, newState);
547     String encodedName = hri.getEncodedName();
548     synchronized (this) {
549       regionsInTransition.remove(encodedName);
550       ServerName oldServerName = regionAssignments.remove(hri);
551       if (oldServerName != null && serverHoldings.containsKey(oldServerName)) {
552         if (newState == State.MERGED || newState == State.SPLIT
553             || hri.isMetaRegion() || tableStateManager.isTableState(hri.getTable(),
554               TableState.State.DISABLED, TableState.State.DISABLING)) {
555           // Offline the region only if it's merged/split, or the table is disabled/disabling.
556           // Otherwise, offline it from this server only when it is online on a different server.
557           LOG.info("Offlined " + hri.getShortNameToLog() + " from " + oldServerName);
558           removeFromServerHoldings(oldServerName, hri);
559           removeFromReplicaMapping(hri);
560         } else {
561           // Need to remember it so that we can offline it from this
562           // server when it is online on a different server.
563           oldAssignments.put(encodedName, oldServerName);
564         }
565       }
566     }
567   }
568 
569   /**
570    * A server is offline, all regions on it are dead.
571    */
572   public List<HRegionInfo> serverOffline(final ServerName sn) {
573     // Offline all regions on this server not already in transition.
574     List<HRegionInfo> rits = new ArrayList<HRegionInfo>();
575     Set<HRegionInfo> regionsToCleanIfNoMetaEntry = new HashSet<HRegionInfo>();
576     // Offline regions outside the loop and synchronized block to avoid
577     // ConcurrentModificationException and deadlock in case of meta anassigned,
578     // but RegionState a blocked.
579     Set<HRegionInfo> regionsToOffline = new HashSet<HRegionInfo>();
580     synchronized (this) {
581       Set<HRegionInfo> assignedRegions = serverHoldings.get(sn);
582       if (assignedRegions == null) {
583         assignedRegions = new HashSet<HRegionInfo>();
584       }
585 
586       for (HRegionInfo region : assignedRegions) {
587         // Offline open regions, no need to offline if SPLIT/MERGED/OFFLINE
588         if (isRegionOnline(region)) {
589           regionsToOffline.add(region);
590         } else if (isRegionInState(region, State.SPLITTING, State.MERGING)) {
591           LOG.debug("Offline splitting/merging region " + getRegionState(region));
592           regionsToOffline.add(region);
593         }
594       }
595 
596       for (RegionState state : regionsInTransition.values()) {
597         HRegionInfo hri = state.getRegion();
598         if (assignedRegions.contains(hri)) {
599           // Region is open on this region server, but in transition.
600           // This region must be moving away from this server, or splitting/merging.
601           // SSH will handle it, either skip assigning, or re-assign.
602           LOG.info("Transitioning " + state + " will be handled by ServerCrashProcedure for " + sn);
603         } else if (sn.equals(state.getServerName())) {
604           // Region is in transition on this region server, and this
605           // region is not open on this server. So the region must be
606           // moving to this server from another one (i.e. opening or
607           // pending open on this server, was open on another one.
608           // Offline state is also kind of pending open if the region is in
609           // transition. The region could be in failed_close state too if we have
610           // tried several times to open it while this region server is not reachable)
611           if (isOneOfStates(state, State.OPENING, State.PENDING_OPEN,
612               State.FAILED_OPEN, State.FAILED_CLOSE, State.OFFLINE)) {
613             LOG.info("Found region in " + state +
614               " to be reassigned by ServerCrashProcedure for " + sn);
615             rits.add(hri);
616           } else if (isOneOfStates(state, State.SPLITTING_NEW)) {
617             regionsToCleanIfNoMetaEntry.add(state.getRegion());
618           } else {
619             LOG.warn("THIS SHOULD NOT HAPPEN: unexpected " + state);
620           }
621         }
622       }
623       this.notifyAll();
624     }
625 
626     for (HRegionInfo hri : regionsToOffline) {
627       regionOffline(hri);
628     }
629 
630     cleanIfNoMetaEntry(regionsToCleanIfNoMetaEntry);
631     return rits;
632   }
633 
634   /**
635    * This method does an RPC to hbase:meta. Do not call this method with a lock/synchronize held.
636    * @param hris The hris to check if empty in hbase:meta and if so, clean them up.
637    */
638   private void cleanIfNoMetaEntry(Set<HRegionInfo> hris) {
639     if (hris.isEmpty()) return;
640     for (HRegionInfo hri: hris) {
641       try {
642         // This is RPC to meta table. It is done while we have a synchronize on
643         // regionstates. No progress will be made if meta is not available at this time.
644         // This is a cleanup task. Not critical.
645         if (MetaTableAccessor.getRegion(server.getConnection(), hri.getEncodedNameAsBytes()) ==
646             null) {
647           regionOffline(hri);
648           FSUtils.deleteRegionDir(server.getConfiguration(), hri);
649         }
650       } catch (IOException e) {
651         LOG.warn("Got exception while deleting " + hri + " directories from file system.", e);
652       }
653     }
654   }
655 
656   /**
657    * Gets the online regions of the specified table.
658    * This method looks at the in-memory state.  It does not go to <code>hbase:meta</code>.
659    * Only returns <em>online</em> regions.  If a region on this table has been
660    * closed during a disable, etc., it will be included in the returned list.
661    * So, the returned list may not necessarily be ALL regions in this table, its
662    * all the ONLINE regions in the table.
663    * @param tableName
664    * @return Online regions from <code>tableName</code>
665    */
666   public synchronized List<HRegionInfo> getRegionsOfTable(TableName tableName) {
667     List<HRegionInfo> tableRegions = new ArrayList<HRegionInfo>();
668     // boundary needs to have table's name but regionID 0 so that it is sorted
669     // before all table's regions.
670     HRegionInfo boundary = new HRegionInfo(tableName, null, null, false, 0L);
671     for (HRegionInfo hri: regionAssignments.tailMap(boundary).keySet()) {
672       if(!hri.getTable().equals(tableName)) break;
673       tableRegions.add(hri);
674     }
675     return tableRegions;
676   }
677 
678   /**
679    * Gets current state of all regions of the table.
680    * This method looks at the in-memory state.  It does not go to <code>hbase:meta</code>.
681    * Method guaranteed to return keys for all states
682    * in {@link org.apache.hadoop.hbase.master.RegionState.State}
683    *
684    * @param tableName
685    * @return Online regions from <code>tableName</code>
686    */
687   public synchronized Map<RegionState.State, List<HRegionInfo>>
688   getRegionByStateOfTable(TableName tableName) {
689     Map<RegionState.State, List<HRegionInfo>> tableRegions =
690         new HashMap<State, List<HRegionInfo>>();
691     for (State state : State.values()) {
692       tableRegions.put(state, new ArrayList<HRegionInfo>());
693     }
694     Map<String, RegionState> indexMap = regionStatesTableIndex.get(tableName);
695     if (indexMap == null)
696       return tableRegions;
697     for (RegionState regionState : indexMap.values()) {
698       tableRegions.get(regionState.getState()).add(regionState.getRegion());
699     }
700     return tableRegions;
701   }
702 
703   /**
704    * Wait on region to clear regions-in-transition.
705    * <p>
706    * If the region isn't in transition, returns immediately.  Otherwise, method
707    * blocks until the region is out of transition.
708    */
709   public synchronized void waitOnRegionToClearRegionsInTransition(
710       final HRegionInfo hri) throws InterruptedException {
711     if (!isRegionInTransition(hri)) return;
712 
713     while(!server.isStopped() && isRegionInTransition(hri)) {
714       RegionState rs = getRegionState(hri);
715       LOG.info("Waiting on " + rs + " to clear regions-in-transition");
716       waitForUpdate(100);
717     }
718 
719     if (server.isStopped()) {
720       LOG.info("Giving up wait on region in " +
721         "transition because stoppable.isStopped is set");
722     }
723   }
724 
725   /**
726    * A table is deleted. Remove its regions from all internal maps.
727    * We loop through all regions assuming we don't delete tables too much.
728    */
729   public void tableDeleted(final TableName tableName) {
730     Set<HRegionInfo> regionsToDelete = new HashSet<HRegionInfo>();
731     synchronized (this) {
732       for (RegionState state: regionStates.values()) {
733         HRegionInfo region = state.getRegion();
734         if (region.getTable().equals(tableName)) {
735           regionsToDelete.add(region);
736         }
737       }
738     }
739     for (HRegionInfo region: regionsToDelete) {
740       deleteRegion(region);
741     }
742   }
743 
744   /**
745    * Get a copy of all regions assigned to a server
746    */
747   public synchronized Set<HRegionInfo> getServerRegions(ServerName serverName) {
748     Set<HRegionInfo> regions = serverHoldings.get(serverName);
749     if (regions == null) return null;
750     return new HashSet<HRegionInfo>(regions);
751   }
752 
753   /**
754    * Remove a region from all state maps.
755    */
756   @VisibleForTesting
757   public synchronized void deleteRegion(final HRegionInfo hri) {
758     String encodedName = hri.getEncodedName();
759     regionsInTransition.remove(encodedName);
760     regionStates.remove(encodedName);
761     TableName table = hri.getTable();
762     Map<String, RegionState> indexMap = regionStatesTableIndex.get(table);
763     indexMap.remove(encodedName);
764     if (indexMap.size() == 0)
765       regionStatesTableIndex.remove(table);
766     lastAssignments.remove(encodedName);
767     ServerName sn = regionAssignments.remove(hri);
768     if (sn != null) {
769       Set<HRegionInfo> regions = serverHoldings.get(sn);
770       regions.remove(hri);
771     }
772   }
773 
774   /**
775    * Checking if a region was assigned to a server which is not online now.
776    * If so, we should hold re-assign this region till SSH has split its wals.
777    * Once logs are split, the last assignment of this region will be reset,
778    * which means a null last assignment server is ok for re-assigning.
779    *
780    * A region server could be dead but we don't know it yet. We may
781    * think it's online falsely. Therefore if a server is online, we still
782    * need to confirm it reachable and having the expected start code.
783    */
784   synchronized boolean wasRegionOnDeadServer(final String encodedName) {
785     ServerName server = lastAssignments.get(encodedName);
786     return isServerDeadAndNotProcessed(server);
787   }
788 
789   synchronized boolean isServerDeadAndNotProcessed(ServerName server) {
790     if (server == null) return false;
791     if (serverManager.isServerOnline(server)) {
792       String hostAndPort = server.getHostAndPort();
793       long startCode = server.getStartcode();
794       Long deadCode = deadServers.get(hostAndPort);
795       if (deadCode == null || startCode > deadCode.longValue()) {
796         if (serverManager.isServerReachable(server)) {
797           return false;
798         }
799         // The size of deadServers won't grow unbounded.
800         deadServers.put(hostAndPort, Long.valueOf(startCode));
801       }
802       // Watch out! If the server is not dead, the region could
803       // remain unassigned. That's why ServerManager#isServerReachable
804       // should use some retry.
805       //
806       // We cache this info since it is very unlikely for that
807       // instance to come back up later on. We don't want to expire
808       // the server since we prefer to let it die naturally.
809       LOG.warn("Couldn't reach online server " + server);
810     }
811     // Now, we know it's dead. Check if it's processed
812     return !processedServers.containsKey(server);
813   }
814 
815  /**
816    * Get the last region server a region was on for purpose of re-assignment,
817    * i.e. should the re-assignment be held back till log split is done?
818    */
819   synchronized ServerName getLastRegionServerOfRegion(final String encodedName) {
820     return lastAssignments.get(encodedName);
821   }
822 
823   synchronized void setLastRegionServerOfRegions(
824       final ServerName serverName, final List<HRegionInfo> regionInfos) {
825     for (HRegionInfo hri: regionInfos) {
826       setLastRegionServerOfRegion(serverName, hri.getEncodedName());
827     }
828   }
829 
830   synchronized void setLastRegionServerOfRegion(
831       final ServerName serverName, final String encodedName) {
832     lastAssignments.put(encodedName, serverName);
833   }
834 
835   synchronized boolean isRegionOnServer(
836       final HRegionInfo hri, final ServerName serverName) {
837     Set<HRegionInfo> regions = serverHoldings.get(serverName);
838     return regions == null ? false : regions.contains(hri);
839   }
840 
841   void splitRegion(HRegionInfo p,
842       HRegionInfo a, HRegionInfo b, ServerName sn) throws IOException {
843 
844     regionStateStore.splitRegion(p, a, b, sn, getRegionReplication(p));
845     synchronized (this) {
846       // After PONR, split is considered to be done.
847       // Update server holdings to be aligned with the meta.
848       Set<HRegionInfo> regions = serverHoldings.get(sn);
849       if (regions == null) {
850         throw new IllegalStateException(sn + " should host some regions");
851       }
852       regions.remove(p);
853       regions.add(a);
854       regions.add(b);
855     }
856   }
857 
858   void mergeRegions(HRegionInfo p,
859       HRegionInfo a, HRegionInfo b, ServerName sn) throws IOException {
860     regionStateStore.mergeRegions(p, a, b, sn, getRegionReplication(a));
861     synchronized (this) {
862       // After PONR, merge is considered to be done.
863       // Update server holdings to be aligned with the meta.
864       Set<HRegionInfo> regions = serverHoldings.get(sn);
865       if (regions == null) {
866         throw new IllegalStateException(sn + " should host some regions");
867       }
868       regions.remove(a);
869       regions.remove(b);
870       regions.add(p);
871     }
872   }
873 
874   private int getRegionReplication(HRegionInfo r) throws IOException {
875     if (tableStateManager != null) {
876       HTableDescriptor htd = server.getTableDescriptors().get(r.getTable());
877       if (htd != null) {
878         return htd.getRegionReplication();
879       }
880     }
881     return 1;
882   }
883 
884   /**
885    * At cluster clean re/start, mark all user regions closed except those of tables
886    * that are excluded, such as disabled/disabling/enabling tables. All user regions
887    * and their previous locations are returned.
888    */
889   synchronized Map<HRegionInfo, ServerName> closeAllUserRegions(Set<TableName> excludedTables) {
890     boolean noExcludeTables = excludedTables == null || excludedTables.isEmpty();
891     Set<HRegionInfo> toBeClosed = new HashSet<HRegionInfo>(regionStates.size());
892     for(RegionState state: regionStates.values()) {
893       HRegionInfo hri = state.getRegion();
894       if (state.isSplit() || hri.isSplit()) {
895         continue;
896       }
897       TableName tableName = hri.getTable();
898       if (!TableName.META_TABLE_NAME.equals(tableName)
899           && (noExcludeTables || !excludedTables.contains(tableName))) {
900         toBeClosed.add(hri);
901       }
902     }
903     Map<HRegionInfo, ServerName> allUserRegions =
904       new HashMap<HRegionInfo, ServerName>(toBeClosed.size());
905     for (HRegionInfo hri: toBeClosed) {
906       RegionState regionState = updateRegionState(hri, State.CLOSED);
907       allUserRegions.put(hri, regionState.getServerName());
908     }
909     return allUserRegions;
910   }
911 
912   /**
913    * Compute the average load across all region servers.
914    * Currently, this uses a very naive computation - just uses the number of
915    * regions being served, ignoring stats about number of requests.
916    * @return the average load
917    */
918   protected synchronized double getAverageLoad() {
919     int numServers = 0, totalLoad = 0;
920     for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
921       Set<HRegionInfo> regions = e.getValue();
922       ServerName serverName = e.getKey();
923       int regionCount = regions.size();
924       if (serverManager.isServerOnline(serverName)) {
925         totalLoad += regionCount;
926         numServers++;
927       }
928     }
929     if (numServers > 1) {
930       // The master region server holds only a couple regions.
931       // Don't consider this server in calculating the average load
932       // if there are other region servers to avoid possible confusion.
933       Set<HRegionInfo> hris = serverHoldings.get(server.getServerName());
934       if (hris != null) {
935         totalLoad -= hris.size();
936         numServers--;
937       }
938     }
939     return numServers == 0 ? 0.0 :
940       (double)totalLoad / (double)numServers;
941   }
942 
943   /**
944    * This is an EXPENSIVE clone.  Cloning though is the safest thing to do.
945    * Can't let out original since it can change and at least the load balancer
946    * wants to iterate this exported list.  We need to synchronize on regions
947    * since all access to this.servers is under a lock on this.regions.
948    *
949    * @return A clone of current assignments by table.
950    */
951   protected Map<TableName, Map<ServerName, List<HRegionInfo>>>
952       getAssignmentsByTable() {
953     Map<TableName, Map<ServerName, List<HRegionInfo>>> result =
954       new HashMap<TableName, Map<ServerName,List<HRegionInfo>>>();
955     synchronized (this) {
956       if (!server.getConfiguration().getBoolean("hbase.master.loadbalance.bytable", false)) {
957         Map<ServerName, List<HRegionInfo>> svrToRegions =
958           new HashMap<ServerName, List<HRegionInfo>>(serverHoldings.size());
959         for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
960           svrToRegions.put(e.getKey(), new ArrayList<HRegionInfo>(e.getValue()));
961         }
962         result.put(TableName.valueOf("ensemble"), svrToRegions);
963       } else {
964         for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
965           for (HRegionInfo hri: e.getValue()) {
966             if (hri.isMetaRegion()) continue;
967             TableName tablename = hri.getTable();
968             Map<ServerName, List<HRegionInfo>> svrToRegions = result.get(tablename);
969             if (svrToRegions == null) {
970               svrToRegions = new HashMap<ServerName, List<HRegionInfo>>(serverHoldings.size());
971               result.put(tablename, svrToRegions);
972             }
973             List<HRegionInfo> regions = svrToRegions.get(e.getKey());
974             if (regions == null) {
975               regions = new ArrayList<HRegionInfo>();
976               svrToRegions.put(e.getKey(), regions);
977             }
978             regions.add(hri);
979           }
980         }
981       }
982     }
983 
984     Map<ServerName, ServerLoad>
985       onlineSvrs = serverManager.getOnlineServers();
986     // Take care of servers w/o assignments, and remove servers in draining mode
987     List<ServerName> drainingServers = this.serverManager.getDrainingServersList();
988     for (Map<ServerName, List<HRegionInfo>> map: result.values()) {
989       for (ServerName svr: onlineSvrs.keySet()) {
990         if (!map.containsKey(svr)) {
991           map.put(svr, new ArrayList<HRegionInfo>());
992         }
993       }
994       map.keySet().removeAll(drainingServers);
995     }
996     return result;
997   }
998 
999   protected RegionState getRegionState(final HRegionInfo hri) {
1000     return getRegionState(hri.getEncodedName());
1001   }
1002 
1003   /**
1004    * Returns a clone of region assignments per server
1005    * @return a Map of ServerName to a List of HRegionInfo's
1006    */
1007   protected synchronized Map<ServerName, List<HRegionInfo>> getRegionAssignmentsByServer() {
1008     Map<ServerName, List<HRegionInfo>> regionsByServer =
1009         new HashMap<ServerName, List<HRegionInfo>>(serverHoldings.size());
1010     for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
1011       regionsByServer.put(e.getKey(), new ArrayList<HRegionInfo>(e.getValue()));
1012     }
1013     return regionsByServer;
1014   }
1015 
1016   protected synchronized RegionState getRegionState(final String encodedName) {
1017     return regionStates.get(encodedName);
1018   }
1019 
1020   /**
1021    * Get the HRegionInfo from cache, if not there, from the hbase:meta table.
1022    * Be careful. Does RPC. Do not hold a lock or synchronize when you call this method.
1023    * @param  regionName
1024    * @return HRegionInfo for the region
1025    */
1026   @SuppressWarnings("deprecation")
1027   protected HRegionInfo getRegionInfo(final byte [] regionName) {
1028     String encodedName = HRegionInfo.encodeRegionName(regionName);
1029     RegionState regionState = getRegionState(encodedName);
1030     if (regionState != null) {
1031       return regionState.getRegion();
1032     }
1033 
1034     try {
1035       Pair<HRegionInfo, ServerName> p =
1036         MetaTableAccessor.getRegion(server.getConnection(), regionName);
1037       HRegionInfo hri = p == null ? null : p.getFirst();
1038       if (hri != null) {
1039         createRegionState(hri);
1040       }
1041       return hri;
1042     } catch (IOException e) {
1043       server.abort("Aborting because error occoured while reading "
1044         + Bytes.toStringBinary(regionName) + " from hbase:meta", e);
1045       return null;
1046     }
1047   }
1048 
1049   static boolean isOneOfStates(RegionState regionState, State... states) {
1050     State s = regionState != null ? regionState.getState() : null;
1051     for (State state: states) {
1052       if (s == state) return true;
1053     }
1054     return false;
1055   }
1056 
1057   /**
1058    * Update a region state. It will be put in transition if not already there.
1059    */
1060   private RegionState updateRegionState(final HRegionInfo hri,
1061       final RegionState.State state, final ServerName serverName, long openSeqNum) {
1062     if (state == RegionState.State.FAILED_CLOSE || state == RegionState.State.FAILED_OPEN) {
1063       LOG.warn("Failed to open/close " + hri.getShortNameToLog()
1064         + " on " + serverName + ", set to " + state);
1065     }
1066 
1067     String encodedName = hri.getEncodedName();
1068     RegionState regionState = new RegionState(
1069       hri, state, System.currentTimeMillis(), serverName);
1070     RegionState oldState = getRegionState(encodedName);
1071     if (!regionState.equals(oldState)) {
1072       LOG.info("Transition " + oldState + " to " + regionState);
1073       // Persist region state before updating in-memory info, if needed
1074       regionStateStore.updateRegionState(openSeqNum, regionState, oldState);
1075     }
1076 
1077     synchronized (this) {
1078       regionsInTransition.put(encodedName, regionState);
1079       putRegionState(regionState);
1080 
1081       // For these states, region should be properly closed.
1082       // There should be no log splitting issue.
1083       if ((state == State.CLOSED || state == State.MERGED
1084           || state == State.SPLIT) && lastAssignments.containsKey(encodedName)) {
1085         ServerName last = lastAssignments.get(encodedName);
1086         if (last.equals(serverName)) {
1087           lastAssignments.remove(encodedName);
1088         } else {
1089           LOG.warn(encodedName + " moved to " + state + " on "
1090             + serverName + ", expected " + last);
1091         }
1092       }
1093 
1094       // Once a region is opened, record its last assignment right away.
1095       if (serverName != null && state == State.OPEN) {
1096         ServerName last = lastAssignments.get(encodedName);
1097         if (!serverName.equals(last)) {
1098           lastAssignments.put(encodedName, serverName);
1099           if (last != null && isServerDeadAndNotProcessed(last)) {
1100             LOG.warn(encodedName + " moved to " + serverName
1101               + ", while it's previous host " + last
1102               + " is dead but not processed yet");
1103           }
1104         }
1105       }
1106 
1107       // notify the change
1108       this.notifyAll();
1109     }
1110     return regionState;
1111   }
1112 }