View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.master;
19  
20  import java.io.IOException;
21  import java.util.ArrayList;
22  import java.util.Collection;
23  import java.util.Collections;
24  import java.util.HashMap;
25  import java.util.HashSet;
26  import java.util.Iterator;
27  import java.util.List;
28  import java.util.Map;
29  import java.util.Set;
30  import java.util.TreeMap;
31  
32  import com.google.common.annotations.VisibleForTesting;
33  import com.google.common.base.Preconditions;
34  
35  import org.apache.commons.logging.Log;
36  import org.apache.commons.logging.LogFactory;
37  import org.apache.hadoop.hbase.classification.InterfaceAudience;
38  import org.apache.hadoop.conf.Configuration;
39  import org.apache.hadoop.hbase.HConstants;
40  import org.apache.hadoop.hbase.HRegionInfo;
41  import org.apache.hadoop.hbase.HTableDescriptor;
42  import org.apache.hadoop.hbase.MetaTableAccessor;
43  import org.apache.hadoop.hbase.ServerLoad;
44  import org.apache.hadoop.hbase.ServerName;
45  import org.apache.hadoop.hbase.TableName;
46  import org.apache.hadoop.hbase.client.RegionReplicaUtil;
47  import org.apache.hadoop.hbase.master.RegionState.State;
48  import org.apache.hadoop.hbase.client.TableState;
49  import org.apache.hadoop.hbase.util.Bytes;
50  import org.apache.hadoop.hbase.util.FSUtils;
51  import org.apache.hadoop.hbase.util.Pair;
52  
53  /**
54   * Region state accountant. It holds the states of all regions in the memory.
55   * In normal scenario, it should match the meta table and the true region states.
56   *
57   * This map is used by AssignmentManager to track region states.
58   */
59  @InterfaceAudience.Private
60  public class RegionStates {
  private static final Log LOG = LogFactory.getLog(RegionStates.class);

  /**
   * Regions currently in transition, keyed by encoded region name.
   */
  final HashMap<String, RegionState> regionsInTransition =
    new HashMap<String, RegionState>();

  /**
   * Encoded region name to region state map.
   * All the regions should be in this map.
   */
  private final Map<String, RegionState> regionStates =
    new HashMap<String, RegionState>();

  /**
   * Per-table index over the region states:
   * table name -> (encoded region name -> region state).
   */
  private final Map<TableName, Map<String, RegionState>> regionStatesTableIndex =
      new HashMap<TableName, Map<String, RegionState>>();

  /**
   * Server to regions assignment map.
   * Contains the set of regions currently assigned to a given server.
   */
  private final Map<ServerName, Set<HRegionInfo>> serverHoldings =
    new HashMap<ServerName, Set<HRegionInfo>>();

  /**
   * Maps the default replica of a region to the set of its replica regions
   * that have been brought online.
   */
  private final Map<HRegionInfo, Set<HRegionInfo>> defaultReplicaToOtherReplicas =
    new HashMap<HRegionInfo, Set<HRegionInfo>>();

  /**
   * Region to server assignment map.
   * Contains the server a given region is currently assigned to.
   * Kept sorted so that the regions of one table can be range-scanned.
   */
  private final TreeMap<HRegionInfo, ServerName> regionAssignments =
    new TreeMap<HRegionInfo, ServerName>();

  /**
   * Encoded region name to server assignment map for re-assignment
   * purposes. Contains the server a given region was last known to be
   * assigned to and whose log splitting has not completed, so the region
   * is not yet assignable.
   * If a region is currently assigned, the server info in this
   * map should be the same as that in regionAssignments.
   * However, the info in regionAssignments is cleared when the region
   * goes offline, while the info in lastAssignments is cleared when
   * the region is closed or the server is dead and processed.
   */
  private final HashMap<String, ServerName> lastAssignments =
    new HashMap<String, ServerName>();

  /**
   * Encoded region name to server assignment map used to clean up
   * serverHoldings when a region comes online on a new server.
   * When the region goes offline from the previous server, we clean up
   * regionAssignments so that it has the latest assignment map, but we
   * do not clean up serverHoldings to match the meta. We need this map
   * to find the old server whose serverHoldings needs cleanup, given a
   * moved region.
   */
  private final HashMap<String, ServerName> oldAssignments =
    new HashMap<String, ServerName>();

  /**
   * Maps a host-and-port string to the latest start code
   * of a region server which is known to be dead. It is dead
   * to us, but the server manager may not know it yet.
   */
  private final HashMap<String, Long> deadServers =
    new HashMap<String, Long>();

  /**
   * Maps a dead server to the time when its log split was done.
   * Since log splitting is not ordered, we have to remember
   * all processed instances. The map is cleaned up based
   * on a configured time. By default, we assume a dead
   * server should be done with log splitting in two hours.
   */
  private final HashMap<ServerName, Long> processedServers =
    new HashMap<ServerName, Long>();
  // Wall-clock time (ms) of the last sweep of expired entries out of processedServers.
  private long lastProcessedServerCleanTime;

  private final TableStateManager tableStateManager;
  private final RegionStateStore regionStateStore;
  private final ServerManager serverManager;
  private final MasterServices server;

  // Configuration key: the maximum time (ms) to keep a log split record in processedServers
  static final String LOG_SPLIT_TIME = "hbase.master.maximum.logsplit.keeptime";
  static final long DEFAULT_LOG_SPLIT_TIME = 7200000L; // 2 hours
154 
155   RegionStates(final MasterServices master, final TableStateManager tableStateManager,
156       final ServerManager serverManager, final RegionStateStore regionStateStore) {
157     this.tableStateManager = tableStateManager;
158     this.regionStateStore = regionStateStore;
159     this.serverManager = serverManager;
160     this.server = master;
161   }
162 
163   /**
164    * @return an unmodifiable the region assignment map
165    */
166   public synchronized Map<HRegionInfo, ServerName> getRegionAssignments() {
167     return Collections.unmodifiableMap(regionAssignments);
168   }
169 
170   /**
171    * Return the replicas (including default) for the regions grouped by ServerName
172    * @param regions
173    * @return a pair containing the groupings as a map
174    */
175   synchronized Map<ServerName, List<HRegionInfo>> getRegionAssignments(
176     Collection<HRegionInfo> regions) {
177     Map<ServerName, List<HRegionInfo>> map = new HashMap<ServerName, List<HRegionInfo>>();
178     for (HRegionInfo region : regions) {
179       HRegionInfo defaultReplica = RegionReplicaUtil.getRegionInfoForDefaultReplica(region);
180       Set<HRegionInfo> allReplicas = defaultReplicaToOtherReplicas.get(defaultReplica);
181       if (allReplicas != null) {
182         for (HRegionInfo hri : allReplicas) {
183           ServerName server = regionAssignments.get(hri);
184           if (server != null) {
185             List<HRegionInfo> regionsOnServer = map.get(server);
186             if (regionsOnServer == null) {
187               regionsOnServer = new ArrayList<HRegionInfo>(1);
188               map.put(server, regionsOnServer);
189             }
190             regionsOnServer.add(hri);
191           }
192         }
193       }
194     }
195     return map;
196   }
197 
198   public synchronized ServerName getRegionServerOfRegion(HRegionInfo hri) {
199     return regionAssignments.get(hri);
200   }
201 
202   /**
203    * Get regions in transition and their states
204    */
205   @SuppressWarnings("unchecked")
206   public synchronized Map<String, RegionState> getRegionsInTransition() {
207     return (Map<String, RegionState>)regionsInTransition.clone();
208   }
209 
210   /**
211    * @return True if specified region in transition.
212    */
213   public synchronized boolean isRegionInTransition(final HRegionInfo hri) {
214     return regionsInTransition.containsKey(hri.getEncodedName());
215   }
216 
217   /**
218    * @return True if specified region in transition.
219    */
220   public synchronized boolean isRegionInTransition(final String encodedName) {
221     return regionsInTransition.containsKey(encodedName);
222   }
223 
224   /**
225    * @return True if any region in transition.
226    */
227   public synchronized boolean isRegionsInTransition() {
228     return !regionsInTransition.isEmpty();
229   }
230 
231   /**
232    * @return True if specified region assigned, and not in transition.
233    */
234   public synchronized boolean isRegionOnline(final HRegionInfo hri) {
235     return !isRegionInTransition(hri) && regionAssignments.containsKey(hri);
236   }
237 
238   /**
239    * @return True if specified region offline/closed, but not in transition.
240    * If the region is not in the map, it is offline to us too.
241    */
242   public synchronized boolean isRegionOffline(final HRegionInfo hri) {
243     return getRegionState(hri) == null || (!isRegionInTransition(hri)
244       && isRegionInState(hri, State.OFFLINE, State.CLOSED));
245   }
246 
247   /**
248    * @return True if specified region is in one of the specified states.
249    */
250   public boolean isRegionInState(
251       final HRegionInfo hri, final State... states) {
252     return isRegionInState(hri.getEncodedName(), states);
253   }
254 
255   /**
256    * @return True if specified region is in one of the specified states.
257    */
258   public boolean isRegionInState(
259       final String encodedName, final State... states) {
260     RegionState regionState = getRegionState(encodedName);
261     return isOneOfStates(regionState, states);
262   }
263 
264   /**
265    * Wait for the state map to be updated by assignment manager.
266    */
267   public synchronized void waitForUpdate(
268       final long timeout) throws InterruptedException {
269     this.wait(timeout);
270   }
271 
272   /**
273    * Get region transition state
274    */
275   public RegionState getRegionTransitionState(final HRegionInfo hri) {
276     return getRegionTransitionState(hri.getEncodedName());
277   }
278 
279   /**
280    * Get region transition state
281    */
282   public synchronized RegionState
283       getRegionTransitionState(final String encodedName) {
284     return regionsInTransition.get(encodedName);
285   }
286 
287   /**
288    * Add a list of regions to RegionStates. If a region is split
289    * and offline, its state will be SPLIT. Otherwise, its state will
290    * be OFFLINE. Region already in RegionStates will be skipped.
291    */
292   public void createRegionStates(
293       final List<HRegionInfo> hris) {
294     for (HRegionInfo hri: hris) {
295       createRegionState(hri);
296     }
297   }
298 
299   /**
300    * Add a region to RegionStates. If the region is split
301    * and offline, its state will be SPLIT. Otherwise, its state will
302    * be OFFLINE. If it is already in RegionStates, this call has
303    * no effect, and the original state is returned.
304    */
305   public RegionState createRegionState(final HRegionInfo hri) {
306     return createRegionState(hri, null, null, null);
307   }
308 
  /**
   * Add a region to RegionStates with the specified state.
   * If the region is already in RegionStates, this call has
   * no effect (other than a warning), and the original state is returned.
   * <p>
   * The requested state is normalized first: a null state, or OPEN without
   * a server, becomes OFFLINE; a region that is both offline and split is
   * always recorded as SPLIT with no server.
   *
   * @param hri the region info to create a state for
   * @param newState the state to set the region to; may be null
   * @param serverName the server the region is transitioning on
   * @param lastHost the last server that hosts the region
   * @return the current state
   */
  public synchronized RegionState createRegionState(final HRegionInfo hri,
      State newState, ServerName serverName, ServerName lastHost) {
    // An OPEN region must have a hosting server; without one fall back to OFFLINE.
    if (newState == null || (newState == State.OPEN && serverName == null)) {
      newState =  State.OFFLINE;
    }
    // A split parent that is offline overrides whatever state was requested.
    if (hri.isOffline() && hri.isSplit()) {
      newState = State.SPLIT;
      serverName = null;
    }
    String encodedName = hri.getEncodedName();
    RegionState regionState = regionStates.get(encodedName);
    if (regionState != null) {
      LOG.warn("Tried to create a state for a region already in RegionStates, "
        + "used existing: " + regionState + ", ignored new: " + newState);
    } else {
      regionState = new RegionState(hri, newState, serverName);
      putRegionState(regionState);
      if (newState == State.OPEN) {
        // serverName is non-null here: OPEN with a null server was downgraded above.
        if (!serverName.equals(lastHost)) {
          LOG.warn("Open region's last host " + lastHost
            + " should be the same as the current one " + serverName
            + ", ignored the last and used the current one");
          lastHost = serverName;
        }
        lastAssignments.put(encodedName, lastHost);
        regionAssignments.put(hri, lastHost);
      } else if (!isOneOfStates(regionState, State.MERGED, State.SPLIT, State.OFFLINE)) {
        // Any state other than OPEN/MERGED/SPLIT/OFFLINE is tracked as in transition.
        regionsInTransition.put(encodedName, regionState);
      }
      if (lastHost != null && newState != State.SPLIT) {
        addToServerHoldings(lastHost, hri);
        if (newState != State.OPEN) {
          // Remember the holder so regionOnline can remove the region from its
          // holdings once the region comes online on a different server.
          oldAssignments.put(encodedName, lastHost);
        }
      }
    }
    return regionState;
  }
358 
359   private RegionState putRegionState(RegionState regionState) {
360     HRegionInfo hri = regionState.getRegion();
361     String encodedName = hri.getEncodedName();
362     TableName table = hri.getTable();
363     RegionState oldState = regionStates.put(encodedName, regionState);
364     Map<String, RegionState> map = regionStatesTableIndex.get(table);
365     if (map == null) {
366       map = new HashMap<String, RegionState>();
367       regionStatesTableIndex.put(table, map);
368     }
369     map.put(encodedName, regionState);
370     return oldState;
371   }
372 
373   /**
374    * Update a region state. It will be put in transition if not already there.
375    */
376   public RegionState updateRegionState(
377       final HRegionInfo hri, final State state) {
378     RegionState regionState = getRegionState(hri.getEncodedName());
379     return updateRegionState(hri, state,
380       regionState == null ? null : regionState.getServerName());
381   }
382 
383   /**
384    * Update a region state. It will be put in transition if not already there.
385    */
386   public RegionState updateRegionState(
387       final HRegionInfo hri, final State state, final ServerName serverName) {
388     return updateRegionState(hri, state, serverName, HConstants.NO_SEQNUM);
389   }
390 
391   public void regionOnline(
392       final HRegionInfo hri, final ServerName serverName) {
393     regionOnline(hri, serverName, HConstants.NO_SEQNUM);
394   }
395 
396   /**
397    * A region is online, won't be in transition any more.
398    * We can't confirm it is really online on specified region server
399    * because it hasn't been put in region server's online region list yet.
400    */
401   public void regionOnline(final HRegionInfo hri,
402       final ServerName serverName, long openSeqNum) {
403     String encodedName = hri.getEncodedName();
404     if (!serverManager.isServerOnline(serverName)) {
405       // This is possible if the region server dies before master gets a
406       // chance to handle ZK event in time. At this time, if the dead server
407       // is already processed by SSH, we should ignore this event.
408       // If not processed yet, ignore and let SSH deal with it.
409       LOG.warn("Ignored, " + encodedName
410         + " was opened on a dead server: " + serverName);
411       return;
412     }
413     updateRegionState(hri, State.OPEN, serverName, openSeqNum);
414 
415     synchronized (this) {
416       regionsInTransition.remove(encodedName);
417       ServerName oldServerName = regionAssignments.put(hri, serverName);
418       if (!serverName.equals(oldServerName)) {
419         if (LOG.isDebugEnabled()) {
420           LOG.debug("Onlined " + hri.getShortNameToLog() + " on " + serverName + " " + hri);
421         } else {
422           LOG.debug("Onlined " + hri.getShortNameToLog() + " on " + serverName);
423         }
424         addToServerHoldings(serverName, hri);
425         addToReplicaMapping(hri);
426         if (oldServerName == null) {
427           oldServerName = oldAssignments.remove(encodedName);
428         }
429         if (oldServerName != null
430             && !oldServerName.equals(serverName)
431             && serverHoldings.containsKey(oldServerName)) {
432           LOG.info("Offlined " + hri.getShortNameToLog() + " from " + oldServerName);
433           removeFromServerHoldings(oldServerName, hri);
434         }
435       }
436     }
437   }
438 
439   private void addToServerHoldings(ServerName serverName, HRegionInfo hri) {
440     Set<HRegionInfo> regions = serverHoldings.get(serverName);
441     if (regions == null) {
442       regions = new HashSet<HRegionInfo>();
443       serverHoldings.put(serverName, regions);
444     }
445     regions.add(hri);
446   }
447 
448   private void addToReplicaMapping(HRegionInfo hri) {
449     HRegionInfo defaultReplica = RegionReplicaUtil.getRegionInfoForDefaultReplica(hri);
450     Set<HRegionInfo> replicas =
451         defaultReplicaToOtherReplicas.get(defaultReplica);
452     if (replicas == null) {
453       replicas = new HashSet<HRegionInfo>();
454       defaultReplicaToOtherReplicas.put(defaultReplica, replicas);
455     }
456     replicas.add(hri);
457   }
458 
459   private void removeFromServerHoldings(ServerName serverName, HRegionInfo hri) {
460     Set<HRegionInfo> oldRegions = serverHoldings.get(serverName);
461     oldRegions.remove(hri);
462     if (oldRegions.isEmpty()) {
463       serverHoldings.remove(serverName);
464     }
465   }
466 
467   private void removeFromReplicaMapping(HRegionInfo hri) {
468     HRegionInfo defaultReplica = RegionReplicaUtil.getRegionInfoForDefaultReplica(hri);
469     Set<HRegionInfo> replicas = defaultReplicaToOtherReplicas.get(defaultReplica);
470     if (replicas != null) {
471       replicas.remove(hri);
472       if (replicas.isEmpty()) {
473         defaultReplicaToOtherReplicas.remove(defaultReplica);
474       }
475     }
476   }
477 
478   /**
479    * A dead server's wals have been split so that all the regions
480    * used to be open on it can be safely assigned now. Mark them assignable.
481    */
482   public synchronized void logSplit(final ServerName serverName) {
483     for (Iterator<Map.Entry<String, ServerName>> it
484         = lastAssignments.entrySet().iterator(); it.hasNext();) {
485       Map.Entry<String, ServerName> e = it.next();
486       if (e.getValue().equals(serverName)) {
487         it.remove();
488       }
489     }
490     long now = System.currentTimeMillis();
491     if (LOG.isDebugEnabled()) {
492       LOG.debug("Adding to processed servers " + serverName);
493     }
494     processedServers.put(serverName, Long.valueOf(now));
495     Configuration conf = server.getConfiguration();
496     long obsoleteTime = conf.getLong(LOG_SPLIT_TIME, DEFAULT_LOG_SPLIT_TIME);
497     // Doesn't have to be very accurate about the clean up time
498     if (now > lastProcessedServerCleanTime + obsoleteTime) {
499       lastProcessedServerCleanTime = now;
500       long cutoff = now - obsoleteTime;
501       for (Iterator<Map.Entry<ServerName, Long>> it
502           = processedServers.entrySet().iterator(); it.hasNext();) {
503         Map.Entry<ServerName, Long> e = it.next();
504         if (e.getValue().longValue() < cutoff) {
505           if (LOG.isDebugEnabled()) {
506             LOG.debug("Removed from processed servers " + e.getKey());
507           }
508           it.remove();
509         }
510       }
511     }
512   }
513 
514   /**
515    * Log split is done for a given region, so it is assignable now.
516    */
517   public void logSplit(final HRegionInfo region) {
518     clearLastAssignment(region);
519   }
520 
521   public synchronized void clearLastAssignment(final HRegionInfo region) {
522     lastAssignments.remove(region.getEncodedName());
523   }
524 
525   /**
526    * A region is offline, won't be in transition any more.
527    */
528   public void regionOffline(final HRegionInfo hri) {
529     regionOffline(hri, null);
530   }
531 
  /**
   * A region is offline, won't be in transition any more. Its state
   * should be the specified expected state, which can only be
   * Split/Merged/Offline/null(=Offline)/SplittingNew/MergingNew.
   * <p>
   * A region currently in SPLITTING_NEW or MERGING_NEW state is deleted
   * from all region maps instead of being marked offline.
   *
   * @param hri the region that went offline
   * @param expectedState the state to record for the region; null means OFFLINE
   * @throws IllegalArgumentException if expectedState is non-null and is
   * rejected by {@code RegionState.isUnassignable}
   */
  public void regionOffline(
      final HRegionInfo hri, final State expectedState) {
    Preconditions.checkArgument(expectedState == null
      || RegionState.isUnassignable(expectedState),
        "Offlined region should not be " + expectedState);
    if (isRegionInState(hri, State.SPLITTING_NEW, State.MERGING_NEW)) {
      // Remove it from all region maps
      deleteRegion(hri);
      return;
    }
    State newState =
      expectedState == null ? State.OFFLINE : expectedState;
    updateRegionState(hri, newState);
    String encodedName = hri.getEncodedName();
    synchronized (this) {
      regionsInTransition.remove(encodedName);
      ServerName oldServerName = regionAssignments.remove(hri);
      if (oldServerName != null && serverHoldings.containsKey(oldServerName)) {
        if (newState == State.MERGED || newState == State.SPLIT
            || hri.isMetaRegion() || tableStateManager.isTableState(hri.getTable(),
              TableState.State.DISABLED, TableState.State.DISABLING)) {
          // Offline the region from its server right away only if it's merged/split,
          // it is the meta region, or the table is disabled/disabling.
          // Otherwise, offline it from this server only when it is online on a different server.
          LOG.info("Offlined " + hri.getShortNameToLog() + " from " + oldServerName);
          removeFromServerHoldings(oldServerName, hri);
          removeFromReplicaMapping(hri);
        } else {
          // Need to remember it so that we can offline it from this
          // server when it is online on a different server.
          oldAssignments.put(encodedName, oldServerName);
        }
      }
    }
  }
571 
  /**
   * A server is offline, all regions on it are dead.
   * <p>
   * Regions held by the server that are online, or that are splitting or
   * merging, are marked offline. Regions in transition towards the server
   * are sorted into those to be reassigned (returned to the caller) and
   * SPLITTING_NEW regions, which are cleaned up if they have no meta entry.
   * Waiters on this instance are notified.
   *
   * @param sn the server that went offline
   * @return the regions in transition on the server that need to be reassigned
   */
  public List<HRegionInfo> serverOffline(final ServerName sn) {
    // Offline all regions on this server not already in transition.
    List<HRegionInfo> rits = new ArrayList<HRegionInfo>();
    Set<HRegionInfo> regionsToCleanIfNoMetaEntry = new HashSet<HRegionInfo>();
    // Offline regions outside the loop and the synchronized block to avoid
    // a ConcurrentModificationException, and to avoid a deadlock in case
    // meta is unassigned while RegionStates is blocked.
    Set<HRegionInfo> regionsToOffline = new HashSet<HRegionInfo>();
    synchronized (this) {
      Set<HRegionInfo> assignedRegions = serverHoldings.get(sn);
      if (assignedRegions == null) {
        // Nothing is held by this server; use an empty set so the loops below still work.
        assignedRegions = new HashSet<HRegionInfo>();
      }

      for (HRegionInfo region : assignedRegions) {
        // Offline open regions, no need to offline if SPLIT/MERGED/OFFLINE
        if (isRegionOnline(region)) {
          regionsToOffline.add(region);
        } else if (isRegionInState(region, State.SPLITTING, State.MERGING)) {
          LOG.debug("Offline splitting/merging region " + getRegionState(region));
          regionsToOffline.add(region);
        }
      }

      for (RegionState state : regionsInTransition.values()) {
        HRegionInfo hri = state.getRegion();
        if (assignedRegions.contains(hri)) {
          // Region is open on this region server, but in transition.
          // This region must be moving away from this server, or splitting/merging.
          // SSH will handle it, either skip assigning, or re-assign.
          LOG.info("Transitioning " + state + " will be handled by SSH for " + sn);
        } else if (sn.equals(state.getServerName())) {
          // Region is in transition on this region server, and this
          // region is not open on this server. So the region must be
          // moving to this server from another one (i.e. opening or
          // pending open on this server, was open on another one).
          // Offline state is also kind of pending open if the region is in
          // transition. The region could be in failed_close state too if we have
          // tried several times to close it while this region server is not reachable.
          if (isOneOfStates(state, State.OPENING, State.PENDING_OPEN,
              State.FAILED_OPEN, State.FAILED_CLOSE, State.OFFLINE)) {
            LOG.info("Found region in " + state + " to be reassigned by SSH for " + sn);
            rits.add(hri);
          } else if (isOneOfStates(state, State.SPLITTING_NEW)) {
            // Collected here and checked against meta after the lock is released.
            regionsToCleanIfNoMetaEntry.add(state.getRegion());
          } else {
            LOG.warn("THIS SHOULD NOT HAPPEN: unexpected " + state);
          }
        }
      }
      this.notifyAll();
    }

    // regionOffline modifies serverHoldings, so it must run after the iteration above.
    for (HRegionInfo hri : regionsToOffline) {
      regionOffline(hri);
    }

    cleanIfNoMetaEntry(regionsToCleanIfNoMetaEntry);
    return rits;
  }
635 
636   /**
637    * This method does an RPC to hbase:meta. Do not call this method with a lock/synchronize held.
638    * @param hris The hris to check if empty in hbase:meta and if so, clean them up.
639    */
640   private void cleanIfNoMetaEntry(Set<HRegionInfo> hris) {
641     if (hris.isEmpty()) return;
642     for (HRegionInfo hri: hris) {
643       try {
644         // This is RPC to meta table. It is done while we have a synchronize on
645         // regionstates. No progress will be made if meta is not available at this time.
646         // This is a cleanup task. Not critical.
647         if (MetaTableAccessor.getRegion(server.getConnection(), hri.getEncodedNameAsBytes()) ==
648             null) {
649           regionOffline(hri);
650           FSUtils.deleteRegionDir(server.getConfiguration(), hri);
651         }
652       } catch (IOException e) {
653         LOG.warn("Got exception while deleting " + hri + " directories from file system.", e);
654       }
655     }
656   }
657 
658   /**
659    * Gets the online regions of the specified table.
660    * This method looks at the in-memory state.  It does not go to <code>hbase:meta</code>.
661    * Only returns <em>online</em> regions.  If a region on this table has been
662    * closed during a disable, etc., it will be included in the returned list.
663    * So, the returned list may not necessarily be ALL regions in this table, its
664    * all the ONLINE regions in the table.
665    * @param tableName
666    * @return Online regions from <code>tableName</code>
667    */
668   public synchronized List<HRegionInfo> getRegionsOfTable(TableName tableName) {
669     List<HRegionInfo> tableRegions = new ArrayList<HRegionInfo>();
670     // boundary needs to have table's name but regionID 0 so that it is sorted
671     // before all table's regions.
672     HRegionInfo boundary = new HRegionInfo(tableName, null, null, false, 0L);
673     for (HRegionInfo hri: regionAssignments.tailMap(boundary).keySet()) {
674       if(!hri.getTable().equals(tableName)) break;
675       tableRegions.add(hri);
676     }
677     return tableRegions;
678   }
679 
680   /**
681    * Gets current state of all regions of the table.
682    * This method looks at the in-memory state.  It does not go to <code>hbase:meta</code>.
683    * Method guaranteed to return keys for all states
684    * in {@link org.apache.hadoop.hbase.master.RegionState.State}
685    *
686    * @param tableName
687    * @return Online regions from <code>tableName</code>
688    */
689   public synchronized Map<RegionState.State, List<HRegionInfo>>
690   getRegionByStateOfTable(TableName tableName) {
691     Map<RegionState.State, List<HRegionInfo>> tableRegions =
692         new HashMap<State, List<HRegionInfo>>();
693     for (State state : State.values()) {
694       tableRegions.put(state, new ArrayList<HRegionInfo>());
695     }
696     Map<String, RegionState> indexMap = regionStatesTableIndex.get(tableName);
697     if (indexMap == null)
698       return tableRegions;
699     for (RegionState regionState : indexMap.values()) {
700       tableRegions.get(regionState.getState()).add(regionState.getRegion());
701     }
702     return tableRegions;
703   }
704 
705   /**
706    * Wait on region to clear regions-in-transition.
707    * <p>
708    * If the region isn't in transition, returns immediately.  Otherwise, method
709    * blocks until the region is out of transition.
710    */
711   public synchronized void waitOnRegionToClearRegionsInTransition(
712       final HRegionInfo hri) throws InterruptedException {
713     if (!isRegionInTransition(hri)) return;
714 
715     while(!server.isStopped() && isRegionInTransition(hri)) {
716       RegionState rs = getRegionState(hri);
717       LOG.info("Waiting on " + rs + " to clear regions-in-transition");
718       waitForUpdate(100);
719     }
720 
721     if (server.isStopped()) {
722       LOG.info("Giving up wait on region in " +
723         "transition because stoppable.isStopped is set");
724     }
725   }
726 
727   /**
728    * A table is deleted. Remove its regions from all internal maps.
729    * We loop through all regions assuming we don't delete tables too much.
730    */
731   public void tableDeleted(final TableName tableName) {
732     Set<HRegionInfo> regionsToDelete = new HashSet<HRegionInfo>();
733     synchronized (this) {
734       for (RegionState state: regionStates.values()) {
735         HRegionInfo region = state.getRegion();
736         if (region.getTable().equals(tableName)) {
737           regionsToDelete.add(region);
738         }
739       }
740     }
741     for (HRegionInfo region: regionsToDelete) {
742       deleteRegion(region);
743     }
744   }
745 
746   /**
747    * Get a copy of all regions assigned to a server
748    */
749   public synchronized Set<HRegionInfo> getServerRegions(ServerName serverName) {
750     Set<HRegionInfo> regions = serverHoldings.get(serverName);
751     if (regions == null) return null;
752     return new HashSet<HRegionInfo>(regions);
753   }
754 
755   /**
756    * Remove a region from all state maps.
757    */
758   @VisibleForTesting
759   public synchronized void deleteRegion(final HRegionInfo hri) {
760     String encodedName = hri.getEncodedName();
761     regionsInTransition.remove(encodedName);
762     regionStates.remove(encodedName);
763     TableName table = hri.getTable();
764     Map<String, RegionState> indexMap = regionStatesTableIndex.get(table);
765     indexMap.remove(encodedName);
766     if (indexMap.size() == 0)
767       regionStatesTableIndex.remove(table);
768     lastAssignments.remove(encodedName);
769     ServerName sn = regionAssignments.remove(hri);
770     if (sn != null) {
771       Set<HRegionInfo> regions = serverHoldings.get(sn);
772       regions.remove(hri);
773     }
774   }
775 
776   /**
777    * Checking if a region was assigned to a server which is not online now.
778    * If so, we should hold re-assign this region till SSH has split its wals.
779    * Once logs are split, the last assignment of this region will be reset,
780    * which means a null last assignment server is ok for re-assigning.
781    *
782    * A region server could be dead but we don't know it yet. We may
783    * think it's online falsely. Therefore if a server is online, we still
784    * need to confirm it reachable and having the expected start code.
785    */
786   synchronized boolean wasRegionOnDeadServer(final String encodedName) {
787     ServerName server = lastAssignments.get(encodedName);
788     return isServerDeadAndNotProcessed(server);
789   }
790 
791   synchronized boolean isServerDeadAndNotProcessed(ServerName server) {
792     if (server == null) return false;
793     if (serverManager.isServerOnline(server)) {
794       String hostAndPort = server.getHostAndPort();
795       long startCode = server.getStartcode();
796       Long deadCode = deadServers.get(hostAndPort);
797       if (deadCode == null || startCode > deadCode.longValue()) {
798         if (serverManager.isServerReachable(server)) {
799           return false;
800         }
801         // The size of deadServers won't grow unbounded.
802         deadServers.put(hostAndPort, Long.valueOf(startCode));
803       }
804       // Watch out! If the server is not dead, the region could
805       // remain unassigned. That's why ServerManager#isServerReachable
806       // should use some retry.
807       //
808       // We cache this info since it is very unlikely for that
809       // instance to come back up later on. We don't want to expire
810       // the server since we prefer to let it die naturally.
811       LOG.warn("Couldn't reach online server " + server);
812     }
813     // Now, we know it's dead. Check if it's processed
814     return !processedServers.containsKey(server);
815   }
816 
817  /**
818    * Get the last region server a region was on for purpose of re-assignment,
819    * i.e. should the re-assignment be held back till log split is done?
820    */
821   synchronized ServerName getLastRegionServerOfRegion(final String encodedName) {
822     return lastAssignments.get(encodedName);
823   }
824 
825   synchronized void setLastRegionServerOfRegions(
826       final ServerName serverName, final List<HRegionInfo> regionInfos) {
827     for (HRegionInfo hri: regionInfos) {
828       setLastRegionServerOfRegion(serverName, hri.getEncodedName());
829     }
830   }
831 
832   synchronized void setLastRegionServerOfRegion(
833       final ServerName serverName, final String encodedName) {
834     lastAssignments.put(encodedName, serverName);
835   }
836 
837   synchronized boolean isRegionOnServer(
838       final HRegionInfo hri, final ServerName serverName) {
839     Set<HRegionInfo> regions = serverHoldings.get(serverName);
840     return regions == null ? false : regions.contains(hri);
841   }
842 
843   void splitRegion(HRegionInfo p,
844       HRegionInfo a, HRegionInfo b, ServerName sn) throws IOException {
845 
846     regionStateStore.splitRegion(p, a, b, sn, getRegionReplication(p));
847     synchronized (this) {
848       // After PONR, split is considered to be done.
849       // Update server holdings to be aligned with the meta.
850       Set<HRegionInfo> regions = serverHoldings.get(sn);
851       if (regions == null) {
852         throw new IllegalStateException(sn + " should host some regions");
853       }
854       regions.remove(p);
855       regions.add(a);
856       regions.add(b);
857     }
858   }
859 
860   void mergeRegions(HRegionInfo p,
861       HRegionInfo a, HRegionInfo b, ServerName sn) throws IOException {
862     regionStateStore.mergeRegions(p, a, b, sn, getRegionReplication(a));
863     synchronized (this) {
864       // After PONR, merge is considered to be done.
865       // Update server holdings to be aligned with the meta.
866       Set<HRegionInfo> regions = serverHoldings.get(sn);
867       if (regions == null) {
868         throw new IllegalStateException(sn + " should host some regions");
869       }
870       regions.remove(a);
871       regions.remove(b);
872       regions.add(p);
873     }
874   }
875 
876   private int getRegionReplication(HRegionInfo r) throws IOException {
877     if (tableStateManager != null) {
878       HTableDescriptor htd = server.getTableDescriptors().get(r.getTable());
879       if (htd != null) {
880         return htd.getRegionReplication();
881       }
882     }
883     return 1;
884   }
885 
886   /**
887    * At cluster clean re/start, mark all user regions closed except those of tables
888    * that are excluded, such as disabled/disabling/enabling tables. All user regions
889    * and their previous locations are returned.
890    */
891   synchronized Map<HRegionInfo, ServerName> closeAllUserRegions(Set<TableName> excludedTables) {
892     boolean noExcludeTables = excludedTables == null || excludedTables.isEmpty();
893     Set<HRegionInfo> toBeClosed = new HashSet<HRegionInfo>(regionStates.size());
894     for(RegionState state: regionStates.values()) {
895       HRegionInfo hri = state.getRegion();
896       if (state.isSplit() || hri.isSplit()) {
897         continue;
898       }
899       TableName tableName = hri.getTable();
900       if (!TableName.META_TABLE_NAME.equals(tableName)
901           && (noExcludeTables || !excludedTables.contains(tableName))) {
902         toBeClosed.add(hri);
903       }
904     }
905     Map<HRegionInfo, ServerName> allUserRegions =
906       new HashMap<HRegionInfo, ServerName>(toBeClosed.size());
907     for (HRegionInfo hri: toBeClosed) {
908       RegionState regionState = updateRegionState(hri, State.CLOSED);
909       allUserRegions.put(hri, regionState.getServerName());
910     }
911     return allUserRegions;
912   }
913 
914   /**
915    * Compute the average load across all region servers.
916    * Currently, this uses a very naive computation - just uses the number of
917    * regions being served, ignoring stats about number of requests.
918    * @return the average load
919    */
920   protected synchronized double getAverageLoad() {
921     int numServers = 0, totalLoad = 0;
922     for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
923       Set<HRegionInfo> regions = e.getValue();
924       ServerName serverName = e.getKey();
925       int regionCount = regions.size();
926       if (serverManager.isServerOnline(serverName)) {
927         totalLoad += regionCount;
928         numServers++;
929       }
930     }
931     if (numServers > 1) {
932       // The master region server holds only a couple regions.
933       // Don't consider this server in calculating the average load
934       // if there are other region servers to avoid possible confusion.
935       Set<HRegionInfo> hris = serverHoldings.get(server.getServerName());
936       if (hris != null) {
937         totalLoad -= hris.size();
938         numServers--;
939       }
940     }
941     return numServers == 0 ? 0.0 :
942       (double)totalLoad / (double)numServers;
943   }
944 
945   /**
946    * This is an EXPENSIVE clone.  Cloning though is the safest thing to do.
947    * Can't let out original since it can change and at least the load balancer
948    * wants to iterate this exported list.  We need to synchronize on regions
949    * since all access to this.servers is under a lock on this.regions.
950    *
951    * @return A clone of current assignments by table.
952    */
953   protected Map<TableName, Map<ServerName, List<HRegionInfo>>>
954       getAssignmentsByTable() {
955     Map<TableName, Map<ServerName, List<HRegionInfo>>> result =
956       new HashMap<TableName, Map<ServerName,List<HRegionInfo>>>();
957     synchronized (this) {
958       if (!server.getConfiguration().getBoolean("hbase.master.loadbalance.bytable", false)) {
959         Map<ServerName, List<HRegionInfo>> svrToRegions =
960           new HashMap<ServerName, List<HRegionInfo>>(serverHoldings.size());
961         for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
962           svrToRegions.put(e.getKey(), new ArrayList<HRegionInfo>(e.getValue()));
963         }
964         result.put(TableName.valueOf("ensemble"), svrToRegions);
965       } else {
966         for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
967           for (HRegionInfo hri: e.getValue()) {
968             if (hri.isMetaRegion()) continue;
969             TableName tablename = hri.getTable();
970             Map<ServerName, List<HRegionInfo>> svrToRegions = result.get(tablename);
971             if (svrToRegions == null) {
972               svrToRegions = new HashMap<ServerName, List<HRegionInfo>>(serverHoldings.size());
973               result.put(tablename, svrToRegions);
974             }
975             List<HRegionInfo> regions = svrToRegions.get(e.getKey());
976             if (regions == null) {
977               regions = new ArrayList<HRegionInfo>();
978               svrToRegions.put(e.getKey(), regions);
979             }
980             regions.add(hri);
981           }
982         }
983       }
984     }
985 
986     Map<ServerName, ServerLoad>
987       onlineSvrs = serverManager.getOnlineServers();
988     // Take care of servers w/o assignments, and remove servers in draining mode
989     List<ServerName> drainingServers = this.serverManager.getDrainingServersList();
990     for (Map<ServerName, List<HRegionInfo>> map: result.values()) {
991       for (ServerName svr: onlineSvrs.keySet()) {
992         if (!map.containsKey(svr)) {
993           map.put(svr, new ArrayList<HRegionInfo>());
994         }
995       }
996       map.keySet().removeAll(drainingServers);
997     }
998     return result;
999   }
1000 
1001   protected RegionState getRegionState(final HRegionInfo hri) {
1002     return getRegionState(hri.getEncodedName());
1003   }
1004 
1005   /**
1006    * Returns a clone of region assignments per server
1007    * @return a Map of ServerName to a List of HRegionInfo's
1008    */
1009   protected synchronized Map<ServerName, List<HRegionInfo>> getRegionAssignmentsByServer() {
1010     Map<ServerName, List<HRegionInfo>> regionsByServer =
1011         new HashMap<ServerName, List<HRegionInfo>>(serverHoldings.size());
1012     for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
1013       regionsByServer.put(e.getKey(), new ArrayList<HRegionInfo>(e.getValue()));
1014     }
1015     return regionsByServer;
1016   }
1017 
1018   protected synchronized RegionState getRegionState(final String encodedName) {
1019     return regionStates.get(encodedName);
1020   }
1021 
1022   /**
1023    * Get the HRegionInfo from cache, if not there, from the hbase:meta table.
1024    * Be careful. Does RPC. Do not hold a lock or synchronize when you call this method.
1025    * @param  regionName
1026    * @return HRegionInfo for the region
1027    */
1028   @SuppressWarnings("deprecation")
1029   protected HRegionInfo getRegionInfo(final byte [] regionName) {
1030     String encodedName = HRegionInfo.encodeRegionName(regionName);
1031     RegionState regionState = getRegionState(encodedName);
1032     if (regionState != null) {
1033       return regionState.getRegion();
1034     }
1035 
1036     try {
1037       Pair<HRegionInfo, ServerName> p =
1038         MetaTableAccessor.getRegion(server.getConnection(), regionName);
1039       HRegionInfo hri = p == null ? null : p.getFirst();
1040       if (hri != null) {
1041         createRegionState(hri);
1042       }
1043       return hri;
1044     } catch (IOException e) {
1045       server.abort("Aborting because error occoured while reading "
1046         + Bytes.toStringBinary(regionName) + " from hbase:meta", e);
1047       return null;
1048     }
1049   }
1050 
1051   static boolean isOneOfStates(RegionState regionState, State... states) {
1052     State s = regionState != null ? regionState.getState() : null;
1053     for (State state: states) {
1054       if (s == state) return true;
1055     }
1056     return false;
1057   }
1058 
1059   /**
1060    * Update a region state. It will be put in transition if not already there.
1061    */
1062   private RegionState updateRegionState(final HRegionInfo hri,
1063       final RegionState.State state, final ServerName serverName, long openSeqNum) {
1064     if (state == RegionState.State.FAILED_CLOSE || state == RegionState.State.FAILED_OPEN) {
1065       LOG.warn("Failed to open/close " + hri.getShortNameToLog()
1066         + " on " + serverName + ", set to " + state);
1067     }
1068 
1069     String encodedName = hri.getEncodedName();
1070     RegionState regionState = new RegionState(
1071       hri, state, System.currentTimeMillis(), serverName);
1072     RegionState oldState = getRegionState(encodedName);
1073     if (!regionState.equals(oldState)) {
1074       LOG.info("Transition " + oldState + " to " + regionState);
1075       // Persist region state before updating in-memory info, if needed
1076       regionStateStore.updateRegionState(openSeqNum, regionState, oldState);
1077     }
1078 
1079     synchronized (this) {
1080       regionsInTransition.put(encodedName, regionState);
1081       putRegionState(regionState);
1082 
1083       // For these states, region should be properly closed.
1084       // There should be no log splitting issue.
1085       if ((state == State.CLOSED || state == State.MERGED
1086           || state == State.SPLIT) && lastAssignments.containsKey(encodedName)) {
1087         ServerName last = lastAssignments.get(encodedName);
1088         if (last.equals(serverName)) {
1089           lastAssignments.remove(encodedName);
1090         } else {
1091           LOG.warn(encodedName + " moved to " + state + " on "
1092             + serverName + ", expected " + last);
1093         }
1094       }
1095 
1096       // Once a region is opened, record its last assignment right away.
1097       if (serverName != null && state == State.OPEN) {
1098         ServerName last = lastAssignments.get(encodedName);
1099         if (!serverName.equals(last)) {
1100           lastAssignments.put(encodedName, serverName);
1101           if (last != null && isServerDeadAndNotProcessed(last)) {
1102             LOG.warn(encodedName + " moved to " + serverName
1103               + ", while it's previous host " + last
1104               + " is dead but not processed yet");
1105           }
1106         }
1107       }
1108 
1109       // notify the change
1110       this.notifyAll();
1111     }
1112     return regionState;
1113   }
1114 }