
1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.master;
19  
20  import java.io.IOException;
21  import java.util.ArrayList;
22  import java.util.Collection;
23  import java.util.HashMap;
24  import java.util.HashSet;
25  import java.util.Iterator;
26  import java.util.LinkedHashMap;
27  import java.util.LinkedList;
28  import java.util.List;
29  import java.util.Map;
30  import java.util.Set;
31  import java.util.Collections;
32  import java.util.Comparator;
33  import java.util.TreeMap;
34  
35  import com.google.common.annotations.VisibleForTesting;
36  import com.google.common.base.Preconditions;
37  
38  import org.apache.commons.logging.Log;
39  import org.apache.commons.logging.LogFactory;
40  import org.apache.hadoop.hbase.classification.InterfaceAudience;
41  import org.apache.hadoop.conf.Configuration;
42  import org.apache.hadoop.hbase.HConstants;
43  import org.apache.hadoop.hbase.HRegionInfo;
44  import org.apache.hadoop.hbase.HTableDescriptor;
45  import org.apache.hadoop.hbase.MetaTableAccessor;
46  import org.apache.hadoop.hbase.ServerLoad;
47  import org.apache.hadoop.hbase.ServerName;
48  import org.apache.hadoop.hbase.TableName;
49  import org.apache.hadoop.hbase.client.RegionReplicaUtil;
50  import org.apache.hadoop.hbase.master.RegionState.State;
51  import org.apache.hadoop.hbase.client.TableState;
52  import org.apache.hadoop.hbase.util.Bytes;
53  import org.apache.hadoop.hbase.util.FSUtils;
54  import org.apache.hadoop.hbase.util.Pair;
55  
56  /**
57   * Region state accountant. It holds the states of all regions in memory.
58   * In a normal scenario, it should match the meta table and the true region states.
59   *
60   * This map is used by AssignmentManager to track region states.
61   */
62  @InterfaceAudience.Private
63  public class RegionStates {
64    private static final Log LOG = LogFactory.getLog(RegionStates.class);
65  
66    /**
67     * Regions currently in transition.
68     */
69    final HashMap<String, RegionState> regionsInTransition =
70      new HashMap<String, RegionState>();
71  
72    /**
73     * Region encoded name to state map.
74     * All the regions should be in this map.
75     */
76    private final Map<String, RegionState> regionStates =
77      new HashMap<String, RegionState>();
78  
79    /**
80     * Holds mapping of table -> region state
81     */
82    private final Map<TableName, Map<String, RegionState>> regionStatesTableIndex =
83        new HashMap<TableName, Map<String, RegionState>>();
84  
85    /**
86     * Server to regions assignment map.
87     * Contains the set of regions currently assigned to a given server.
88     */
89    private final Map<ServerName, Set<HRegionInfo>> serverHoldings =
90      new HashMap<ServerName, Set<HRegionInfo>>();
91  
92    /**
93     * Maintains the mapping from the default region to the replica regions.
94     */
95    private final Map<HRegionInfo, Set<HRegionInfo>> defaultReplicaToOtherReplicas =
96      new HashMap<HRegionInfo, Set<HRegionInfo>>();
97  
98    /**
99     * Region to server assignment map.
100    * Contains the server a given region is currently assigned to.
101    */
102   private final TreeMap<HRegionInfo, ServerName> regionAssignments =
103     new TreeMap<HRegionInfo, ServerName>();
104 
105   /**
106    * Encoded region name to server assignment map for re-assignment
107    * purpose. Contains the server a given region was last known to be assigned
108    * to, which has not completed log splitting, so the region is not assignable.
109    * If a region is currently assigned, this server info in this
110    * map should be the same as that in regionAssignments.
111    * However the info in regionAssignments is cleared when the region
112    * is offline while the info in lastAssignments is cleared when
113    * the region is closed or the server is dead and processed.
114    */
115   private final HashMap<String, ServerName> lastAssignments =
116     new HashMap<String, ServerName>();
117 
118   /**
119    * Encoded region name to server assignment map, used to clean up
120    * serverHoldings when a region comes online on a new server. When the
121    * region is offlined from the previous server, we clean up
122    * regionAssignments so that it has the latest assignment map. But we
123    * don't clean up serverHoldings to match the meta. We need this map to
124    * find the old server whose serverHoldings needs cleanup, given a moved
125    * region.
126    */
127   private final HashMap<String, ServerName> oldAssignments =
128     new HashMap<String, ServerName>();
129 
130   /**
131    * Map a host port pair string to the latest start code
132    * of a region server which is known to be dead. It is dead
133    * to us, but server manager may not know it yet.
134    */
135   private final HashMap<String, Long> deadServers =
136     new HashMap<String, Long>();
137 
138   /**
139    * Map a dead server to the time when its log splitting is done.
140    * Since log splitting is not ordered, we have to remember
141    * all processed instances. The map is cleaned up based
142    * on a configured time. By default, we assume a dead
143    * server should be done with log splitting in two hours.
144    */
145   private final HashMap<ServerName, Long> processedServers =
146     new HashMap<ServerName, Long>();
147   private long lastProcessedServerCleanTime;
148 
149   private final TableStateManager tableStateManager;
150   private final RegionStateStore regionStateStore;
151   private final ServerManager serverManager;
152   private final MasterServices server;
153 
154   // The maximum time to keep a log split info in region states map
155   static final String LOG_SPLIT_TIME = "hbase.master.maximum.logsplit.keeptime";
156   static final long DEFAULT_LOG_SPLIT_TIME = 7200000L; // 2 hours
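  // A minimal configuration sketch (assuming the usual hbase-site.xml deployment):
  // the retention window for processed dead servers can be tuned via the
  // LOG_SPLIT_TIME key, which logSplit() reads from the master's Configuration, e.g.
  //
  //   <property>
  //     <name>hbase.master.maximum.logsplit.keeptime</name>
  //     <value>3600000</value> <!-- keep processed-server entries for one hour -->
  //   </property>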
157 
158   RegionStates(final MasterServices master, final TableStateManager tableStateManager,
159       final ServerManager serverManager, final RegionStateStore regionStateStore) {
160     this.tableStateManager = tableStateManager;
161     this.regionStateStore = regionStateStore;
162     this.serverManager = serverManager;
163     this.server = master;
164   }
165 
166   /**
167    * @return a copy of the region assignment map
168    */
169   public synchronized Map<HRegionInfo, ServerName> getRegionAssignments() {
170     return new TreeMap<HRegionInfo, ServerName>(regionAssignments);
171   }
172 
173   /**
174    * Return the replicas (including default) for the regions grouped by ServerName
175    * @param regions the regions to look up
176    * @return the groupings as a map of ServerName to the list of replicas it hosts
177    */
178   synchronized Map<ServerName, List<HRegionInfo>> getRegionAssignments(
179     Collection<HRegionInfo> regions) {
180     Map<ServerName, List<HRegionInfo>> map = new HashMap<ServerName, List<HRegionInfo>>();
181     for (HRegionInfo region : regions) {
182       HRegionInfo defaultReplica = RegionReplicaUtil.getRegionInfoForDefaultReplica(region);
183       Set<HRegionInfo> allReplicas = defaultReplicaToOtherReplicas.get(defaultReplica);
184       if (allReplicas != null) {
185         for (HRegionInfo hri : allReplicas) {
186           ServerName server = regionAssignments.get(hri);
187           if (server != null) {
188             List<HRegionInfo> regionsOnServer = map.get(server);
189             if (regionsOnServer == null) {
190               regionsOnServer = new ArrayList<HRegionInfo>(1);
191               map.put(server, regionsOnServer);
192             }
193             regionsOnServer.add(hri);
194           }
195         }
196       }
197     }
198     return map;
199   }
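  // A small usage sketch (variable names are assumed for illustration): group the
  // replicas of a set of regions by the server currently hosting them.
  //
  //   Map<ServerName, List<HRegionInfo>> byServer =
  //       regionStates.getRegionAssignments(regionsOfInterest);
  //   for (Map.Entry<ServerName, List<HRegionInfo>> e : byServer.entrySet()) {
  //     LOG.debug(e.getKey() + " hosts " + e.getValue().size() + " replica(s)");
  //   }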
200 
201   public synchronized ServerName getRegionServerOfRegion(HRegionInfo hri) {
202     return regionAssignments.get(hri);
203   }
204 
205   /**
206    * Get regions in transition and their states
207    */
208   @SuppressWarnings("unchecked")
209   public synchronized Map<String, RegionState> getRegionsInTransition() {
210     return (Map<String, RegionState>)regionsInTransition.clone();
211   }
212 
213   @SuppressWarnings("unchecked")
214   public synchronized Map<String, RegionState> getRegionsInTransitionOrderedByTimestamp() {
215     Map<String, RegionState> rit = (Map<String, RegionState>)regionsInTransition.clone();
216     List<Map.Entry<String, RegionState>> list = new LinkedList<>(rit.entrySet());
217 
218     // Compare the RITs' timestamps for ordering.
219     Comparator<Map.Entry<String, RegionState>> c =
220         new Comparator<Map.Entry<String, RegionState>>() {
221       @Override
222       public int compare(Map.Entry<String, RegionState> o1, Map.Entry<String, RegionState> o2) {
223         return ((Long)o1.getValue().getStamp()).compareTo((Long)o2.getValue().getStamp());
224       }
225     };
226 
227     Collections.sort(list, c);
228     Map<String, RegionState> result = new LinkedHashMap<>();
229     for (Map.Entry<String, RegionState> entry : list) {
230       result.put(entry.getKey(), entry.getValue());
231     }
232     return result;
233   }
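  // On Java 8+ the explicit Comparator above could equivalently be written as
  // (an illustrative alternative, same ordering, no behavior change):
  //
  //   list.sort(Comparator.comparingLong(e -> e.getValue().getStamp()));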
234 
235   /**
236    * @return True if the specified region is in transition.
237    */
238   public synchronized boolean isRegionInTransition(final HRegionInfo hri) {
239     return regionsInTransition.containsKey(hri.getEncodedName());
240   }
241 
242   /**
243    * @return True if the specified region is in transition.
244    */
245   public synchronized boolean isRegionInTransition(final String encodedName) {
246     return regionsInTransition.containsKey(encodedName);
247   }
248 
249   /**
250    * @return True if any region is in transition.
251    */
252   public synchronized boolean isRegionsInTransition() {
253     return !regionsInTransition.isEmpty();
254   }
255 
256   /**
257    * @return True if hbase:meta table region is in transition.
258    */
259   public synchronized boolean isMetaRegionInTransition() {
260     for (RegionState state : regionsInTransition.values()) {
261       if (state.getRegion().isMetaRegion()) return true;
262     }
263     return false;
264   }
265 
266   /**
267    * @return True if the specified region is assigned, and not in transition.
268    */
269   public synchronized boolean isRegionOnline(final HRegionInfo hri) {
270     return !isRegionInTransition(hri) && regionAssignments.containsKey(hri);
271   }
272 
273   /**
274    * @return True if the specified region is offline/closed, but not in transition.
275    * If the region is not in the map, it is offline to us too.
276    */
277   public synchronized boolean isRegionOffline(final HRegionInfo hri) {
278     return getRegionState(hri) == null || (!isRegionInTransition(hri)
279       && isRegionInState(hri, State.OFFLINE, State.CLOSED));
280   }
281 
282   /**
283    * @return True if specified region is in one of the specified states.
284    */
285   public boolean isRegionInState(
286       final HRegionInfo hri, final State... states) {
287     return isRegionInState(hri.getEncodedName(), states);
288   }
289 
290   /**
291    * @return True if specified region is in one of the specified states.
292    */
293   public boolean isRegionInState(
294       final String encodedName, final State... states) {
295     RegionState regionState = getRegionState(encodedName);
296     return isOneOfStates(regionState, states);
297   }
298 
299   /**
300    * Wait for the state map to be updated by assignment manager.
301    */
302   public synchronized void waitForUpdate(
303       final long timeout) throws InterruptedException {
304     this.wait(timeout);
305   }
306 
307   /**
308    * Get region transition state
309    */
310   public RegionState getRegionTransitionState(final HRegionInfo hri) {
311     return getRegionTransitionState(hri.getEncodedName());
312   }
313 
314   /**
315    * Get region transition state
316    */
317   public synchronized RegionState
318       getRegionTransitionState(final String encodedName) {
319     return regionsInTransition.get(encodedName);
320   }
321 
322   /**
323    * Add a list of regions to RegionStates. If a region is split
324    * and offline, its state will be SPLIT. Otherwise, its state will
325    * be OFFLINE. Regions already in RegionStates will be skipped.
326    */
327   public void createRegionStates(
328       final List<HRegionInfo> hris) {
329     for (HRegionInfo hri: hris) {
330       createRegionState(hri);
331     }
332   }
333 
334   /**
335    * Add a region to RegionStates. If the region is split
336    * and offline, its state will be SPLIT. Otherwise, its state will
337    * be OFFLINE. If it is already in RegionStates, this call has
338    * no effect, and the original state is returned.
339    */
340   public RegionState createRegionState(final HRegionInfo hri) {
341     return createRegionState(hri, null, null, null);
342   }
343 
344   /**
345    * Add a region to RegionStates with the specified state.
346    * If the region is already in RegionStates, this call has
347    * no effect, and the original state is returned.
348    *
349    * @param hri the region info to create a state for
350    * @param newState the state to set the region to
351    * @param serverName the server the region is transitioning on
352    * @param lastHost the last server that hosts the region
353    * @return the current state
354    */
355   public synchronized RegionState createRegionState(final HRegionInfo hri,
356       State newState, ServerName serverName, ServerName lastHost) {
357     if (newState == null || (newState == State.OPEN && serverName == null)) {
358       newState =  State.OFFLINE;
359     }
360     if (hri.isOffline() && hri.isSplit()) {
361       newState = State.SPLIT;
362       serverName = null;
363     }
364     String encodedName = hri.getEncodedName();
365     RegionState regionState = regionStates.get(encodedName);
366     if (regionState != null) {
367       LOG.warn("Tried to create a state for a region already in RegionStates, "
368         + "used existing: " + regionState + ", ignored new: " + newState);
369     } else {
370       regionState = new RegionState(hri, newState, serverName);
371       putRegionState(regionState);
372       if (newState == State.OPEN) {
373         if (!serverName.equals(lastHost)) {
374           LOG.warn("Open region's last host " + lastHost
375             + " should be the same as the current one " + serverName
376             + ", ignored the last and used the current one");
377           lastHost = serverName;
378         }
379         lastAssignments.put(encodedName, lastHost);
380         regionAssignments.put(hri, lastHost);
381       } else if (!isOneOfStates(regionState, State.MERGED, State.SPLIT, State.OFFLINE)) {
382         regionsInTransition.put(encodedName, regionState);
383       }
384       if (lastHost != null && newState != State.SPLIT) {
385         addToServerHoldings(lastHost, hri);
386         if (newState != State.OPEN) {
387           oldAssignments.put(encodedName, lastHost);
388         }
389       }
390     }
391     return regionState;
392   }
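  // Illustrative walk-through of the rules above (region and server names are assumed):
  //
  //   // A split-and-offline parent is always recorded as SPLIT with no server:
  //   createRegionState(splitParent, State.OPEN, sn, sn);   // stored as SPLIT
  //
  //   // An OPEN request without a server falls back to OFFLINE; OFFLINE, SPLIT
  //   // and MERGED regions are not added to regionsInTransition:
  //   createRegionState(hri, State.OPEN, null, null);       // stored as OFFLINE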
393 
394   private RegionState putRegionState(RegionState regionState) {
395     HRegionInfo hri = regionState.getRegion();
396     String encodedName = hri.getEncodedName();
397     TableName table = hri.getTable();
398     RegionState oldState = regionStates.put(encodedName, regionState);
399     Map<String, RegionState> map = regionStatesTableIndex.get(table);
400     if (map == null) {
401       map = new HashMap<String, RegionState>();
402       regionStatesTableIndex.put(table, map);
403     }
404     map.put(encodedName, regionState);
405     return oldState;
406   }
407 
408   /**
409    * Update a region state. It will be put in transition if not already there.
410    */
411   public RegionState updateRegionState(
412       final HRegionInfo hri, final State state) {
413     RegionState regionState = getRegionState(hri.getEncodedName());
414     return updateRegionState(hri, state,
415       regionState == null ? null : regionState.getServerName());
416   }
417 
418   /**
419    * Update a region state. It will be put in transition if not already there.
420    */
421   public RegionState updateRegionState(
422       final HRegionInfo hri, final State state, final ServerName serverName) {
423     return updateRegionState(hri, state, serverName, HConstants.NO_SEQNUM);
424   }
425 
426   public void regionOnline(final HRegionInfo hri, final ServerName serverName) {
427     regionOnline(hri, serverName, HConstants.NO_SEQNUM);
428   }
429 
430   /**
431    * A region is online, won't be in transition any more.
432    * We can't confirm it is really online on the specified region server
433    * because it hasn't been put in the region server's online region list yet.
434    */
435   public void regionOnline(final HRegionInfo hri, final ServerName serverName, long openSeqNum) {
436     String encodedName = hri.getEncodedName();
437     if (!serverManager.isServerOnline(serverName)) {
438       // This is possible if the region server dies before master gets a
439       // chance to handle ZK event in time. At this time, if the dead server
440       // is already processed by SSH, we should ignore this event.
441       // If not processed yet, ignore and let SSH deal with it.
442       LOG.warn("Ignored, " + encodedName + " was opened on a dead server: " + serverName);
443       return;
444     }
445     updateRegionState(hri, State.OPEN, serverName, openSeqNum);
446 
447     synchronized (this) {
448       regionsInTransition.remove(encodedName);
449       ServerName oldServerName = regionAssignments.put(hri, serverName);
450       if (!serverName.equals(oldServerName)) {
451         if (LOG.isDebugEnabled()) {
452           LOG.debug("Onlined " + hri.getShortNameToLog() + " on " + serverName);
453         }
454         addToServerHoldings(serverName, hri);
455         addToReplicaMapping(hri);
456         if (oldServerName == null) {
457           oldServerName = oldAssignments.remove(encodedName);
458         }
459         if (oldServerName != null
460             && !oldServerName.equals(serverName)
461             && serverHoldings.containsKey(oldServerName)) {
462           LOG.info("Offlined " + hri.getShortNameToLog() + " from " + oldServerName);
463           removeFromServerHoldings(oldServerName, hri);
464         }
465       }
466     }
467   }
468 
469   private void addToServerHoldings(ServerName serverName, HRegionInfo hri) {
470     Set<HRegionInfo> regions = serverHoldings.get(serverName);
471     if (regions == null) {
472       regions = new HashSet<HRegionInfo>();
473       serverHoldings.put(serverName, regions);
474     }
475     regions.add(hri);
476   }
477 
478   private void addToReplicaMapping(HRegionInfo hri) {
479     HRegionInfo defaultReplica = RegionReplicaUtil.getRegionInfoForDefaultReplica(hri);
480     Set<HRegionInfo> replicas =
481         defaultReplicaToOtherReplicas.get(defaultReplica);
482     if (replicas == null) {
483       replicas = new HashSet<HRegionInfo>();
484       defaultReplicaToOtherReplicas.put(defaultReplica, replicas);
485     }
486     replicas.add(hri);
487   }
488 
489   private void removeFromServerHoldings(ServerName serverName, HRegionInfo hri) {
490     Set<HRegionInfo> oldRegions = serverHoldings.get(serverName);
491     oldRegions.remove(hri);
492     if (oldRegions.isEmpty()) {
493       serverHoldings.remove(serverName);
494     }
495   }
496 
497   private void removeFromReplicaMapping(HRegionInfo hri) {
498     HRegionInfo defaultReplica = RegionReplicaUtil.getRegionInfoForDefaultReplica(hri);
499     Set<HRegionInfo> replicas = defaultReplicaToOtherReplicas.get(defaultReplica);
500     if (replicas != null) {
501       replicas.remove(hri);
502       if (replicas.isEmpty()) {
503         defaultReplicaToOtherReplicas.remove(defaultReplica);
504       }
505     }
506   }
507 
508   /**
509    * A dead server's wals have been split so that all the regions
510    * that used to be open on it can be safely assigned now. Mark them assignable.
511    */
512   public synchronized void logSplit(final ServerName serverName) {
513     for (Iterator<Map.Entry<String, ServerName>> it
514         = lastAssignments.entrySet().iterator(); it.hasNext();) {
515       Map.Entry<String, ServerName> e = it.next();
516       if (e.getValue().equals(serverName)) {
517         it.remove();
518       }
519     }
520     long now = System.currentTimeMillis();
521     if (LOG.isDebugEnabled()) {
522       LOG.debug("Adding to log splitting servers " + serverName);
523     }
524     processedServers.put(serverName, Long.valueOf(now));
525     Configuration conf = server.getConfiguration();
526     long obsoleteTime = conf.getLong(LOG_SPLIT_TIME, DEFAULT_LOG_SPLIT_TIME);
527     // Doesn't have to be very accurate about the clean up time
528     if (now > lastProcessedServerCleanTime + obsoleteTime) {
529       lastProcessedServerCleanTime = now;
530       long cutoff = now - obsoleteTime;
531       for (Iterator<Map.Entry<ServerName, Long>> it
532           = processedServers.entrySet().iterator(); it.hasNext();) {
533         Map.Entry<ServerName, Long> e = it.next();
534         if (e.getValue().longValue() < cutoff) {
535           if (LOG.isDebugEnabled()) {
536             LOG.debug("Removed from log splitting servers " + e.getKey());
537           }
538           it.remove();
539         }
540       }
541     }
542   }
543 
544   /**
545    * Log split is done for a given region, so it is assignable now.
546    */
547   public void logSplit(final HRegionInfo region) {
548     clearLastAssignment(region);
549   }
550 
551   public synchronized void clearLastAssignment(final HRegionInfo region) {
552     lastAssignments.remove(region.getEncodedName());
553   }
554 
555   /**
556    * A region is offline, won't be in transition any more.
557    */
558   public void regionOffline(final HRegionInfo hri) {
559     regionOffline(hri, null);
560   }
561 
562   /**
563    * A region is offline, won't be in transition any more. Its state
564    * should be the specified expected state, which can only be
565    * Split/Merged/Offline/null(=Offline)/SplittingNew/MergingNew.
566    */
567   public void regionOffline(
568       final HRegionInfo hri, final State expectedState) {
569     Preconditions.checkArgument(expectedState == null
570       || RegionState.isUnassignable(expectedState),
571         "Offlined region should not be " + expectedState);
572     if (isRegionInState(hri, State.SPLITTING_NEW, State.MERGING_NEW)) {
573       // Remove it from all region maps
574       deleteRegion(hri);
575       return;
576     }
577     State newState =
578       expectedState == null ? State.OFFLINE : expectedState;
579     updateRegionState(hri, newState);
580     String encodedName = hri.getEncodedName();
581     synchronized (this) {
582       regionsInTransition.remove(encodedName);
583       ServerName oldServerName = regionAssignments.remove(hri);
584       if (oldServerName != null && serverHoldings.containsKey(oldServerName)) {
585         if (newState == State.MERGED || newState == State.SPLIT
586             || hri.isMetaRegion() || tableStateManager.isTableState(hri.getTable(),
587               TableState.State.DISABLED, TableState.State.DISABLING)) {
588           // Offline the region only if it's merged/split, or the table is disabled/disabling.
589           // Otherwise, offline it from this server only when it is online on a different server.
590           LOG.info("Offlined " + hri.getShortNameToLog() + " from " + oldServerName);
591           removeFromServerHoldings(oldServerName, hri);
592           removeFromReplicaMapping(hri);
593         } else {
594           // Need to remember it so that we can offline it from this
595           // server when it is online on a different server.
596           oldAssignments.put(encodedName, oldServerName);
597         }
598       }
599     }
600   }
601 
602   /**
603    * A server is offline, all regions on it are dead.
604    */
605   public List<HRegionInfo> serverOffline(final ServerName sn) {
606     // Offline all regions on this server not already in transition.
607     List<HRegionInfo> rits = new ArrayList<HRegionInfo>();
608     Set<HRegionInfo> regionsToCleanIfNoMetaEntry = new HashSet<HRegionInfo>();
609     // Offline regions outside the loop and synchronized block to avoid
610     // ConcurrentModificationException and deadlock in case meta is unassigned
611     // but its RegionState is blocked.
612     Set<HRegionInfo> regionsToOffline = new HashSet<HRegionInfo>();
613     synchronized (this) {
614       Set<HRegionInfo> assignedRegions = serverHoldings.get(sn);
615       if (assignedRegions == null) {
616         assignedRegions = new HashSet<HRegionInfo>();
617       }
618 
619       for (HRegionInfo region : assignedRegions) {
620         // Offline open regions, no need to offline if SPLIT/MERGED/OFFLINE
621         if (isRegionOnline(region)) {
622           regionsToOffline.add(region);
623         } else if (isRegionInState(region, State.SPLITTING, State.MERGING)) {
624           LOG.debug("Offline splitting/merging region " + getRegionState(region));
625           regionsToOffline.add(region);
626         }
627       }
628 
629       for (RegionState state : regionsInTransition.values()) {
630         HRegionInfo hri = state.getRegion();
631         if (assignedRegions.contains(hri)) {
632           // Region is open on this region server, but in transition.
633           // This region must be moving away from this server, or splitting/merging.
634           // SSH will handle it, either skip assigning, or re-assign.
635           LOG.info("Transitioning " + state + " will be handled by ServerCrashProcedure for " + sn);
636         } else if (sn.equals(state.getServerName())) {
637           // Region is in transition on this region server, and this
638           // region is not open on this server. So the region must be
639           // moving to this server from another one (i.e. opening or
640           // pending open on this server, was open on another one).
641           // Offline state is also a kind of pending open if the region is in
642           // transition. The region could be in failed_close state too if we have
643           // tried several times to open it while this region server is not reachable.
644           if (isOneOfStates(state, State.OPENING, State.PENDING_OPEN,
645               State.FAILED_OPEN, State.FAILED_CLOSE, State.OFFLINE)) {
646             LOG.info("Found region in " + state +
647               " to be reassigned by ServerCrashProcedure for " + sn);
648             rits.add(hri);
649           } else if (isOneOfStates(state, State.SPLITTING_NEW)) {
650             regionsToCleanIfNoMetaEntry.add(state.getRegion());
651           } else {
652             LOG.warn("THIS SHOULD NOT HAPPEN: unexpected " + state);
653           }
654         }
655       }
656       this.notifyAll();
657     }
658 
659     for (HRegionInfo hri : regionsToOffline) {
660       regionOffline(hri);
661     }
662 
663     cleanIfNoMetaEntry(regionsToCleanIfNoMetaEntry);
664     return rits;
665   }
666 
667   /**
668    * This method does an RPC to hbase:meta. Do not call this method with a lock/synchronize held.
669    * @param hris The hris to check if empty in hbase:meta and if so, clean them up.
670    */
671   private void cleanIfNoMetaEntry(Set<HRegionInfo> hris) {
672     if (hris.isEmpty()) return;
673     for (HRegionInfo hri: hris) {
674       try {
675         // This is RPC to meta table. It is done while we have a synchronize on
676         // regionstates. No progress will be made if meta is not available at this time.
677         // This is a cleanup task. Not critical.
678         if (MetaTableAccessor.getRegion(server.getConnection(), hri.getEncodedNameAsBytes()) ==
679             null) {
680           regionOffline(hri);
681           FSUtils.deleteRegionDir(server.getConfiguration(), hri);
682         }
683       } catch (IOException e) {
684         LOG.warn("Got exception while deleting " + hri + " directories from file system.", e);
685       }
686     }
687   }
688 
689   /**
690    * Gets the online regions of the specified table.
691    * This method looks at the in-memory state.  It does not go to <code>hbase:meta</code>.
692    * Only returns <em>online</em> regions.  If a region on this table has been
693    * closed during a disable, etc., it will not be included in the returned list.
694    * So, the returned list may not necessarily be ALL regions in this table, it's
695    * all the ONLINE regions in the table.
696    * @param tableName
697    * @return Online regions from <code>tableName</code>
698    */
699   public synchronized List<HRegionInfo> getRegionsOfTable(TableName tableName) {
700     List<HRegionInfo> tableRegions = new ArrayList<HRegionInfo>();
701     // boundary needs to have table's name but regionID 0 so that it is sorted
702     // before all table's regions.
703     HRegionInfo boundary = new HRegionInfo(tableName, null, null, false, 0L);
704     for (HRegionInfo hri: regionAssignments.tailMap(boundary).keySet()) {
705       if(!hri.getTable().equals(tableName)) break;
706       tableRegions.add(hri);
707     }
708     return tableRegions;
709   }
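  // Illustrative note on the boundary trick above: regionId 0 makes the synthetic
  // boundary region sort before every real region of the table, so the tailMap
  // scan starts at the table's first online region and stops at the first region
  // belonging to a different table.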
710 
711   /**
712    * Gets current state of all regions of the table.
713    * This method looks at the in-memory state.  It does not go to <code>hbase:meta</code>.
714    * The returned map is guaranteed to have a key for every state
715    * in {@link org.apache.hadoop.hbase.master.RegionState.State}.
716    *
717    * @param tableName
718    * @return a map from each state to the regions of <code>tableName</code> in that state
719    */
720   public synchronized Map<RegionState.State, List<HRegionInfo>>
721   getRegionByStateOfTable(TableName tableName) {
722     Map<RegionState.State, List<HRegionInfo>> tableRegions =
723         new HashMap<State, List<HRegionInfo>>();
724     for (State state : State.values()) {
725       tableRegions.put(state, new ArrayList<HRegionInfo>());
726     }
727     Map<String, RegionState> indexMap = regionStatesTableIndex.get(tableName);
728     if (indexMap == null)
729       return tableRegions;
730     for (RegionState regionState : indexMap.values()) {
731       tableRegions.get(regionState.getState()).add(regionState.getRegion());
732     }
733     return tableRegions;
734   }
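  // Example use (illustrative; the RegionStates reference is assumed): since every
  // State is pre-populated with a list, the lookup below never returns null.
  //
  //   int failedOpen = regionStates.getRegionByStateOfTable(tableName)
  //       .get(RegionState.State.FAILED_OPEN).size();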
735 
736   /**
737    * Wait on region to clear regions-in-transition.
738    * <p>
739    * If the region isn't in transition, returns immediately.  Otherwise, method
740    * blocks until the region is out of transition.
741    */
742   public synchronized void waitOnRegionToClearRegionsInTransition(
743       final HRegionInfo hri) throws InterruptedException {
744     if (!isRegionInTransition(hri)) return;
745 
746     while(!server.isStopped() && isRegionInTransition(hri)) {
747       RegionState rs = getRegionState(hri);
748       LOG.info("Waiting on " + rs + " to clear regions-in-transition");
749       waitForUpdate(100);
750     }
751 
752     if (server.isStopped()) {
753       LOG.info("Giving up wait on region in " +
754         "transition because stoppable.isStopped is set");
755     }
756   }
757 
758   /**
759    * A table is deleted. Remove its regions from all internal maps.
760    * We loop through all regions, assuming tables are not deleted very often.
761    */
762   public void tableDeleted(final TableName tableName) {
763     Set<HRegionInfo> regionsToDelete = new HashSet<HRegionInfo>();
764     synchronized (this) {
765       for (RegionState state: regionStates.values()) {
766         HRegionInfo region = state.getRegion();
767         if (region.getTable().equals(tableName)) {
768           regionsToDelete.add(region);
769         }
770       }
771     }
772     for (HRegionInfo region: regionsToDelete) {
773       deleteRegion(region);
774     }
775   }
776 
777   /**
778    * Get a copy of all regions assigned to a server
779    */
780   public synchronized Set<HRegionInfo> getServerRegions(ServerName serverName) {
781     Set<HRegionInfo> regions = serverHoldings.get(serverName);
782     if (regions == null) return null;
783     return new HashSet<HRegionInfo>(regions);
784   }
785 
786   /**
787    * Remove a region from all state maps.
788    */
789   @VisibleForTesting
790   public synchronized void deleteRegion(final HRegionInfo hri) {
791     String encodedName = hri.getEncodedName();
792     regionsInTransition.remove(encodedName);
793     regionStates.remove(encodedName);
794     TableName table = hri.getTable();
795     Map<String, RegionState> indexMap = regionStatesTableIndex.get(table);
796     indexMap.remove(encodedName);
797     if (indexMap.size() == 0)
798       regionStatesTableIndex.remove(table);
799     lastAssignments.remove(encodedName);
800     ServerName sn = regionAssignments.remove(hri);
801     if (sn != null) {
802       Set<HRegionInfo> regions = serverHoldings.get(sn);
803       regions.remove(hri);
804     }
805   }
806 
807   /**
808    * Check if a region was assigned to a server which is not online now.
809    * If so, we should hold off re-assigning this region till SSH has split its wals.
810    * Once logs are split, the last assignment of this region will be reset,
811    * which means a null last assignment server is ok for re-assigning.
812    *
813    * A region server could be dead but we don't know it yet. We may
814    * falsely think it's online. Therefore if a server appears online, we still
815    * need to confirm it is reachable and has the expected start code.
816    */
817   synchronized boolean wasRegionOnDeadServer(final String encodedName) {
818     ServerName server = lastAssignments.get(encodedName);
819     return isServerDeadAndNotProcessed(server);
820   }
821 
822   synchronized boolean isServerDeadAndNotProcessed(ServerName server) {
823     if (server == null) return false;
824     if (serverManager.isServerOnline(server)) {
825       String hostAndPort = server.getHostAndPort();
826       long startCode = server.getStartcode();
827       Long deadCode = deadServers.get(hostAndPort);
828       if (deadCode == null || startCode > deadCode.longValue()) {
829         if (serverManager.isServerReachable(server)) {
830           return false;
831         }
832         // The size of deadServers won't grow unbounded.
833         deadServers.put(hostAndPort, Long.valueOf(startCode));
834       }
835       // Watch out! If the server is not dead, the region could
836       // remain unassigned. That's why ServerManager#isServerReachable
837       // should use some retry.
838       //
839       // We cache this info since it is very unlikely for that
840       // instance to come back up later on. We don't want to expire
841       // the server since we prefer to let it die naturally.
842       LOG.warn("Couldn't reach online server " + server);
843     }
844     // Now, we know it's dead. Check if it's processed
845     return !processedServers.containsKey(server);
846   }
847 
848  /**
849    * Get the last region server a region was on for purpose of re-assignment,
850    * i.e. should the re-assignment be held back till log split is done?
851    */
852   synchronized ServerName getLastRegionServerOfRegion(final String encodedName) {
853     return lastAssignments.get(encodedName);
854   }
855 
856   synchronized void setLastRegionServerOfRegions(
857       final ServerName serverName, final List<HRegionInfo> regionInfos) {
858     for (HRegionInfo hri: regionInfos) {
859       setLastRegionServerOfRegion(serverName, hri.getEncodedName());
860     }
861   }
862 
863   synchronized void setLastRegionServerOfRegion(
864       final ServerName serverName, final String encodedName) {
865     lastAssignments.put(encodedName, serverName);
866   }
867 
868   synchronized boolean isRegionOnServer(
869       final HRegionInfo hri, final ServerName serverName) {
870     Set<HRegionInfo> regions = serverHoldings.get(serverName);
871     return regions == null ? false : regions.contains(hri);
872   }
873 
874   void splitRegion(HRegionInfo p,
875       HRegionInfo a, HRegionInfo b, ServerName sn) throws IOException {
876 
877     regionStateStore.splitRegion(p, a, b, sn, getRegionReplication(p));
878     synchronized (this) {
879       // After PONR, split is considered to be done.
880       // Update server holdings to be aligned with the meta.
881       Set<HRegionInfo> regions = serverHoldings.get(sn);
882       if (regions == null) {
883         throw new IllegalStateException(sn + " should host some regions");
884       }
885       regions.remove(p);
886       regions.add(a);
887       regions.add(b);
888     }
889   }
890 
891   void mergeRegions(HRegionInfo p,
892       HRegionInfo a, HRegionInfo b, ServerName sn) throws IOException {
893     regionStateStore.mergeRegions(p, a, b, sn, getRegionReplication(a));
894     synchronized (this) {
895       // After PONR, merge is considered to be done.
896       // Update server holdings to be aligned with the meta.
897       Set<HRegionInfo> regions = serverHoldings.get(sn);
898       if (regions == null) {
899         throw new IllegalStateException(sn + " should host some regions");
900       }
901       regions.remove(a);
902       regions.remove(b);
903       regions.add(p);
904     }
905   }
906 
907   private int getRegionReplication(HRegionInfo r) throws IOException {
908     if (tableStateManager != null) {
909       HTableDescriptor htd = server.getTableDescriptors().get(r.getTable());
910       if (htd != null) {
911         return htd.getRegionReplication();
912       }
913     }
914     return 1;
915   }
916 
917   /**
918    * At cluster clean re/start, mark all user regions closed except those of tables
919    * that are excluded, such as disabled/disabling/enabling tables. All user regions
920    * and their previous locations are returned.
921    */
922   synchronized Map<HRegionInfo, ServerName> closeAllUserRegions(Set<TableName> excludedTables) {
923     boolean noExcludeTables = excludedTables == null || excludedTables.isEmpty();
924     Set<HRegionInfo> toBeClosed = new HashSet<HRegionInfo>(regionStates.size());
925     for(RegionState state: regionStates.values()) {
926       HRegionInfo hri = state.getRegion();
927       if (state.isSplit() || hri.isSplit()) {
928         continue;
929       }
930       TableName tableName = hri.getTable();
931       if (!TableName.META_TABLE_NAME.equals(tableName)
932           && (noExcludeTables || !excludedTables.contains(tableName))) {
933         toBeClosed.add(hri);
934       }
935     }
936     Map<HRegionInfo, ServerName> allUserRegions =
937       new HashMap<HRegionInfo, ServerName>(toBeClosed.size());
938     for (HRegionInfo hri: toBeClosed) {
939       RegionState regionState = updateRegionState(hri, State.CLOSED);
940       allUserRegions.put(hri, regionState.getServerName());
941     }
942     return allUserRegions;
943   }
944 
945   /**
946    * Compute the average load across all region servers.
947    * Currently, this uses a very naive computation - just uses the number of
948    * regions being served, ignoring stats about number of requests.
949    * @return the average load
950    */
951   protected synchronized double getAverageLoad() {
952     int numServers = 0, totalLoad = 0;
953     for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
954       Set<HRegionInfo> regions = e.getValue();
955       ServerName serverName = e.getKey();
956       int regionCount = regions.size();
957       if (serverManager.isServerOnline(serverName)) {
958         totalLoad += regionCount;
959         numServers++;
960       }
961     }
962     if (numServers > 1) {
963       // The master region server holds only a couple regions.
964       // Don't consider this server in calculating the average load
965       // if there are other region servers to avoid possible confusion.
966       Set<HRegionInfo> hris = serverHoldings.get(server.getServerName());
967       if (hris != null) {
968         totalLoad -= hris.size();
969         numServers--;
970       }
971     }
972     return numServers == 0 ? 0.0 :
973       (double)totalLoad / (double)numServers;
974   }
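  // Worked example (numbers are illustrative): with two region servers holding
  // 10 and 20 regions and the master itself holding 2, the raw totals are
  // totalLoad = 32 and numServers = 3; after excluding the master the result is
  // (32 - 2) / (3 - 1) = 15.0.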
975 
976   /**
977    * This is an EXPENSIVE clone.  Cloning though is the safest thing to do.
978    * Can't let out original since it can change and at least the load balancer
979    * wants to iterate this exported list.  We need to synchronize on this instance
980    * since all access to the internal maps is guarded by a lock on it.
981    *
982    * @return A clone of current assignments by table.
983    */
984   protected Map<TableName, Map<ServerName, List<HRegionInfo>>>
985       getAssignmentsByTable() {
986     Map<TableName, Map<ServerName, List<HRegionInfo>>> result =
987       new HashMap<TableName, Map<ServerName,List<HRegionInfo>>>();
988     synchronized (this) {
989       if (!server.getConfiguration().getBoolean(
990             HConstants.HBASE_MASTER_LOADBALANCE_BYTABLE, false)) {
991         Map<ServerName, List<HRegionInfo>> svrToRegions =
992           new HashMap<ServerName, List<HRegionInfo>>(serverHoldings.size());
993         for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
994           svrToRegions.put(e.getKey(), new ArrayList<HRegionInfo>(e.getValue()));
995         }
996         result.put(TableName.valueOf(HConstants.ENSEMBLE_TABLE_NAME), svrToRegions);
997       } else {
998         for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
999           for (HRegionInfo hri: e.getValue()) {
1000             if (hri.isMetaRegion()) continue;
1001             TableName tablename = hri.getTable();
1002             Map<ServerName, List<HRegionInfo>> svrToRegions = result.get(tablename);
1003             if (svrToRegions == null) {
1004               svrToRegions = new HashMap<ServerName, List<HRegionInfo>>(serverHoldings.size());
1005               result.put(tablename, svrToRegions);
1006             }
1007             List<HRegionInfo> regions = svrToRegions.get(e.getKey());
1008             if (regions == null) {
1009               regions = new ArrayList<HRegionInfo>();
1010               svrToRegions.put(e.getKey(), regions);
1011             }
1012             regions.add(hri);
1013           }
1014         }
1015       }
1016     }
1017 
1018     Map<ServerName, ServerLoad>
1019       onlineSvrs = serverManager.getOnlineServers();
1020     // Take care of servers w/o assignments, and remove servers in draining mode
1021     List<ServerName> drainingServers = this.serverManager.getDrainingServersList();
1022     for (Map<ServerName, List<HRegionInfo>> map: result.values()) {
1023       for (ServerName svr: onlineSvrs.keySet()) {
1024         if (!map.containsKey(svr)) {
1025           map.put(svr, new ArrayList<HRegionInfo>());
1026         }
1027       }
1028       map.keySet().removeAll(drainingServers);
1029     }
1030     return result;
1031   }
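  // Illustrative summary of the grouping above: with
  // HConstants.HBASE_MASTER_LOADBALANCE_BYTABLE left at its default (false), all
  // assignments are reported under the single ENSEMBLE_TABLE_NAME pseudo table;
  // when set to true, one server-to-regions map is returned per user table and
  // meta regions are skipped.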
1032 
1033   protected RegionState getRegionState(final HRegionInfo hri) {
1034     return getRegionState(hri.getEncodedName());
1035   }
1036 
1037   /**
1038    * Returns a clone of region assignments per server
1039    * @return a Map of ServerName to a List of HRegionInfo's
1040    */
1041   protected synchronized Map<ServerName, List<HRegionInfo>> getRegionAssignmentsByServer() {
1042     Map<ServerName, List<HRegionInfo>> regionsByServer =
1043         new HashMap<ServerName, List<HRegionInfo>>(serverHoldings.size());
1044     for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
1045       regionsByServer.put(e.getKey(), new ArrayList<HRegionInfo>(e.getValue()));
1046     }
1047     return regionsByServer;
1048   }
1049 
1050   protected synchronized RegionState getRegionState(final String encodedName) {
1051     return regionStates.get(encodedName);
1052   }
1053 
1054   /**
1055    * Get the HRegionInfo from cache, if not there, from the hbase:meta table.
1056    * Be careful. Does RPC. Do not hold a lock or synchronize when you call this method.
1057    * @param  regionName
1058    * @return HRegionInfo for the region
1059    */
1060   @SuppressWarnings("deprecation")
1061   protected HRegionInfo getRegionInfo(final byte [] regionName) {
1062     String encodedName = HRegionInfo.encodeRegionName(regionName);
1063     RegionState regionState = getRegionState(encodedName);
1064     if (regionState != null) {
1065       return regionState.getRegion();
1066     }
1067 
1068     try {
1069       Pair<HRegionInfo, ServerName> p =
1070         MetaTableAccessor.getRegion(server.getConnection(), regionName);
1071       HRegionInfo hri = p == null ? null : p.getFirst();
1072       if (hri != null) {
1073         createRegionState(hri);
1074       }
1075       return hri;
1076     } catch (IOException e) {
1077       server.abort("Aborting because error occurred while reading "
1078         + Bytes.toStringBinary(regionName) + " from hbase:meta", e);
1079       return null;
1080     }
1081   }
1082 
1083   static boolean isOneOfStates(RegionState regionState, State... states) {
1084     State s = regionState != null ? regionState.getState() : null;
1085     for (State state: states) {
1086       if (s == state) return true;
1087     }
1088     return false;
1089   }
1090 
1091   /**
1092    * Update a region state. It will be put in transition if not already there.
1093    */
1094   private RegionState updateRegionState(final HRegionInfo hri,
1095       final RegionState.State state, final ServerName serverName, long openSeqNum) {
1096     if (state == RegionState.State.FAILED_CLOSE || state == RegionState.State.FAILED_OPEN) {
1097       LOG.warn("Failed to open/close " + hri.getShortNameToLog()
1098         + " on " + serverName + ", set to " + state);
1099     }
1100 
1101     String encodedName = hri.getEncodedName();
1102     RegionState regionState = new RegionState(
1103       hri, state, System.currentTimeMillis(), serverName);
1104     RegionState oldState = getRegionState(encodedName);
1105     if (!regionState.equals(oldState)) {
1106       LOG.info("Transition " + oldState + " to " + regionState);
1107       // Persist region state before updating in-memory info, if needed
1108       regionStateStore.updateRegionState(openSeqNum, regionState, oldState);
1109     }
1110 
1111     synchronized (this) {
1112       regionsInTransition.put(encodedName, regionState);
1113       putRegionState(regionState);
1114 
1115       // For these states, region should be properly closed.
1116       // There should be no log splitting issue.
1117       if ((state == State.CLOSED || state == State.MERGED
1118           || state == State.SPLIT) && lastAssignments.containsKey(encodedName)) {
1119         ServerName last = lastAssignments.get(encodedName);
1120         if (last.equals(serverName)) {
1121           lastAssignments.remove(encodedName);
1122         } else {
1123           LOG.warn(encodedName + " moved to " + state + " on "
1124             + serverName + ", expected " + last);
1125         }
1126       }
1127 
1128       // Once a region is opened, record its last assignment right away.
1129       if (serverName != null && state == State.OPEN) {
1130         ServerName last = lastAssignments.get(encodedName);
1131         if (!serverName.equals(last)) {
1132           lastAssignments.put(encodedName, serverName);
1133           if (last != null && isServerDeadAndNotProcessed(last)) {
1134             LOG.warn(encodedName + " moved to " + serverName
1135               + ", while its previous host " + last
1136               + " is dead but not processed yet");
1137           }
1138         }
1139       }
1140 
1141       // notify the change
1142       this.notifyAll();
1143     }
1144     return regionState;
1145   }
1146 }