
1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.master;
19  
20  import java.io.IOException;
21  import java.util.ArrayList;
22  import java.util.Collection;
23  import java.util.Collections;
24  import java.util.Comparator;
25  import java.util.HashMap;
26  import java.util.HashSet;
27  import java.util.Iterator;
28  import java.util.LinkedHashMap;
29  import java.util.LinkedList;
30  import java.util.List;
31  import java.util.Map;
32  import java.util.Set;
33  import java.util.SortedSet;
34  import java.util.TreeMap;
35  import java.util.TreeSet;
36
37  import com.google.common.annotations.VisibleForTesting;
38  import com.google.common.base.Preconditions;
39
40  import org.apache.commons.logging.Log;
41  import org.apache.commons.logging.LogFactory;
42  import org.apache.hadoop.hbase.classification.InterfaceAudience;
43  import org.apache.hadoop.conf.Configuration;
44  import org.apache.hadoop.hbase.HConstants;
45  import org.apache.hadoop.hbase.HRegionInfo;
46  import org.apache.hadoop.hbase.HTableDescriptor;
47  import org.apache.hadoop.hbase.MetaTableAccessor;
48  import org.apache.hadoop.hbase.ServerLoad;
49  import org.apache.hadoop.hbase.ServerName;
50  import org.apache.hadoop.hbase.TableName;
51  import org.apache.hadoop.hbase.client.RegionReplicaUtil;
52  import org.apache.hadoop.hbase.master.RegionState.State;
53  import org.apache.hadoop.hbase.client.TableState;
54  import org.apache.hadoop.hbase.util.Bytes;
55  import org.apache.hadoop.hbase.util.FSUtils;
56  import org.apache.hadoop.hbase.util.Pair;
57
58  /**
59   * Region state accountant. It holds the states of all regions in memory.
60   * In a normal scenario, it should match the meta table and the true region states.
61   *
62   * This map is used by AssignmentManager to track region states.
63   */
64  @InterfaceAudience.Private
65  public class RegionStates {
66    private static final Log LOG = LogFactory.getLog(RegionStates.class);
67
68    public final static RegionStateStampComparator REGION_STATE_COMPARATOR =
69      new RegionStateStampComparator();
70
71    // This comparator sorts the RegionStates by time stamp then Region name.
72    // Comparing by timestamp alone can lead us to discard different RegionStates that happen
73    // to share a timestamp.
74    private static class RegionStateStampComparator implements Comparator<RegionState> {
75      @Override
76      public int compare(RegionState l, RegionState r) {
77        return Long.compare(l.getStamp(), r.getStamp()) == 0 ?
78            Bytes.compareTo(l.getRegion().getRegionName(), r.getRegion().getRegionName()) :
79            Long.compare(l.getStamp(), r.getStamp());
80      }
81    }
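  // Illustrative note, not part of the original source: the region-name tiebreak
  // matters because a TreeSet built with this comparator (see
  // getRegionsInTransitionOrderedByTimestamp below) treats compare() == 0 as
  // "same element". Without the tiebreak, two distinct regions that happened to
  // share a stamp would collapse into one entry, e.g. (hypothetical states):
  //
  //   TreeSet<RegionState> rit = new TreeSet<RegionState>(REGION_STATE_COMPARATOR);
  //   rit.add(stateA);  // stamp 1000, region name "aaa..."
  //   rit.add(stateB);  // stamp 1000, region name "bbb..." -- kept only thanks
  //                     // to the fallback comparison on the region name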
82
83    /**
84     * Regions currently in transition.
85     */
86    final HashMap<String, RegionState> regionsInTransition =
87      new HashMap<String, RegionState>();
88
89    /**
90     * Region encoded name to state map.
91     * All the regions should be in this map.
92     */
93    private final Map<String, RegionState> regionStates =
94      new HashMap<String, RegionState>();
95
96    /**
97     * Holds mapping of table -> region state
98     */
99    private final Map<TableName, Map<String, RegionState>> regionStatesTableIndex =
100       new HashMap<TableName, Map<String, RegionState>>();
101
102   /**
103    * Server to regions assignment map.
104    * Contains the set of regions currently assigned to a given server.
105    */
106   private final Map<ServerName, Set<HRegionInfo>> serverHoldings =
107     new HashMap<ServerName, Set<HRegionInfo>>();
108
109   /**
110    * Maintains the mapping from the default region to the replica regions.
111    */
112   private final Map<HRegionInfo, Set<HRegionInfo>> defaultReplicaToOtherReplicas =
113     new HashMap<HRegionInfo, Set<HRegionInfo>>();
114
115   /**
116    * Region to server assignment map.
117    * Contains the server a given region is currently assigned to.
118    */
119   private final TreeMap<HRegionInfo, ServerName> regionAssignments =
120     new TreeMap<HRegionInfo, ServerName>();
121
122   /**
123    * Encoded region name to server assignment map for re-assignment
124    * purposes. Contains the server a given region was last known to be assigned
125    * to; that server has not completed log splitting, so the region is not assignable.
126    * If a region is currently assigned, the server info in this
127    * map should be the same as that in regionAssignments.
128    * However, the info in regionAssignments is cleared when the region
129    * goes offline, while the info in lastAssignments is cleared when
130    * the region is closed or the server is dead and processed.
131    */
132   private final HashMap<String, ServerName> lastAssignments =
133     new HashMap<String, ServerName>();
134
135   /**
136    * Encoded region name to server assignment map, used to
137    * clean up serverHoldings when a region comes online
138    * on a new server. When the region goes offline from the previous
139    * server, we clean up regionAssignments so that it has the
140    * latest assignment map, but we don't clean up serverHoldings
141    * to match the meta. We need this map to find the old server
142    * whose serverHoldings needs cleanup, given a moved region.
143    */
144   private final HashMap<String, ServerName> oldAssignments =
145     new HashMap<String, ServerName>();
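  // Reading aid, not part of the original source: how the three assignment maps
  // relate. regionAssignments holds the current location and is cleared when a
  // region goes offline; lastAssignments holds the last known location for
  // log-splitting purposes and is cleared when the region is closed or the dead
  // server is processed; oldAssignments remembers the previous location of a
  // moved region so regionOnline() can prune the old server's serverHoldings.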
146
147   /**
148    * Maps a host:port pair string to the latest start code
149    * of a region server which is known to be dead. It is dead
150    * to us, but the server manager may not know it yet.
151    */
152   private final HashMap<String, Long> deadServers =
153     new HashMap<String, Long>();
154
155   /**
156    * Maps a dead server to the time when its log split is done.
157    * Since log splitting is not ordered, we have to remember
158    * all processed instances. The map is cleaned up based
159    * on a configured time. By default, we assume a dead
160    * server should be done with log splitting within two hours.
161    */
162   private final HashMap<ServerName, Long> processedServers =
163     new HashMap<ServerName, Long>();
164   private long lastProcessedServerCleanTime;
165
166   private final TableStateManager tableStateManager;
167   private final RegionStateStore regionStateStore;
168   private final ServerManager serverManager;
169   private final MasterServices server;
170
171   // The maximum time to keep log split info in the region states map
172   static final String LOG_SPLIT_TIME = "hbase.master.maximum.logsplit.keeptime";
173   static final long DEFAULT_LOG_SPLIT_TIME = 7200000L; // 2 hours
174
175   RegionStates(final MasterServices master, final TableStateManager tableStateManager,
176       final ServerManager serverManager, final RegionStateStore regionStateStore) {
177     this.tableStateManager = tableStateManager;
178     this.regionStateStore = regionStateStore;
179     this.serverManager = serverManager;
180     this.server = master;
181   }
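  // Illustrative sketch, not part of the original class: the typical sequence an
  // AssignmentManager-style caller drives against this accountant. The hri, sn
  // and openSeqNum variables are hypothetical.
  //
  //   regionStates.createRegionState(hri);                          // starts OFFLINE
  //   regionStates.updateRegionState(hri, State.PENDING_OPEN, sn);
  //   regionStates.regionOnline(hri, sn, openSeqNum);               // out of transition
  //   ...
  //   regionStates.regionOffline(hri);                              // back to OFFLINE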
182
183   /**
184    * @return a copy of the region assignment map
185    */
186   public synchronized Map<HRegionInfo, ServerName> getRegionAssignments() {
187     return new TreeMap<HRegionInfo, ServerName>(regionAssignments);
188   }
189
190   /**
191    * Return the replicas (including the default replica) for the regions, grouped by ServerName
192    * @param regions the regions to look up
193    * @return the groupings as a map of server to the list of replicas it hosts
194    */
195   synchronized Map<ServerName, List<HRegionInfo>> getRegionAssignments(
196     Collection<HRegionInfo> regions) {
197     Map<ServerName, List<HRegionInfo>> map = new HashMap<ServerName, List<HRegionInfo>>();
198     for (HRegionInfo region : regions) {
199       HRegionInfo defaultReplica = RegionReplicaUtil.getRegionInfoForDefaultReplica(region);
200       Set<HRegionInfo> allReplicas = defaultReplicaToOtherReplicas.get(defaultReplica);
201       if (allReplicas != null) {
202         for (HRegionInfo hri : allReplicas) {
203           ServerName server = regionAssignments.get(hri);
204           if (server != null) {
205             List<HRegionInfo> regionsOnServer = map.get(server);
206             if (regionsOnServer == null) {
207               regionsOnServer = new ArrayList<HRegionInfo>(1);
208               map.put(server, regionsOnServer);
209             }
210             regionsOnServer.add(hri);
211           }
212         }
213       }
214     }
215     return map;
216   }
217
218   public synchronized ServerName getRegionServerOfRegion(HRegionInfo hri) {
219     return regionAssignments.get(hri);
220   }
221
222   /**
223    * Get regions in transition and their states
224    */
225   public synchronized Set<RegionState> getRegionsInTransition() {
226     return new HashSet<RegionState>(regionsInTransition.values());
227   }
228
229   public synchronized SortedSet<RegionState> getRegionsInTransitionOrderedByTimestamp() {
230     final TreeSet<RegionState> rit = new TreeSet<RegionState>(REGION_STATE_COMPARATOR);
231     for (RegionState rs: regionsInTransition.values()) {
232       rit.add(rs);
233     }
234     return rit;
235   }
236
237   /**
238    * @return True if the specified region is in transition.
239    */
240   public synchronized boolean isRegionInTransition(final HRegionInfo hri) {
241     return regionsInTransition.containsKey(hri.getEncodedName());
242   }
243
244   /**
245    * @return True if the specified region is in transition.
246    */
247   public synchronized boolean isRegionInTransition(final String encodedName) {
248     return regionsInTransition.containsKey(encodedName);
249   }
250
251   /**
252    * @return True if any region is in transition.
253    */
254   public synchronized boolean isRegionsInTransition() {
255     return !regionsInTransition.isEmpty();
256   }
257
258   /**
259    * @return True if hbase:meta table region is in transition.
260    */
261   public synchronized boolean isMetaRegionInTransition() {
262     for (RegionState state : regionsInTransition.values()) {
263       if (state.getRegion().isMetaRegion()) return true;
264     }
265     return false;
266   }
267
268   /**
269    * @return True if the specified region is assigned, and not in transition.
270    */
271   public synchronized boolean isRegionOnline(final HRegionInfo hri) {
272     return !isRegionInTransition(hri) && regionAssignments.containsKey(hri);
273   }
274
275   /**
276    * @return True if the specified region is offline/closed, but not in transition.
277    * If the region is not in the map, it is offline to us too.
278    */
279   public synchronized boolean isRegionOffline(final HRegionInfo hri) {
280     return getRegionState(hri) == null || (!isRegionInTransition(hri)
281       && isRegionInState(hri, State.OFFLINE, State.CLOSED));
282   }
283
284   /**
285    * @return True if specified region is in one of the specified states.
286    */
287   public boolean isRegionInState(
288       final HRegionInfo hri, final State... states) {
289     return isRegionInState(hri.getEncodedName(), states);
290   }
291
292   /**
293    * @return True if specified region is in one of the specified states.
294    */
295   public boolean isRegionInState(
296       final String encodedName, final State... states) {
297     RegionState regionState = getRegionState(encodedName);
298     return isOneOfStates(regionState, states);
299   }
300
301   /**
302    * Wait for the state map to be updated by assignment manager.
303    */
304   public synchronized void waitForUpdate(
305       final long timeout) throws InterruptedException {
306     this.wait(timeout);
307   }
308
309   /**
310    * Get region transition state
311    */
312   public RegionState getRegionTransitionState(final HRegionInfo hri) {
313     return getRegionTransitionState(hri.getEncodedName());
314   }
315
316   /**
317    * Get region transition state
318    */
319   public synchronized RegionState
320       getRegionTransitionState(final String encodedName) {
321     return regionsInTransition.get(encodedName);
322   }
323
324   /**
325    * Add a list of regions to RegionStates. If a region is split
326    * and offline, its state will be SPLIT. Otherwise, its state will
327    * be OFFLINE. Region already in RegionStates will be skipped.
328    */
329   public void createRegionStates(
330       final List<HRegionInfo> hris) {
331     for (HRegionInfo hri: hris) {
332       createRegionState(hri);
333     }
334   }
335
336   /**
337    * Add a region to RegionStates. If the region is split
338    * and offline, its state will be SPLIT. Otherwise, its state will
339    * be OFFLINE. If it is already in RegionStates, this call has
340    * no effect, and the original state is returned.
341    */
342   public RegionState createRegionState(final HRegionInfo hri) {
343     return createRegionState(hri, null, null, null);
344   }
345
346   /**
347    * Add a region to RegionStates with the specified state.
348    * If the region is already in RegionStates, this call has
349    * no effect, and the original state is returned.
350    *
351    * @param hri the region info to create a state for
352    * @param newState the state the region is set to
353    * @param serverName the server the region is transitioning on
354    * @param lastHost the last server that hosts the region
355    * @return the current state
356    */
357   public synchronized RegionState createRegionState(final HRegionInfo hri,
358       State newState, ServerName serverName, ServerName lastHost) {
359     if (newState == null || (newState == State.OPEN && serverName == null)) {
360       newState =  State.OFFLINE;
361     }
362     if (hri.isOffline() && hri.isSplit()) {
363       newState = State.SPLIT;
364       serverName = null;
365     }
366     String encodedName = hri.getEncodedName();
367     RegionState regionState = regionStates.get(encodedName);
368     if (regionState != null) {
369       LOG.warn("Tried to create a state for a region already in RegionStates, "
370         + "used existing: " + regionState + ", ignored new: " + newState);
371     } else {
372       regionState = new RegionState(hri, newState, serverName);
373       putRegionState(regionState);
374       if (newState == State.OPEN) {
375         if (!serverName.equals(lastHost)) {
376           LOG.warn("Open region's last host " + lastHost
377             + " should be the same as the current one " + serverName
378             + ", ignored the last and used the current one");
379           lastHost = serverName;
380         }
381         lastAssignments.put(encodedName, lastHost);
382         regionAssignments.put(hri, lastHost);
383       } else if (!isOneOfStates(regionState, State.MERGED, State.SPLIT, State.OFFLINE)) {
384         regionsInTransition.put(encodedName, regionState);
385       }
386       if (lastHost != null && newState != State.SPLIT) {
387         addToServerHoldings(lastHost, hri);
388         if (newState != State.OPEN) {
389           oldAssignments.put(encodedName, lastHost);
390         }
391       }
392     }
393     return regionState;
394   }
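  // Reading aid, not part of the original source: the state actually recorded can
  // differ from the requested newState -- an offline-and-split HRegionInfo is
  // forced to SPLIT, and OPEN without a server name degrades to OFFLINE, as
  // handled at the top of createRegionState() above.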
395
396   private RegionState putRegionState(RegionState regionState) {
397     HRegionInfo hri = regionState.getRegion();
398     String encodedName = hri.getEncodedName();
399     TableName table = hri.getTable();
400     RegionState oldState = regionStates.put(encodedName, regionState);
401     Map<String, RegionState> map = regionStatesTableIndex.get(table);
402     if (map == null) {
403       map = new HashMap<String, RegionState>();
404       regionStatesTableIndex.put(table, map);
405     }
406     map.put(encodedName, regionState);
407     return oldState;
408   }
409
410   /**
411    * Update a region state. It will be put in transition if not already there.
412    */
413   public RegionState updateRegionState(
414       final HRegionInfo hri, final State state) {
415     RegionState regionState = getRegionState(hri.getEncodedName());
416     return updateRegionState(hri, state,
417       regionState == null ? null : regionState.getServerName());
418   }
419
420   /**
421    * Update a region state. It will be put in transition if not already there.
422    */
423   public RegionState updateRegionState(
424       final HRegionInfo hri, final State state, final ServerName serverName) {
425     return updateRegionState(hri, state, serverName, HConstants.NO_SEQNUM);
426   }
427
428   public void regionOnline(final HRegionInfo hri, final ServerName serverName) {
429     regionOnline(hri, serverName, HConstants.NO_SEQNUM);
430   }
431
432   /**
433    * A region is online; it won't be in transition any more.
434    * We can't confirm it is really online on the specified region server
435    * because it hasn't been put in the region server's online region list yet.
436    */
437   public void regionOnline(final HRegionInfo hri, final ServerName serverName, long openSeqNum) {
438     String encodedName = hri.getEncodedName();
439     if (!serverManager.isServerOnline(serverName)) {
440       // This is possible if the region server dies before master gets a
441       // chance to handle ZK event in time. At this time, if the dead server
442       // is already processed by SSH, we should ignore this event.
443       // If not processed yet, ignore and let SSH deal with it.
444       LOG.warn("Ignored, " + encodedName + " was opened on a dead server: " + serverName);
445       return;
446     }
447     updateRegionState(hri, State.OPEN, serverName, openSeqNum);
448
449     synchronized (this) {
450       regionsInTransition.remove(encodedName);
451       ServerName oldServerName = regionAssignments.put(hri, serverName);
452       if (!serverName.equals(oldServerName)) {
453         if (LOG.isDebugEnabled()) {
454           LOG.debug("Onlined " + hri.getShortNameToLog() + " on " + serverName);
455         }
456         addToServerHoldings(serverName, hri);
457         addToReplicaMapping(hri);
458         if (oldServerName == null) {
459           oldServerName = oldAssignments.remove(encodedName);
460         }
461         if (oldServerName != null
462             && !oldServerName.equals(serverName)
463             && serverHoldings.containsKey(oldServerName)) {
464           LOG.info("Offlined " + hri.getShortNameToLog() + " from " + oldServerName);
465           removeFromServerHoldings(oldServerName, hri);
466         }
467       }
468     }
469   }
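  // Reading aid, not part of the original source: when a region moves, the
  // previous host may already have been dropped from regionAssignments by
  // regionOffline(), so the entry recorded in oldAssignments at that point is
  // what lets regionOnline() above still find and prune the old server's
  // serverHoldings entry.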
470
471   private void addToServerHoldings(ServerName serverName, HRegionInfo hri) {
472     Set<HRegionInfo> regions = serverHoldings.get(serverName);
473     if (regions == null) {
474       regions = new HashSet<HRegionInfo>();
475       serverHoldings.put(serverName, regions);
476     }
477     regions.add(hri);
478   }
479
480   private void addToReplicaMapping(HRegionInfo hri) {
481     HRegionInfo defaultReplica = RegionReplicaUtil.getRegionInfoForDefaultReplica(hri);
482     Set<HRegionInfo> replicas =
483         defaultReplicaToOtherReplicas.get(defaultReplica);
484     if (replicas == null) {
485       replicas = new HashSet<HRegionInfo>();
486       defaultReplicaToOtherReplicas.put(defaultReplica, replicas);
487     }
488     replicas.add(hri);
489   }
490
491   private void removeFromServerHoldings(ServerName serverName, HRegionInfo hri) {
492     Set<HRegionInfo> oldRegions = serverHoldings.get(serverName);
493     oldRegions.remove(hri);
494     if (oldRegions.isEmpty()) {
495       serverHoldings.remove(serverName);
496     }
497   }
498
499   private void removeFromReplicaMapping(HRegionInfo hri) {
500     HRegionInfo defaultReplica = RegionReplicaUtil.getRegionInfoForDefaultReplica(hri);
501     Set<HRegionInfo> replicas = defaultReplicaToOtherReplicas.get(defaultReplica);
502     if (replicas != null) {
503       replicas.remove(hri);
504       if (replicas.isEmpty()) {
505         defaultReplicaToOtherReplicas.remove(defaultReplica);
506       }
507     }
508   }
509
510   /**
511    * A dead server's WALs have been split, so all the regions that
512    * used to be open on it can be safely assigned now. Mark them assignable.
513    */
514   public synchronized void logSplit(final ServerName serverName) {
515     for (Iterator<Map.Entry<String, ServerName>> it
516         = lastAssignments.entrySet().iterator(); it.hasNext();) {
517       Map.Entry<String, ServerName> e = it.next();
518       if (e.getValue().equals(serverName)) {
519         it.remove();
520       }
521     }
522     long now = System.currentTimeMillis();
523     if (LOG.isDebugEnabled()) {
524       LOG.debug("Adding to log splitting servers " + serverName);
525     }
526     processedServers.put(serverName, Long.valueOf(now));
527     Configuration conf = server.getConfiguration();
528     long obsoleteTime = conf.getLong(LOG_SPLIT_TIME, DEFAULT_LOG_SPLIT_TIME);
529     // Doesn't have to be very accurate about the clean up time
530     if (now > lastProcessedServerCleanTime + obsoleteTime) {
531       lastProcessedServerCleanTime = now;
532       long cutoff = now - obsoleteTime;
533       for (Iterator<Map.Entry<ServerName, Long>> it
534           = processedServers.entrySet().iterator(); it.hasNext();) {
535         Map.Entry<ServerName, Long> e = it.next();
536         if (e.getValue().longValue() < cutoff) {
537           if (LOG.isDebugEnabled()) {
538             LOG.debug("Removed from log splitting servers " + e.getKey());
539           }
540           it.remove();
541         }
542       }
543     }
544   }
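  // Operational note, an assumption about typical deployment rather than part of
  // the original source: the retention of processedServers entries is governed by
  // the LOG_SPLIT_TIME key defined above and could be tuned in hbase-site.xml, e.g.
  //
  //   <property>
  //     <name>hbase.master.maximum.logsplit.keeptime</name>
  //     <value>3600000</value> <!-- keep log-split records for one hour -->
  //   </property>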
545
546   /**
547    * Log split is done for a given region, so it is assignable now.
548    */
549   public void logSplit(final HRegionInfo region) {
550     clearLastAssignment(region);
551   }
552
553   public synchronized void clearLastAssignment(final HRegionInfo region) {
554     lastAssignments.remove(region.getEncodedName());
555   }
556
557   /**
558    * A region is offline, won't be in transition any more.
559    */
560   public void regionOffline(final HRegionInfo hri) {
561     regionOffline(hri, null);
562   }
563
564   /**
565    * A region is offline; it won't be in transition any more. Its state
566    * should be the specified expected state, which can only be
567    * Split/Merged/Offline/null(=Offline)/SplittingNew/MergingNew.
568    */
569   public void regionOffline(
570       final HRegionInfo hri, final State expectedState) {
571     Preconditions.checkArgument(expectedState == null
572       || RegionState.isUnassignable(expectedState),
573         "Offlined region should not be " + expectedState);
574     if (isRegionInState(hri, State.SPLITTING_NEW, State.MERGING_NEW)) {
575       // Remove it from all region maps
576       deleteRegion(hri);
577       return;
578     }
579     State newState =
580       expectedState == null ? State.OFFLINE : expectedState;
581     updateRegionState(hri, newState);
582     String encodedName = hri.getEncodedName();
583     synchronized (this) {
584       regionsInTransition.remove(encodedName);
585       ServerName oldServerName = regionAssignments.remove(hri);
586       if (oldServerName != null && serverHoldings.containsKey(oldServerName)) {
587         if (newState == State.MERGED || newState == State.SPLIT
588             || hri.isMetaRegion() || tableStateManager.isTableState(hri.getTable(),
589               TableState.State.DISABLED, TableState.State.DISABLING)) {
590           // Offline the region only if it's merged/split, or the table is disabled/disabling.
591           // Otherwise, offline it from this server only when it is online on a different server.
592           LOG.info("Offlined " + hri.getShortNameToLog() + " from " + oldServerName);
593           removeFromServerHoldings(oldServerName, hri);
594           removeFromReplicaMapping(hri);
595         } else {
596           // Need to remember it so that we can offline it from this
597           // server when it is online on a different server.
598           oldAssignments.put(encodedName, oldServerName);
599         }
600       }
601     }
602   }
603
604   /**
605    * A server is offline, all regions on it are dead.
606    */
607   public List<HRegionInfo> serverOffline(final ServerName sn) {
608     // Offline all regions on this server not already in transition.
609     List<HRegionInfo> rits = new ArrayList<HRegionInfo>();
610     Set<HRegionInfo> regionsToCleanIfNoMetaEntry = new HashSet<HRegionInfo>();
611     // Offline regions outside the loop and synchronized block to avoid
612     // ConcurrentModificationException, and to avoid a deadlock in case meta is unassigned
613     // but a RegionState update is blocked.
614     Set<HRegionInfo> regionsToOffline = new HashSet<HRegionInfo>();
615     synchronized (this) {
616       Set<HRegionInfo> assignedRegions = serverHoldings.get(sn);
617       if (assignedRegions == null) {
618         assignedRegions = new HashSet<HRegionInfo>();
619       }
620
621       for (HRegionInfo region : assignedRegions) {
622         // Offline open regions, no need to offline if SPLIT/MERGED/OFFLINE
623         if (isRegionOnline(region)) {
624           regionsToOffline.add(region);
625         } else if (isRegionInState(region, State.SPLITTING, State.MERGING)) {
626           LOG.debug("Offline splitting/merging region " + getRegionState(region));
627           regionsToOffline.add(region);
628         }
629       }
630
631       for (RegionState state : regionsInTransition.values()) {
632         HRegionInfo hri = state.getRegion();
633         if (assignedRegions.contains(hri)) {
634           // Region is open on this region server, but in transition.
635           // This region must be moving away from this server, or splitting/merging.
636           // SSH will handle it, either skip assigning, or re-assign.
637           LOG.info("Transitioning " + state + " will be handled by ServerCrashProcedure for " + sn);
638         } else if (sn.equals(state.getServerName())) {
639           // Region is in transition on this region server, and this
640           // region is not open on this server. So the region must be
641           // moving to this server from another one (i.e. opening or
642           // pending open on this server, was open on another one.
643           // Offline state is also kind of pending open if the region is in
644           // transition. The region could be in failed_close state too if we have
645           // tried several times to open it while this region server is not reachable)
646           if (isOneOfStates(state, State.OPENING, State.PENDING_OPEN,
647               State.FAILED_OPEN, State.FAILED_CLOSE, State.OFFLINE)) {
648             LOG.info("Found region in " + state +
649               " to be reassigned by ServerCrashProcedure for " + sn);
650             rits.add(hri);
651           } else if (isOneOfStates(state, State.SPLITTING_NEW)) {
652             regionsToCleanIfNoMetaEntry.add(state.getRegion());
653           } else {
654             LOG.warn("THIS SHOULD NOT HAPPEN: unexpected " + state);
655           }
656         }
657       }
658       this.notifyAll();
659     }
660
661     for (HRegionInfo hri : regionsToOffline) {
662       regionOffline(hri);
663     }
664
665     cleanIfNoMetaEntry(regionsToCleanIfNoMetaEntry);
666     return rits;
667   }
668
669   /**
670    * This method does an RPC to hbase:meta. Do not call this method with a lock/synchronize held.
671    * @param hris The hris to check if empty in hbase:meta and if so, clean them up.
672    */
673   private void cleanIfNoMetaEntry(Set<HRegionInfo> hris) {
674     if (hris.isEmpty()) return;
675     for (HRegionInfo hri: hris) {
676       try {
677         // This is an RPC to the meta table, done without holding a synchronize on
678         // regionstates. No progress will be made if meta is not available at this time.
679         // This is a cleanup task. Not critical.
680         if (MetaTableAccessor.getRegion(server.getConnection(), hri.getEncodedNameAsBytes()) ==
681             null) {
682           regionOffline(hri);
683           FSUtils.deleteRegionDir(server.getConfiguration(), hri);
684         }
685       } catch (IOException e) {
686         LOG.warn("Got exception while deleting " + hri + " directories from file system.", e);
687       }
688     }
689   }
690
691   /**
692    * Gets the online regions of the specified table.
693    * This method looks at the in-memory state.  It does not go to <code>hbase:meta</code>.
694    * Only returns <em>online</em> regions.  If a region on this table has been
695    * closed during a disable, etc., it will not be included in the returned list.
696    * So, the returned list may not necessarily be ALL regions in this table; it's
697    * all the ONLINE regions in the table.
698    * @param tableName
699    * @return Online regions from <code>tableName</code>
700    */
701   public synchronized List<HRegionInfo> getRegionsOfTable(TableName tableName) {
702     List<HRegionInfo> tableRegions = new ArrayList<HRegionInfo>();
703     // boundary needs to have table's name but regionID 0 so that it is sorted
704     // before all table's regions.
705     HRegionInfo boundary = new HRegionInfo(tableName, null, null, false, 0L);
706     for (HRegionInfo hri: regionAssignments.tailMap(boundary).keySet()) {
707       if(!hri.getTable().equals(tableName)) break;
708       tableRegions.add(hri);
709     }
710     return tableRegions;
711   }
712
713   /**
714    * Gets current state of all regions of the table.
715    * This method looks at the in-memory state.  It does not go to <code>hbase:meta</code>.
716    * Method guaranteed to return keys for all states
717    * in {@link org.apache.hadoop.hbase.master.RegionState.State}
718    *
719    * @param tableName the table to look up
720    * @return a map of each region state to the regions of <code>tableName</code> in that state
721    */
722   public synchronized Map<RegionState.State, List<HRegionInfo>>
723   getRegionByStateOfTable(TableName tableName) {
724     Map<RegionState.State, List<HRegionInfo>> tableRegions =
725         new HashMap<State, List<HRegionInfo>>();
726     for (State state : State.values()) {
727       tableRegions.put(state, new ArrayList<HRegionInfo>());
728     }
729     Map<String, RegionState> indexMap = regionStatesTableIndex.get(tableName);
730     if (indexMap == null)
731       return tableRegions;
732     for (RegionState regionState : indexMap.values()) {
733       tableRegions.get(regionState.getState()).add(regionState.getRegion());
734     }
735     return tableRegions;
736   }
737
738   /**
739    * Wait on region to clear regions-in-transition.
740    * <p>
741    * If the region isn't in transition, returns immediately.  Otherwise, method
742    * blocks until the region is out of transition.
743    */
744   public synchronized void waitOnRegionToClearRegionsInTransition(
745       final HRegionInfo hri) throws InterruptedException {
746     if (!isRegionInTransition(hri)) return;
747
748     while(!server.isStopped() && isRegionInTransition(hri)) {
749       RegionState rs = getRegionState(hri);
750       LOG.info("Waiting on " + rs + " to clear regions-in-transition");
751       waitForUpdate(100);
752     }
753
754     if (server.isStopped()) {
755       LOG.info("Giving up wait on region in " +
756         "transition because stoppable.isStopped is set");
757     }
758   }
759
760   /**
761    * A table is deleted. Remove its regions from all internal maps.
762    * We loop through all regions assuming we don't delete tables too much.
763    */
764   public void tableDeleted(final TableName tableName) {
765     Set<HRegionInfo> regionsToDelete = new HashSet<HRegionInfo>();
766     synchronized (this) {
767       for (RegionState state: regionStates.values()) {
768         HRegionInfo region = state.getRegion();
769         if (region.getTable().equals(tableName)) {
770           regionsToDelete.add(region);
771         }
772       }
773     }
774     for (HRegionInfo region: regionsToDelete) {
775       deleteRegion(region);
776     }
777   }
778
779   /**
780    * Get a copy of all regions assigned to a server
781    */
782   public synchronized Set<HRegionInfo> getServerRegions(ServerName serverName) {
783     Set<HRegionInfo> regions = serverHoldings.get(serverName);
784     if (regions == null) return null;
785     return new HashSet<HRegionInfo>(regions);
786   }
787
788   /**
789    * Remove a region from all state maps.
790    */
791   @VisibleForTesting
792   public synchronized void deleteRegion(final HRegionInfo hri) {
793     String encodedName = hri.getEncodedName();
794     regionsInTransition.remove(encodedName);
795     regionStates.remove(encodedName);
796     TableName table = hri.getTable();
797     Map<String, RegionState> indexMap = regionStatesTableIndex.get(table);
798     indexMap.remove(encodedName);
799     if (indexMap.size() == 0)
800       regionStatesTableIndex.remove(table);
801     lastAssignments.remove(encodedName);
802     ServerName sn = regionAssignments.remove(hri);
803     if (sn != null) {
804       Set<HRegionInfo> regions = serverHoldings.get(sn);
805       regions.remove(hri);
806     }
807   }
808
809   /**
810    * Checks if a region was assigned to a server which is not online now.
811    * If so, we should hold off re-assigning this region till SSH has split its WALs.
812    * Once logs are split, the last assignment of this region will be reset,
813    * which means a null last assignment server is ok for re-assigning.
814    *
815    * A region server could be dead but we don't know it yet. We may
816    * falsely think it's online. Therefore, even if a server appears online, we still
817    * need to confirm it is reachable and has the expected start code.
818    */
819   synchronized boolean wasRegionOnDeadServer(final String encodedName) {
820     ServerName server = lastAssignments.get(encodedName);
821     return isServerDeadAndNotProcessed(server);
822   }
823
824   synchronized boolean isServerDeadAndNotProcessed(ServerName server) {
825     if (server == null) return false;
826     if (serverManager.isServerOnline(server)) {
827       String hostAndPort = server.getHostAndPort();
828       long startCode = server.getStartcode();
829       Long deadCode = deadServers.get(hostAndPort);
830       if (deadCode == null || startCode > deadCode.longValue()) {
831         if (serverManager.isServerReachable(server)) {
832           return false;
833         }
834         // The size of deadServers won't grow unbounded.
835         deadServers.put(hostAndPort, Long.valueOf(startCode));
836       }
837       // Watch out! If the server is not dead, the region could
838       // remain unassigned. That's why ServerManager#isServerReachable
839       // should use some retry.
840       //
841       // We cache this info since it is very unlikely for that
842       // instance to come back up later on. We don't want to expire
843       // the server since we prefer to let it die naturally.
844       LOG.warn("Couldn't reach online server " + server);
845     }
846     // Now, we know it's dead. Check if it's processed
847     return !processedServers.containsKey(server);
848   }
849
850   /**
851    * Get the last region server a region was on, for the purpose of re-assignment,
852    * i.e. should the re-assignment be held back till log split is done?
853    */
854   synchronized ServerName getLastRegionServerOfRegion(final String encodedName) {
855     return lastAssignments.get(encodedName);
856   }
857
858   synchronized void setLastRegionServerOfRegions(
859       final ServerName serverName, final List<HRegionInfo> regionInfos) {
860     for (HRegionInfo hri: regionInfos) {
861       setLastRegionServerOfRegion(serverName, hri.getEncodedName());
862     }
863   }
864
865   synchronized void setLastRegionServerOfRegion(
866       final ServerName serverName, final String encodedName) {
867     lastAssignments.put(encodedName, serverName);
868   }
869
870   synchronized boolean isRegionOnServer(
871       final HRegionInfo hri, final ServerName serverName) {
872     Set<HRegionInfo> regions = serverHoldings.get(serverName);
873     return regions == null ? false : regions.contains(hri);
874   }
875
876   void splitRegion(HRegionInfo p,
877       HRegionInfo a, HRegionInfo b, ServerName sn) throws IOException {
878
879     regionStateStore.splitRegion(p, a, b, sn, getRegionReplication(p));
880     synchronized (this) {
881       // After PONR, split is considered to be done.
882       // Update server holdings to be aligned with the meta.
883       Set<HRegionInfo> regions = serverHoldings.get(sn);
884       if (regions == null) {
885         throw new IllegalStateException(sn + " should host some regions");
886       }
887       regions.remove(p);
888       regions.add(a);
889       regions.add(b);
890     }
891   }
892
893   void mergeRegions(HRegionInfo p,
894       HRegionInfo a, HRegionInfo b, ServerName sn) throws IOException {
895     regionStateStore.mergeRegions(p, a, b, sn, getRegionReplication(a));
896     synchronized (this) {
897       // After PONR, merge is considered to be done.
898       // Update server holdings to be aligned with the meta.
899       Set<HRegionInfo> regions = serverHoldings.get(sn);
900       if (regions == null) {
901         throw new IllegalStateException(sn + " should host some regions");
902       }
903       regions.remove(a);
904       regions.remove(b);
905       regions.add(p);
906     }
907   }
908
909   private int getRegionReplication(HRegionInfo r) throws IOException {
910     if (tableStateManager != null) {
911       HTableDescriptor htd = server.getTableDescriptors().get(r.getTable());
912       if (htd != null) {
913         return htd.getRegionReplication();
914       }
915     }
916     return 1;
917   }
918
919   /**
920    * At cluster clean re/start, mark all user regions closed except those of tables
921    * that are excluded, such as disabled/disabling/enabling tables. All user regions
922    * and their previous locations are returned.
923    */
924   synchronized Map<HRegionInfo, ServerName> closeAllUserRegions(Set<TableName> excludedTables) {
925     boolean noExcludeTables = excludedTables == null || excludedTables.isEmpty();
926     Set<HRegionInfo> toBeClosed = new HashSet<HRegionInfo>(regionStates.size());
927     for(RegionState state: regionStates.values()) {
928       HRegionInfo hri = state.getRegion();
929       if (state.isSplit() || hri.isSplit()) {
930         continue;
931       }
932       TableName tableName = hri.getTable();
933       if (!TableName.META_TABLE_NAME.equals(tableName)
934           && (noExcludeTables || !excludedTables.contains(tableName))) {
935         toBeClosed.add(hri);
936       }
937     }
938     Map<HRegionInfo, ServerName> allUserRegions =
939       new HashMap<HRegionInfo, ServerName>(toBeClosed.size());
940     for (HRegionInfo hri: toBeClosed) {
941       RegionState regionState = updateRegionState(hri, State.CLOSED);
942       allUserRegions.put(hri, regionState.getServerName());
943     }
944     return allUserRegions;
945   }
946
947   /**
948    * Compute the average load across all region servers.
949    * Currently, this uses a very naive computation - just uses the number of
950    * regions being served, ignoring stats about number of requests.
951    * @return the average load
952    */
953   protected synchronized double getAverageLoad() {
954     int numServers = 0, totalLoad = 0;
955     for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
956       Set<HRegionInfo> regions = e.getValue();
957       ServerName serverName = e.getKey();
958       int regionCount = regions.size();
959       if (serverManager.isServerOnline(serverName)) {
960         totalLoad += regionCount;
961         numServers++;
962       }
963     }
964     if (numServers > 1) {
965       // The master region server holds only a couple regions.
966       // Don't consider this server in calculating the average load
967       // if there are other region servers to avoid possible confusion.
968       Set<HRegionInfo> hris = serverHoldings.get(server.getServerName());
969       if (hris != null) {
970         totalLoad -= hris.size();
971         numServers--;
972       }
973     }
974     return numServers == 0 ? 0.0 :
975       (double)totalLoad / (double)numServers;
976   }
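  // Worked example, not part of the original source: with three online region
  // servers holding 10, 20 and 30 regions (none of them the master), the average
  // load is (10 + 20 + 30) / 3 = 20. If the master itself hosts regions and other
  // servers exist, both its region count and its server slot are excluded by the
  // block above before the division.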
977
978   /**
979    * This is an EXPENSIVE clone.  Cloning though is the safest thing to do.
980    * Can't hand out the original since it can change, and at least the load balancer
981    * wants to iterate this exported list.  We need to synchronize on regions
982    * since all access to this.servers is under a lock on this.regions.
983    *
984    * @return A clone of current assignments by table.
985    */
986   protected Map<TableName, Map<ServerName, List<HRegionInfo>>>
987       getAssignmentsByTable() {
988     Map<TableName, Map<ServerName, List<HRegionInfo>>> result =
989       new HashMap<TableName, Map<ServerName,List<HRegionInfo>>>();
990     synchronized (this) {
991       if (!server.getConfiguration().getBoolean(
992             HConstants.HBASE_MASTER_LOADBALANCE_BYTABLE, false)) {
993         Map<ServerName, List<HRegionInfo>> svrToRegions =
994           new HashMap<ServerName, List<HRegionInfo>>(serverHoldings.size());
995         for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
996           svrToRegions.put(e.getKey(), new ArrayList<HRegionInfo>(e.getValue()));
997         }
998         result.put(TableName.valueOf(HConstants.ENSEMBLE_TABLE_NAME), svrToRegions);
999       } else {
1000         for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
1001           for (HRegionInfo hri: e.getValue()) {
1002             if (hri.isMetaRegion()) continue;
1003             TableName tablename = hri.getTable();
1004             Map<ServerName, List<HRegionInfo>> svrToRegions = result.get(tablename);
1005             if (svrToRegions == null) {
1006               svrToRegions = new HashMap<ServerName, List<HRegionInfo>>(serverHoldings.size());
1007               result.put(tablename, svrToRegions);
1008             }
1009             List<HRegionInfo> regions = svrToRegions.get(e.getKey());
1010             if (regions == null) {
1011               regions = new ArrayList<HRegionInfo>();
1012               svrToRegions.put(e.getKey(), regions);
1013             }
1014             regions.add(hri);
1015           }
1016         }
1017       }
1018     }
1019
1020     Map<ServerName, ServerLoad>
1021       onlineSvrs = serverManager.getOnlineServers();
1022     // Take care of servers w/o assignments, and remove servers in draining mode
1023     List<ServerName> drainingServers = this.serverManager.getDrainingServersList();
1024     for (Map<ServerName, List<HRegionInfo>> map: result.values()) {
1025       for (ServerName svr: onlineSvrs.keySet()) {
1026         if (!map.containsKey(svr)) {
1027           map.put(svr, new ArrayList<HRegionInfo>());
1028         }
1029       }
1030       map.keySet().removeAll(drainingServers);
1031     }
1032     return result;
1033   }
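  // Reading aid, not part of the original source: with
  // HConstants.HBASE_MASTER_LOADBALANCE_BYTABLE left at its default of false, all
  // assignments are grouped under the single ensemble pseudo-table, so the load
  // balancer sees one global server-to-regions map; when set to true, a separate
  // map is built per user table (meta regions are skipped in that branch). In
  // either case, draining servers are removed and servers without assignments are
  // added with empty lists.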
1034
1035   protected RegionState getRegionState(final HRegionInfo hri) {
1036     return getRegionState(hri.getEncodedName());
1037   }
1038
1039   /**
1040    * Returns a clone of region assignments per server
1041    * @return a Map of ServerName to a List of HRegionInfo's
1042    */
1043   protected synchronized Map<ServerName, List<HRegionInfo>> getRegionAssignmentsByServer() {
1044     Map<ServerName, List<HRegionInfo>> regionsByServer =
1045         new HashMap<ServerName, List<HRegionInfo>>(serverHoldings.size());
1046     for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
1047       regionsByServer.put(e.getKey(), new ArrayList<HRegionInfo>(e.getValue()));
1048     }
1049     return regionsByServer;
1050   }
1051
1052   public synchronized RegionState getRegionState(final String encodedName) {
1053     return regionStates.get(encodedName);
1054   }
1055
1056   /**
1057    * Get the HRegionInfo from cache; if not there, from the hbase:meta table.
1058    * Be careful. Does RPC. Do not hold a lock or synchronize when you call this method.
1059    * @param  regionName
1060    * @return HRegionInfo for the region
1061    */
1062   @SuppressWarnings("deprecation")
1063   protected HRegionInfo getRegionInfo(final byte [] regionName) {
1064     String encodedName = HRegionInfo.encodeRegionName(regionName);
1065     RegionState regionState = getRegionState(encodedName);
1066     if (regionState != null) {
1067       return regionState.getRegion();
1068     }
1069
1070     try {
1071       Pair<HRegionInfo, ServerName> p =
1072         MetaTableAccessor.getRegion(server.getConnection(), regionName);
1073       HRegionInfo hri = p == null ? null : p.getFirst();
1074       if (hri != null) {
1075         createRegionState(hri);
1076       }
1077       return hri;
1078     } catch (IOException e) {
1079       server.abort("Aborting because error occurred while reading "
1080         + Bytes.toStringBinary(regionName) + " from hbase:meta", e);
1081       return null;
1082     }
1083   }
1084
1085   static boolean isOneOfStates(RegionState regionState, State... states) {
1086     State s = regionState != null ? regionState.getState() : null;
1087     for (State state: states) {
1088       if (s == state) return true;
1089     }
1090     return false;
1091   }
1092
1093   /**
1094    * Update a region state. It will be put in transition if not already there.
1095    */
1096   private RegionState updateRegionState(final HRegionInfo hri,
1097       final RegionState.State state, final ServerName serverName, long openSeqNum) {
1098     if (state == RegionState.State.FAILED_CLOSE || state == RegionState.State.FAILED_OPEN) {
1099       LOG.warn("Failed to open/close " + hri.getShortNameToLog()
1100         + " on " + serverName + ", set to " + state);
1101     }
1102
1103     String encodedName = hri.getEncodedName();
1104     RegionState regionState = new RegionState(
1105       hri, state, System.currentTimeMillis(), serverName);
1106     RegionState oldState = getRegionState(encodedName);
1107     if (!regionState.equals(oldState)) {
1108       LOG.info("Transition " + oldState + " to " + regionState);
1109       // Persist region state before updating in-memory info, if needed
1110       regionStateStore.updateRegionState(openSeqNum, regionState, oldState);
1111     }
1112
1113     synchronized (this) {
1114       regionsInTransition.put(encodedName, regionState);
1115       putRegionState(regionState);
1116
1117       // For these states, region should be properly closed.
1118       // There should be no log splitting issue.
1119       if ((state == State.CLOSED || state == State.MERGED
1120           || state == State.SPLIT) && lastAssignments.containsKey(encodedName)) {
1121         ServerName last = lastAssignments.get(encodedName);
1122         if (last.equals(serverName)) {
1123           lastAssignments.remove(encodedName);
1124         } else {
1125           LOG.warn(encodedName + " moved to " + state + " on "
1126             + serverName + ", expected " + last);
1127         }
1128       }
1129
1130       // Once a region is opened, record its last assignment right away.
1131       if (serverName != null && state == State.OPEN) {
1132         ServerName last = lastAssignments.get(encodedName);
1133         if (!serverName.equals(last)) {
1134           lastAssignments.put(encodedName, serverName);
1135           if (last != null && isServerDeadAndNotProcessed(last)) {
1136             LOG.warn(encodedName + " moved to " + serverName
1137               + ", while it's previous host " + last
1138               + " is dead but not processed yet");
1139           }
1140         }
1141       }
1142
1143       // notify the change
1144       this.notifyAll();
1145     }
1146     return regionState;
1147   }
1148 }