1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.master;
19  
20  import java.io.IOException;
21  import java.util.ArrayList;
22  import java.util.Collection;
23  import java.util.Collections;
24  import java.util.Comparator;
25  import java.util.HashMap;
26  import java.util.HashSet;
27  import java.util.Iterator;
28  import java.util.LinkedHashMap;
29  import java.util.LinkedList;
30  import java.util.List;
31  import java.util.Map;
32  import java.util.Set;
33  import java.util.SortedSet;
34  import java.util.TreeMap;
35  import java.util.TreeSet;
36
37  import com.google.common.annotations.VisibleForTesting;
38  import com.google.common.base.Preconditions;
39
40  import org.apache.commons.logging.Log;
41  import org.apache.commons.logging.LogFactory;
42  import org.apache.hadoop.hbase.classification.InterfaceAudience;
43  import org.apache.hadoop.conf.Configuration;
44  import org.apache.hadoop.hbase.HConstants;
45  import org.apache.hadoop.hbase.HRegionInfo;
46  import org.apache.hadoop.hbase.HTableDescriptor;
47  import org.apache.hadoop.hbase.MetaTableAccessor;
48  import org.apache.hadoop.hbase.ServerLoad;
49  import org.apache.hadoop.hbase.ServerName;
50  import org.apache.hadoop.hbase.TableName;
51  import org.apache.hadoop.hbase.client.RegionReplicaUtil;
52  import org.apache.hadoop.hbase.master.RegionState.State;
53  import org.apache.hadoop.hbase.client.TableState;
54  import org.apache.hadoop.hbase.util.Bytes;
55  import org.apache.hadoop.hbase.util.FSUtils;
56  import org.apache.hadoop.hbase.util.Pair;
57
58  /**
59   * Region state accountant. It holds the states of all regions in memory.
60   * In a normal scenario, it should match the meta table and the true region states.
61   *
62   * This map is used by AssignmentManager to track region states.
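   * <p>The principal maps kept here are regionStates (all regions keyed by encoded name),
   * regionsInTransition, regionAssignments (region to hosting server), and serverHoldings
   * (server to the regions it holds).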
63   */
64  @InterfaceAudience.Private
65  public class RegionStates {
66    private static final Log LOG = LogFactory.getLog(RegionStates.class);
67
68    public final static RegionStateStampComparator REGION_STATE_COMPARATOR =
69      new RegionStateStampComparator();
70    private static class RegionStateStampComparator implements Comparator<RegionState> {
71      @Override
72      public int compare(RegionState l, RegionState r) {
73        return Long.compare(l.getStamp(), r.getStamp());
74      }
75    }
76
77    /**
78     * Regions currently in transition.
79     */
80    final HashMap<String, RegionState> regionsInTransition =
81      new HashMap<String, RegionState>();
82
83    /**
84     * Region encoded name to state map.
85     * All the regions should be in this map.
86     */
87    private final Map<String, RegionState> regionStates =
88      new HashMap<String, RegionState>();
89
90    /**
91     * Holds mapping of table -> region state
92     */
93    private final Map<TableName, Map<String, RegionState>> regionStatesTableIndex =
94        new HashMap<TableName, Map<String, RegionState>>();
95
96    /**
97     * Server to regions assignment map.
98     * Contains the set of regions currently assigned to a given server.
99     */
100   private final Map<ServerName, Set<HRegionInfo>> serverHoldings =
101     new HashMap<ServerName, Set<HRegionInfo>>();
102
103   /**
104    * Maintains the mapping from the default region to the replica regions.
105    */
106   private final Map<HRegionInfo, Set<HRegionInfo>> defaultReplicaToOtherReplicas =
107     new HashMap<HRegionInfo, Set<HRegionInfo>>();
108
109   /**
110    * Region to server assignment map.
111    * Contains the server a given region is currently assigned to.
112    */
113   private final TreeMap<HRegionInfo, ServerName> regionAssignments =
114     new TreeMap<HRegionInfo, ServerName>();
115
116   /**
117    * Encoded region name to server assignment map for re-assignment
118    * purposes. Contains the server a given region was last known to be
119    * assigned to; while that server has not completed log splitting,
120    * the region is not assignable. If a region is currently assigned,
121    * the server info in this map should be the same as that in regionAssignments.
122    * However, the info in regionAssignments is cleared when the region
123    * is offline, while the info in lastAssignments is cleared when
124    * the region is closed or the server is dead and processed.
125    */
126   private final HashMap<String, ServerName> lastAssignments =
127     new HashMap<String, ServerName>();
128
129   /**
130    * Encoded region name to server assignment map, used to clean up
131    * serverHoldings when a region comes online on a new server. When the
132    * region goes offline from the previous server, we clean up
133    * regionAssignments so that it holds the latest assignment map. But we
134    * don't clean up serverHoldings to match meta. We need this map to
135    * find out the old server whose serverHoldings needs cleanup,
136    * given a moved region.
137    */
138   private final HashMap<String, ServerName> oldAssignments =
139     new HashMap<String, ServerName>();
140
141   /**
142    * Maps a host:port pair string to the latest start code
143    * of a region server which is known to be dead. It is dead
144    * to us, but ServerManager may not know it yet.
145    */
146   private final HashMap<String, Long> deadServers =
147     new HashMap<String, Long>();
148 
149   /**
150    * Maps a dead server to the time when its log splitting is done.
151    * Since log splitting is not ordered, we have to remember
152    * all processed instances. The map is cleaned up based
153    * on a configured time. By default, we assume a dead
154    * server should be done with log splitting in two hours.
155    */
156   private final HashMap<ServerName, Long> processedServers =
157     new HashMap<ServerName, Long>();
158   private long lastProcessedServerCleanTime;
159
160   private final TableStateManager tableStateManager;
161   private final RegionStateStore regionStateStore;
162   private final ServerManager serverManager;
163   private final MasterServices server;
164
165   // The maximum time to keep a log split info in region states map
166   static final String LOG_SPLIT_TIME = "hbase.master.maximum.logsplit.keeptime";
167   static final long DEFAULT_LOG_SPLIT_TIME = 7200000L; // 2 hours
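  // Illustrative note (an assumption about typical tuning, not from the original source):
  // the retention window can be changed in the master configuration, e.g.
  //   conf.setLong("hbase.master.maximum.logsplit.keeptime", 3600000L); // keep one hour
  // Entries in processedServers older than the window are pruned in logSplit(ServerName).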
168
169   RegionStates(final MasterServices master, final TableStateManager tableStateManager,
170       final ServerManager serverManager, final RegionStateStore regionStateStore) {
171     this.tableStateManager = tableStateManager;
172     this.regionStateStore = regionStateStore;
173     this.serverManager = serverManager;
174     this.server = master;
175   }
176
177   /**
178    * @return a copy of the region assignment map
179    */
180   public synchronized Map<HRegionInfo, ServerName> getRegionAssignments() {
181     return new TreeMap<HRegionInfo, ServerName>(regionAssignments);
182   }
183
184   /**
185    * Return the replicas (including default) for the regions grouped by ServerName
186    * @param regions the regions whose replica assignments should be grouped
187    * @return the groupings as a map of server name to the replica regions (including default replicas) it hosts
188    */
189   synchronized Map<ServerName, List<HRegionInfo>> getRegionAssignments(
190     Collection<HRegionInfo> regions) {
191     Map<ServerName, List<HRegionInfo>> map = new HashMap<ServerName, List<HRegionInfo>>();
192     for (HRegionInfo region : regions) {
193       HRegionInfo defaultReplica = RegionReplicaUtil.getRegionInfoForDefaultReplica(region);
194       Set<HRegionInfo> allReplicas = defaultReplicaToOtherReplicas.get(defaultReplica);
195       if (allReplicas != null) {
196         for (HRegionInfo hri : allReplicas) {
197           ServerName server = regionAssignments.get(hri);
198           if (server != null) {
199             List<HRegionInfo> regionsOnServer = map.get(server);
200             if (regionsOnServer == null) {
201               regionsOnServer = new ArrayList<HRegionInfo>(1);
202               map.put(server, regionsOnServer);
203             }
204             regionsOnServer.add(hri);
205           }
206         }
207       }
208     }
209     return map;
210   }
211
212   public synchronized ServerName getRegionServerOfRegion(HRegionInfo hri) {
213     return regionAssignments.get(hri);
214   }
215
216   /**
217    * Get regions in transition and their states
218    */
219   public synchronized Set<RegionState> getRegionsInTransition() {
220     return new HashSet<RegionState>(regionsInTransition.values());
221   }
222
223   public synchronized SortedSet<RegionState> getRegionsInTransitionOrderedByTimestamp() {
224     final TreeSet<RegionState> rit = new TreeSet<RegionState>(REGION_STATE_COMPARATOR);
225     for (RegionState rs: regionsInTransition.values()) {
226       rit.add(rs);
227     }
228     return rit;
229   }
230
231   /**
232    * @return True if the specified region is in transition.
233    */
234   public synchronized boolean isRegionInTransition(final HRegionInfo hri) {
235     return regionsInTransition.containsKey(hri.getEncodedName());
236   }
237
238   /**
239    * @return True if the specified region is in transition.
240    */
241   public synchronized boolean isRegionInTransition(final String encodedName) {
242     return regionsInTransition.containsKey(encodedName);
243   }
244
245   /**
246    * @return True if any region is in transition.
247    */
248   public synchronized boolean isRegionsInTransition() {
249     return !regionsInTransition.isEmpty();
250   }
251
252   /**
253    * @return True if hbase:meta table region is in transition.
254    */
255   public synchronized boolean isMetaRegionInTransition() {
256     for (RegionState state : regionsInTransition.values()) {
257       if (state.getRegion().isMetaRegion()) return true;
258     }
259     return false;
260   }
261
262   /**
263    * @return True if the specified region is assigned, and not in transition.
264    */
265   public synchronized boolean isRegionOnline(final HRegionInfo hri) {
266     return !isRegionInTransition(hri) && regionAssignments.containsKey(hri);
267   }
268
269   /**
270    * @return True if the specified region is offline/closed, but not in transition.
271    * If the region is not in the map, it is offline to us too.
272    */
273   public synchronized boolean isRegionOffline(final HRegionInfo hri) {
274     return getRegionState(hri) == null || (!isRegionInTransition(hri)
275       && isRegionInState(hri, State.OFFLINE, State.CLOSED));
276   }
277
278   /**
279    * @return True if specified region is in one of the specified states.
280    */
281   public boolean isRegionInState(
282       final HRegionInfo hri, final State... states) {
283     return isRegionInState(hri.getEncodedName(), states);
284   }
285
286   /**
287    * @return True if specified region is in one of the specified states.
288    */
289   public boolean isRegionInState(
290       final String encodedName, final State... states) {
291     RegionState regionState = getRegionState(encodedName);
292     return isOneOfStates(regionState, states);
293   }
294
295   /**
296    * Wait for the state map to be updated by assignment manager.
297    */
298   public synchronized void waitForUpdate(
299       final long timeout) throws InterruptedException {
300     this.wait(timeout);
301   }
302
303   /**
304    * Get region transition state
305    */
306   public RegionState getRegionTransitionState(final HRegionInfo hri) {
307     return getRegionTransitionState(hri.getEncodedName());
308   }
309
310   /**
311    * Get region transition state
312    */
313   public synchronized RegionState
314       getRegionTransitionState(final String encodedName) {
315     return regionsInTransition.get(encodedName);
316   }
317
318   /**
319    * Add a list of regions to RegionStates. If a region is split
320    * and offline, its state will be SPLIT. Otherwise, its state will
321    * be OFFLINE. Regions already in RegionStates will be skipped.
322    */
323   public void createRegionStates(
324       final List<HRegionInfo> hris) {
325     for (HRegionInfo hri: hris) {
326       createRegionState(hri);
327     }
328   }
329
330   /**
331    * Add a region to RegionStates. If the region is split
332    * and offline, its state will be SPLIT. Otherwise, its state will
333    * be OFFLINE. If it is already in RegionStates, this call has
334    * no effect, and the original state is returned.
335    */
336   public RegionState createRegionState(final HRegionInfo hri) {
337     return createRegionState(hri, null, null, null);
338   }
339
340   /**
341    * Add a region to RegionStates with the specified state.
342    * If the region is already in RegionStates, this call has
343    * no effect, and the original state is returned.
344    *
345    * @param hri the region info to create a state for
346    * @param newState the state to set the region to
347    * @param serverName the server the region is transitioning on
348    * @param lastHost the last server that hosts the region
349    * @return the current state
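    * <p>A minimal usage sketch (illustrative; the region info and server name are assumed to
    * come from elsewhere): {@code createRegionState(hri, State.OPEN, serverName, serverName)}
    * records the region as open on and assigned to serverName, unless it already has a state.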
350    */
351   public synchronized RegionState createRegionState(final HRegionInfo hri,
352       State newState, ServerName serverName, ServerName lastHost) {
353     if (newState == null || (newState == State.OPEN && serverName == null)) {
354       newState =  State.OFFLINE;
355     }
356     if (hri.isOffline() && hri.isSplit()) {
357       newState = State.SPLIT;
358       serverName = null;
359     }
360     String encodedName = hri.getEncodedName();
361     RegionState regionState = regionStates.get(encodedName);
362     if (regionState != null) {
363       LOG.warn("Tried to create a state for a region already in RegionStates, "
364         + "used existing: " + regionState + ", ignored new: " + newState);
365     } else {
366       regionState = new RegionState(hri, newState, serverName);
367       putRegionState(regionState);
368       if (newState == State.OPEN) {
369         if (!serverName.equals(lastHost)) {
370           LOG.warn("Open region's last host " + lastHost
371             + " should be the same as the current one " + serverName
372             + ", ignored the last and used the current one");
373           lastHost = serverName;
374         }
375         lastAssignments.put(encodedName, lastHost);
376         regionAssignments.put(hri, lastHost);
377       } else if (!isOneOfStates(regionState, State.MERGED, State.SPLIT, State.OFFLINE)) {
378         regionsInTransition.put(encodedName, regionState);
379       }
380       if (lastHost != null && newState != State.SPLIT) {
381         addToServerHoldings(lastHost, hri);
382         if (newState != State.OPEN) {
383           oldAssignments.put(encodedName, lastHost);
384         }
385       }
386     }
387     return regionState;
388   }
389
390   private RegionState putRegionState(RegionState regionState) {
391     HRegionInfo hri = regionState.getRegion();
392     String encodedName = hri.getEncodedName();
393     TableName table = hri.getTable();
394     RegionState oldState = regionStates.put(encodedName, regionState);
395     Map<String, RegionState> map = regionStatesTableIndex.get(table);
396     if (map == null) {
397       map = new HashMap<String, RegionState>();
398       regionStatesTableIndex.put(table, map);
399     }
400     map.put(encodedName, regionState);
401     return oldState;
402   }
403
404   /**
405    * Update a region state. It will be put in transition if not already there.
406    */
407   public RegionState updateRegionState(
408       final HRegionInfo hri, final State state) {
409     RegionState regionState = getRegionState(hri.getEncodedName());
410     return updateRegionState(hri, state,
411       regionState == null ? null : regionState.getServerName());
412   }
413
414   /**
415    * Update a region state. It will be put in transition if not already there.
416    */
417   public RegionState updateRegionState(
418       final HRegionInfo hri, final State state, final ServerName serverName) {
419     return updateRegionState(hri, state, serverName, HConstants.NO_SEQNUM);
420   }
421
422   public void regionOnline(final HRegionInfo hri, final ServerName serverName) {
423     regionOnline(hri, serverName, HConstants.NO_SEQNUM);
424   }
425 
426   /**
427    * A region is online, won't be in transition any more.
428    * We can't confirm it is really online on the specified region server
429    * because it hasn't been put in the region server's online region list yet.
430    */
431   public void regionOnline(final HRegionInfo hri, final ServerName serverName, long openSeqNum) {
432     String encodedName = hri.getEncodedName();
433     if (!serverManager.isServerOnline(serverName)) {
434       // This is possible if the region server dies before master gets a
435       // chance to handle ZK event in time. At this time, if the dead server
436       // is already processed by SSH, we should ignore this event.
437       // If not processed yet, ignore and let SSH deal with it.
438       LOG.warn("Ignored, " + encodedName + " was opened on a dead server: " + serverName);
439       return;
440     }
441     updateRegionState(hri, State.OPEN, serverName, openSeqNum);
442
443     synchronized (this) {
444       regionsInTransition.remove(encodedName);
445       ServerName oldServerName = regionAssignments.put(hri, serverName);
446       if (!serverName.equals(oldServerName)) {
447         if (LOG.isDebugEnabled()) {
448           LOG.debug("Onlined " + hri.getShortNameToLog() + " on " + serverName);
449         }
450         addToServerHoldings(serverName, hri);
451         addToReplicaMapping(hri);
452         if (oldServerName == null) {
453           oldServerName = oldAssignments.remove(encodedName);
454         }
455         if (oldServerName != null
456             && !oldServerName.equals(serverName)
457             && serverHoldings.containsKey(oldServerName)) {
458           LOG.info("Offlined " + hri.getShortNameToLog() + " from " + oldServerName);
459           removeFromServerHoldings(oldServerName, hri);
460         }
461       }
462     }
463   }
464
465   private void addToServerHoldings(ServerName serverName, HRegionInfo hri) {
466     Set<HRegionInfo> regions = serverHoldings.get(serverName);
467     if (regions == null) {
468       regions = new HashSet<HRegionInfo>();
469       serverHoldings.put(serverName, regions);
470     }
471     regions.add(hri);
472   }
473
474   private void addToReplicaMapping(HRegionInfo hri) {
475     HRegionInfo defaultReplica = RegionReplicaUtil.getRegionInfoForDefaultReplica(hri);
476     Set<HRegionInfo> replicas =
477         defaultReplicaToOtherReplicas.get(defaultReplica);
478     if (replicas == null) {
479       replicas = new HashSet<HRegionInfo>();
480       defaultReplicaToOtherReplicas.put(defaultReplica, replicas);
481     }
482     replicas.add(hri);
483   }
484
485   private void removeFromServerHoldings(ServerName serverName, HRegionInfo hri) {
486     Set<HRegionInfo> oldRegions = serverHoldings.get(serverName);
487     oldRegions.remove(hri);
488     if (oldRegions.isEmpty()) {
489       serverHoldings.remove(serverName);
490     }
491   }
492
493   private void removeFromReplicaMapping(HRegionInfo hri) {
494     HRegionInfo defaultReplica = RegionReplicaUtil.getRegionInfoForDefaultReplica(hri);
495     Set<HRegionInfo> replicas = defaultReplicaToOtherReplicas.get(defaultReplica);
496     if (replicas != null) {
497       replicas.remove(hri);
498       if (replicas.isEmpty()) {
499         defaultReplicaToOtherReplicas.remove(defaultReplica);
500       }
501     }
502   }
503
504   /**
505    * A dead server's WALs have been split so that all the regions
506    * that used to be open on it can be safely assigned now. Mark them assignable.
507    */
508   public synchronized void logSplit(final ServerName serverName) {
509     for (Iterator<Map.Entry<String, ServerName>> it
510         = lastAssignments.entrySet().iterator(); it.hasNext();) {
511       Map.Entry<String, ServerName> e = it.next();
512       if (e.getValue().equals(serverName)) {
513         it.remove();
514       }
515     }
516     long now = System.currentTimeMillis();
517     if (LOG.isDebugEnabled()) {
518       LOG.debug("Adding to log splitting servers " + serverName);
519     }
520     processedServers.put(serverName, Long.valueOf(now));
521     Configuration conf = server.getConfiguration();
522     long obsoleteTime = conf.getLong(LOG_SPLIT_TIME, DEFAULT_LOG_SPLIT_TIME);
523     // Doesn't have to be very accurate about the clean up time
524     if (now > lastProcessedServerCleanTime + obsoleteTime) {
525       lastProcessedServerCleanTime = now;
526       long cutoff = now - obsoleteTime;
527       for (Iterator<Map.Entry<ServerName, Long>> it
528           = processedServers.entrySet().iterator(); it.hasNext();) {
529         Map.Entry<ServerName, Long> e = it.next();
530         if (e.getValue().longValue() < cutoff) {
531           if (LOG.isDebugEnabled()) {
532             LOG.debug("Removed from log splitting servers " + e.getKey());
533           }
534           it.remove();
535         }
536       }
537     }
538   }
539
540   /**
541    * Log split is done for a given region, so it is assignable now.
542    */
543   public void logSplit(final HRegionInfo region) {
544     clearLastAssignment(region);
545   }
546
547   public synchronized void clearLastAssignment(final HRegionInfo region) {
548     lastAssignments.remove(region.getEncodedName());
549   }
550 
551   /**
552    * A region is offline, won't be in transition any more.
553    */
554   public void regionOffline(final HRegionInfo hri) {
555     regionOffline(hri, null);
556   }
557
558   /**
559    * A region is offline, won't be in transition any more. Its state
560    * should be the specified expected state, which can only be
561    * Split/Merged/Offline/null(=Offline)/SplittingNew/MergingNew.
562    */
563   public void regionOffline(
564       final HRegionInfo hri, final State expectedState) {
565     Preconditions.checkArgument(expectedState == null
566       || RegionState.isUnassignable(expectedState),
567         "Offlined region should not be " + expectedState);
568     if (isRegionInState(hri, State.SPLITTING_NEW, State.MERGING_NEW)) {
569       // Remove it from all region maps
570       deleteRegion(hri);
571       return;
572     }
573     State newState =
574       expectedState == null ? State.OFFLINE : expectedState;
575     updateRegionState(hri, newState);
576     String encodedName = hri.getEncodedName();
577     synchronized (this) {
578       regionsInTransition.remove(encodedName);
579       ServerName oldServerName = regionAssignments.remove(hri);
580       if (oldServerName != null && serverHoldings.containsKey(oldServerName)) {
581         if (newState == State.MERGED || newState == State.SPLIT
582             || hri.isMetaRegion() || tableStateManager.isTableState(hri.getTable(),
583               TableState.State.DISABLED, TableState.State.DISABLING)) {
584           // Offline the region only if it's merged/split, or the table is disabled/disabling.
585           // Otherwise, offline it from this server only when it is online on a different server.
586           LOG.info("Offlined " + hri.getShortNameToLog() + " from " + oldServerName);
587           removeFromServerHoldings(oldServerName, hri);
588           removeFromReplicaMapping(hri);
589         } else {
590           // Need to remember it so that we can offline it from this
591           // server when it is online on a different server.
592           oldAssignments.put(encodedName, oldServerName);
593         }
594       }
595     }
596   }
597
598   /**
599    * A server is offline, all regions on it are dead.
600    */
601   public List<HRegionInfo> serverOffline(final ServerName sn) {
602     // Offline all regions on this server not already in transition.
603     List<HRegionInfo> rits = new ArrayList<HRegionInfo>();
604     Set<HRegionInfo> regionsToCleanIfNoMetaEntry = new HashSet<HRegionInfo>();
605     // Offline regions outside the loop and synchronized block to avoid
606     // ConcurrentModificationException and deadlock in case meta is
607     // unassigned but RegionStates is locked.
608     Set<HRegionInfo> regionsToOffline = new HashSet<HRegionInfo>();
609     synchronized (this) {
610       Set<HRegionInfo> assignedRegions = serverHoldings.get(sn);
611       if (assignedRegions == null) {
612         assignedRegions = new HashSet<HRegionInfo>();
613       }
614
615       for (HRegionInfo region : assignedRegions) {
616         // Offline open regions, no need to offline if SPLIT/MERGED/OFFLINE
617         if (isRegionOnline(region)) {
618           regionsToOffline.add(region);
619         } else if (isRegionInState(region, State.SPLITTING, State.MERGING)) {
620           LOG.debug("Offline splitting/merging region " + getRegionState(region));
621           regionsToOffline.add(region);
622         }
623       }
624
625       for (RegionState state : regionsInTransition.values()) {
626         HRegionInfo hri = state.getRegion();
627         if (assignedRegions.contains(hri)) {
628           // Region is open on this region server, but in transition.
629           // This region must be moving away from this server, or splitting/merging.
630           // SSH will handle it, either skip assigning, or re-assign.
631           LOG.info("Transitioning " + state + " will be handled by ServerCrashProcedure for " + sn);
632         } else if (sn.equals(state.getServerName())) {
633           // Region is in transition on this region server, and this
634           // region is not open on this server. So the region must be
635           // moving to this server from another one (i.e. opening or
636           // pending open on this server, and was open on another one).
637           // Offline state is also a kind of pending open if the region is in
638           // transition. The region could be in FAILED_CLOSE state too, if we have
639           // tried several times to open it while this region server is not reachable.
640           if (isOneOfStates(state, State.OPENING, State.PENDING_OPEN,
641               State.FAILED_OPEN, State.FAILED_CLOSE, State.OFFLINE)) {
642             LOG.info("Found region in " + state +
643               " to be reassigned by ServerCrashProcedure for " + sn);
644             rits.add(hri);
645           } else if (isOneOfStates(state, State.SPLITTING_NEW)) {
646             regionsToCleanIfNoMetaEntry.add(state.getRegion());
647           } else {
648             LOG.warn("THIS SHOULD NOT HAPPEN: unexpected " + state);
649           }
650         }
651       }
652       this.notifyAll();
653     }
654
655     for (HRegionInfo hri : regionsToOffline) {
656       regionOffline(hri);
657     }
658 
659     cleanIfNoMetaEntry(regionsToCleanIfNoMetaEntry);
660     return rits;
661   }
662 
663   /**
664    * This method does an RPC to hbase:meta. Do not call this method with a lock/synchronize held.
665    * @param hris The regions to check; if a region has no entry in hbase:meta, clean it up.
666    */
667   private void cleanIfNoMetaEntry(Set<HRegionInfo> hris) {
668     if (hris.isEmpty()) return;
669     for (HRegionInfo hri: hris) {
670       try {
671         // This is an RPC to the meta table. No progress will be made if meta
672         // is not available at this time; that is why this method must not be
673         // called while synchronized on RegionStates. A cleanup task. Not critical.
674         if (MetaTableAccessor.getRegion(server.getConnection(), hri.getEncodedNameAsBytes()) ==
675             null) {
676           regionOffline(hri);
677           FSUtils.deleteRegionDir(server.getConfiguration(), hri);
678         }
679       } catch (IOException e) {
680         LOG.warn("Got exception while deleting " + hri + " directories from file system.", e);
681       }
682     }
683   }
684
685   /**
686    * Gets the online regions of the specified table.
687    * This method looks at the in-memory state.  It does not go to <code>hbase:meta</code>.
688    * Only returns <em>online</em> regions.  If a region on this table has been
689    * closed during a disable, etc., it will not be included in the returned list.
690    * So, the returned list may not necessarily be ALL regions in this table; it's
691    * all the ONLINE regions in the table.
692    * @param tableName the table to get the online regions of
693    * @return Online regions from <code>tableName</code>
694    */
695   public synchronized List<HRegionInfo> getRegionsOfTable(TableName tableName) {
696     List<HRegionInfo> tableRegions = new ArrayList<HRegionInfo>();
697     // boundary needs to have table's name but regionID 0 so that it is sorted
698     // before all table's regions.
699     HRegionInfo boundary = new HRegionInfo(tableName, null, null, false, 0L);
700     for (HRegionInfo hri: regionAssignments.tailMap(boundary).keySet()) {
701       if(!hri.getTable().equals(tableName)) break;
702       tableRegions.add(hri);
703     }
704     return tableRegions;
705   }
706
707   /**
708    * Gets current state of all regions of the table.
709    * This method looks at the in-memory state.  It does not go to <code>hbase:meta</code>.
710    * Method guaranteed to return keys for all states
711    * in {@link org.apache.hadoop.hbase.master.RegionState.State}
712    *
713    * @param tableName the table to get the region states of
714    * @return a map from each state to the regions of <code>tableName</code> in that state
715    */
716   public synchronized Map<RegionState.State, List<HRegionInfo>>
717   getRegionByStateOfTable(TableName tableName) {
718     Map<RegionState.State, List<HRegionInfo>> tableRegions =
719         new HashMap<State, List<HRegionInfo>>();
720     for (State state : State.values()) {
721       tableRegions.put(state, new ArrayList<HRegionInfo>());
722     }
723     Map<String, RegionState> indexMap = regionStatesTableIndex.get(tableName);
724     if (indexMap == null)
725       return tableRegions;
726     for (RegionState regionState : indexMap.values()) {
727       tableRegions.get(regionState.getState()).add(regionState.getRegion());
728     }
729     return tableRegions;
730   }
731
732   /**
733    * Wait on region to clear regions-in-transition.
734    * <p>
735    * If the region isn't in transition, returns immediately.  Otherwise, method
736    * blocks until the region is out of transition.
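    * <p>Descriptive note: internally this polls via {@code waitForUpdate(100)}, re-checking
    * roughly every 100 ms, and gives up once the master is stopped.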
737    */
738   public synchronized void waitOnRegionToClearRegionsInTransition(
739       final HRegionInfo hri) throws InterruptedException {
740     if (!isRegionInTransition(hri)) return;
741
742     while(!server.isStopped() && isRegionInTransition(hri)) {
743       RegionState rs = getRegionState(hri);
744       LOG.info("Waiting on " + rs + " to clear regions-in-transition");
745       waitForUpdate(100);
746     }
747
748     if (server.isStopped()) {
749       LOG.info("Giving up wait on region in " +
750         "transition because stoppable.isStopped is set");
751     }
752   }
753
754   /**
755    * A table is deleted. Remove its regions from all internal maps.
756    * We loop through all regions, assuming tables are not deleted very often.
757    */
758   public void tableDeleted(final TableName tableName) {
759     Set<HRegionInfo> regionsToDelete = new HashSet<HRegionInfo>();
760     synchronized (this) {
761       for (RegionState state: regionStates.values()) {
762         HRegionInfo region = state.getRegion();
763         if (region.getTable().equals(tableName)) {
764           regionsToDelete.add(region);
765         }
766       }
767     }
768     for (HRegionInfo region: regionsToDelete) {
769       deleteRegion(region);
770     }
771   }
772
773   /**
774    * Get a copy of all regions assigned to a server
775    */
776   public synchronized Set<HRegionInfo> getServerRegions(ServerName serverName) {
777     Set<HRegionInfo> regions = serverHoldings.get(serverName);
778     if (regions == null) return null;
779     return new HashSet<HRegionInfo>(regions);
780   }
781
782   /**
783    * Remove a region from all state maps.
784    */
785   @VisibleForTesting
786   public synchronized void deleteRegion(final HRegionInfo hri) {
787     String encodedName = hri.getEncodedName();
788     regionsInTransition.remove(encodedName);
789     regionStates.remove(encodedName);
790     TableName table = hri.getTable();
791     Map<String, RegionState> indexMap = regionStatesTableIndex.get(table);
792     indexMap.remove(encodedName);
793     if (indexMap.size() == 0)
794       regionStatesTableIndex.remove(table);
795     lastAssignments.remove(encodedName);
796     ServerName sn = regionAssignments.remove(hri);
797     if (sn != null) {
798       Set<HRegionInfo> regions = serverHoldings.get(sn);
799       regions.remove(hri);
800     }
801   }
802
803   /**
804    * Checks if a region was assigned to a server which is not online now.
805    * If so, we should hold off re-assigning this region till SSH has split its WALs.
806    * Once logs are split, the last assignment of this region will be reset,
807    * which means a null last-assignment server is ok for re-assigning.
808    *
809    * A region server could be dead but we don't know it yet. We may falsely
810    * think it's online. Therefore, even if a server appears online, we still
811    * need to confirm it is reachable and has the expected start code.
812    */
813   synchronized boolean wasRegionOnDeadServer(final String encodedName) {
814     ServerName server = lastAssignments.get(encodedName);
815     return isServerDeadAndNotProcessed(server);
816   }
817
818   synchronized boolean isServerDeadAndNotProcessed(ServerName server) {
819     if (server == null) return false;
820     if (serverManager.isServerOnline(server)) {
821       String hostAndPort = server.getHostAndPort();
822       long startCode = server.getStartcode();
823       Long deadCode = deadServers.get(hostAndPort);
824       if (deadCode == null || startCode > deadCode.longValue()) {
825         if (serverManager.isServerReachable(server)) {
826           return false;
827         }
828         // The size of deadServers won't grow unbounded.
829         deadServers.put(hostAndPort, Long.valueOf(startCode));
830       }
831       // Watch out! If the server is not dead, the region could
832       // remain unassigned. That's why ServerManager#isServerReachable
833       // should use some retry.
834       //
835       // We cache this info since it is very unlikely for that
836       // instance to come back up later on. We don't want to expire
837       // the server since we prefer to let it die naturally.
838       LOG.warn("Couldn't reach online server " + server);
839     }
840     // Now, we know it's dead. Check if it's processed
841     return !processedServers.containsKey(server);
842   }
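  // Descriptive summary of the check above: a server counts as dead-and-not-processed when it
  // is either not online according to ServerManager, or online but unreachable (its start code
  // is then cached in deadServers), and its log splitting has not yet been recorded in
  // processedServers via logSplit(ServerName).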
843
844   /**
845    * Get the last region server a region was on, for the purpose of re-assignment;
846    * i.e. should the re-assignment be held back till log split is done?
847    */
848   synchronized ServerName getLastRegionServerOfRegion(final String encodedName) {
849     return lastAssignments.get(encodedName);
850   }
851
852   synchronized void setLastRegionServerOfRegions(
853       final ServerName serverName, final List<HRegionInfo> regionInfos) {
854     for (HRegionInfo hri: regionInfos) {
855       setLastRegionServerOfRegion(serverName, hri.getEncodedName());
856     }
857   }
858
859   synchronized void setLastRegionServerOfRegion(
860       final ServerName serverName, final String encodedName) {
861     lastAssignments.put(encodedName, serverName);
862   }
863
864   synchronized boolean isRegionOnServer(
865       final HRegionInfo hri, final ServerName serverName) {
866     Set<HRegionInfo> regions = serverHoldings.get(serverName);
867     return regions == null ? false : regions.contains(hri);
868   }
869
870   void splitRegion(HRegionInfo p,
871       HRegionInfo a, HRegionInfo b, ServerName sn) throws IOException {
872
873     regionStateStore.splitRegion(p, a, b, sn, getRegionReplication(p));
874     synchronized (this) {
875       // After PONR, split is considered to be done.
876       // Update server holdings to be aligned with the meta.
877       Set<HRegionInfo> regions = serverHoldings.get(sn);
878       if (regions == null) {
879         throw new IllegalStateException(sn + " should host some regions");
880       }
881       regions.remove(p);
882       regions.add(a);
883       regions.add(b);
884     }
885   }
886
887   void mergeRegions(HRegionInfo p,
888       HRegionInfo a, HRegionInfo b, ServerName sn) throws IOException {
889     regionStateStore.mergeRegions(p, a, b, sn, getRegionReplication(a));
890     synchronized (this) {
891       // After PONR, merge is considered to be done.
892       // Update server holdings to be aligned with the meta.
893       Set<HRegionInfo> regions = serverHoldings.get(sn);
894       if (regions == null) {
895         throw new IllegalStateException(sn + " should host some regions");
896       }
897       regions.remove(a);
898       regions.remove(b);
899       regions.add(p);
900     }
901   }
902
903   private int getRegionReplication(HRegionInfo r) throws IOException {
904     if (tableStateManager != null) {
905       HTableDescriptor htd = server.getTableDescriptors().get(r.getTable());
906       if (htd != null) {
907         return htd.getRegionReplication();
908       }
909     }
910     return 1;
911   }
912
913   /**
914    * At cluster clean re/start, mark all user regions closed except those of tables
915    * that are excluded, such as disabled/disabling/enabling tables. The closed regions
916    * and their previous locations are returned.
917    */
918   synchronized Map<HRegionInfo, ServerName> closeAllUserRegions(Set<TableName> excludedTables) {
919     boolean noExcludeTables = excludedTables == null || excludedTables.isEmpty();
920     Set<HRegionInfo> toBeClosed = new HashSet<HRegionInfo>(regionStates.size());
921     for(RegionState state: regionStates.values()) {
922       HRegionInfo hri = state.getRegion();
923       if (state.isSplit() || hri.isSplit()) {
924         continue;
925       }
926       TableName tableName = hri.getTable();
927       if (!TableName.META_TABLE_NAME.equals(tableName)
928           && (noExcludeTables || !excludedTables.contains(tableName))) {
929         toBeClosed.add(hri);
930       }
931     }
932     Map<HRegionInfo, ServerName> allUserRegions =
933       new HashMap<HRegionInfo, ServerName>(toBeClosed.size());
934     for (HRegionInfo hri: toBeClosed) {
935       RegionState regionState = updateRegionState(hri, State.CLOSED);
936       allUserRegions.put(hri, regionState.getServerName());
937     }
938     return allUserRegions;
939   }
940
941   /**
942    * Compute the average load across all region servers.
943    * Currently, this uses a very naive computation - just uses the number of
944    * regions being served, ignoring stats about number of requests.
945    * @return the average load
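    * <p>Worked example (illustrative): three online region servers hosting 10, 20 and 30
    * regions give a total load of 60 across 3 servers, i.e. an average of 20.0. When more
    * than one server is online, regions held by the master itself are excluded first.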
946    */
947   protected synchronized double getAverageLoad() {
948     int numServers = 0, totalLoad = 0;
949     for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
950       Set<HRegionInfo> regions = e.getValue();
951       ServerName serverName = e.getKey();
952       int regionCount = regions.size();
953       if (serverManager.isServerOnline(serverName)) {
954         totalLoad += regionCount;
955         numServers++;
956       }
957     }
958     if (numServers > 1) {
959       // The master region server holds only a couple regions.
960       // Don't consider this server in calculating the average load
961       // if there are other region servers to avoid possible confusion.
962       Set<HRegionInfo> hris = serverHoldings.get(server.getServerName());
963       if (hris != null) {
964         totalLoad -= hris.size();
965         numServers--;
966       }
967     }
968     return numServers == 0 ? 0.0 :
969       (double)totalLoad / (double)numServers;
970   }
971
972   /**
973    * This is an EXPENSIVE clone.  Cloning though is the safest thing to do.
974    * Can't let the original out since it can change, and at least the load balancer
975    * wants to iterate this exported list.  We synchronize on this RegionStates
976    * instance while building the copy.
977    *
978    * @return A clone of current assignments by table.
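    * <p>When HConstants.HBASE_MASTER_LOADBALANCE_BYTABLE is false (the default), all
    * assignments are grouped under the single ENSEMBLE_TABLE_NAME pseudo table; when it is
    * true, one server-to-regions map is built per user table and hbase:meta regions are skipped.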
979    */
980   protected Map<TableName, Map<ServerName, List<HRegionInfo>>>
981       getAssignmentsByTable() {
982     Map<TableName, Map<ServerName, List<HRegionInfo>>> result =
983       new HashMap<TableName, Map<ServerName,List<HRegionInfo>>>();
984     synchronized (this) {
985       if (!server.getConfiguration().getBoolean(
986             HConstants.HBASE_MASTER_LOADBALANCE_BYTABLE, false)) {
987         Map<ServerName, List<HRegionInfo>> svrToRegions =
988           new HashMap<ServerName, List<HRegionInfo>>(serverHoldings.size());
989         for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
990           svrToRegions.put(e.getKey(), new ArrayList<HRegionInfo>(e.getValue()));
991         }
992         result.put(TableName.valueOf(HConstants.ENSEMBLE_TABLE_NAME), svrToRegions);
993       } else {
994         for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
995           for (HRegionInfo hri: e.getValue()) {
996             if (hri.isMetaRegion()) continue;
997             TableName tablename = hri.getTable();
998             Map<ServerName, List<HRegionInfo>> svrToRegions = result.get(tablename);
999             if (svrToRegions == null) {
1000               svrToRegions = new HashMap<ServerName, List<HRegionInfo>>(serverHoldings.size());
1001               result.put(tablename, svrToRegions);
1002             }
1003             List<HRegionInfo> regions = svrToRegions.get(e.getKey());
1004             if (regions == null) {
1005               regions = new ArrayList<HRegionInfo>();
1006               svrToRegions.put(e.getKey(), regions);
1007             }
1008             regions.add(hri);
1009           }
1010         }
1011       }
1012     }
1013
1014     Map<ServerName, ServerLoad>
1015       onlineSvrs = serverManager.getOnlineServers();
1016     // Take care of servers w/o assignments, and remove servers in draining mode
1017     List<ServerName> drainingServers = this.serverManager.getDrainingServersList();
1018     for (Map<ServerName, List<HRegionInfo>> map: result.values()) {
1019       for (ServerName svr: onlineSvrs.keySet()) {
1020         if (!map.containsKey(svr)) {
1021           map.put(svr, new ArrayList<HRegionInfo>());
1022         }
1023       }
1024       map.keySet().removeAll(drainingServers);
1025     }
1026     return result;
1027   }
1028
1029   protected RegionState getRegionState(final HRegionInfo hri) {
1030     return getRegionState(hri.getEncodedName());
1031   }
1032 
1033   /**
1034    * Returns a clone of region assignments per server
1035    * @return a Map of ServerName to a List of HRegionInfo's
1036    */
1037   protected synchronized Map<ServerName, List<HRegionInfo>> getRegionAssignmentsByServer() {
1038     Map<ServerName, List<HRegionInfo>> regionsByServer =
1039         new HashMap<ServerName, List<HRegionInfo>>(serverHoldings.size());
1040     for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
1041       regionsByServer.put(e.getKey(), new ArrayList<HRegionInfo>(e.getValue()));
1042     }
1043     return regionsByServer;
1044   }
1045
1046   public synchronized RegionState getRegionState(final String encodedName) {
1047     return regionStates.get(encodedName);
1048   }
1049 
1050   /**
1051    * Get the HRegionInfo from cache, if not there, from the hbase:meta table.
1052    * Be careful. Does RPC. Do not hold a lock or synchronize when you call this method.
1053    * @param  regionName
1054    * @return HRegionInfo for the region
1055    */
1056   @SuppressWarnings("deprecation")
1057   protected HRegionInfo getRegionInfo(final byte [] regionName) {
1058     String encodedName = HRegionInfo.encodeRegionName(regionName);
1059     RegionState regionState = getRegionState(encodedName);
1060     if (regionState != null) {
1061       return regionState.getRegion();
1062     }
1063
1064     try {
1065       Pair<HRegionInfo, ServerName> p =
1066         MetaTableAccessor.getRegion(server.getConnection(), regionName);
1067       HRegionInfo hri = p == null ? null : p.getFirst();
1068       if (hri != null) {
1069         createRegionState(hri);
1070       }
1071       return hri;
1072     } catch (IOException e) {
1073       server.abort("Aborting because an error occurred while reading "
1074         + Bytes.toStringBinary(regionName) + " from hbase:meta", e);
1075       return null;
1076     }
1077   }
1078
1079   static boolean isOneOfStates(RegionState regionState, State... states) {
1080     State s = regionState != null ? regionState.getState() : null;
1081     for (State state: states) {
1082       if (s == state) return true;
1083     }
1084     return false;
1085   }
1086
1087   /**
1088    * Update a region state. It will be put in transition if not already there.
1089    */
1090   private RegionState updateRegionState(final HRegionInfo hri,
1091       final RegionState.State state, final ServerName serverName, long openSeqNum) {
1092     if (state == RegionState.State.FAILED_CLOSE || state == RegionState.State.FAILED_OPEN) {
1093       LOG.warn("Failed to open/close " + hri.getShortNameToLog()
1094         + " on " + serverName + ", set to " + state);
1095     }
1096
1097     String encodedName = hri.getEncodedName();
1098     RegionState regionState = new RegionState(
1099       hri, state, System.currentTimeMillis(), serverName);
1100     RegionState oldState = getRegionState(encodedName);
1101     if (!regionState.equals(oldState)) {
1102       LOG.info("Transition " + oldState + " to " + regionState);
1103       // Persist region state before updating in-memory info, if needed
1104       regionStateStore.updateRegionState(openSeqNum, regionState, oldState);
1105     }
1106
1107     synchronized (this) {
1108       regionsInTransition.put(encodedName, regionState);
1109       putRegionState(regionState);
1110 
1111       // For these states, region should be properly closed.
1112       // There should be no log splitting issue.
1113       if ((state == State.CLOSED || state == State.MERGED
1114           || state == State.SPLIT) && lastAssignments.containsKey(encodedName)) {
1115         ServerName last = lastAssignments.get(encodedName);
1116         if (last.equals(serverName)) {
1117           lastAssignments.remove(encodedName);
1118         } else {
1119           LOG.warn(encodedName + " moved to " + state + " on "
1120             + serverName + ", expected " + last);
1121         }
1122       }
1123
1124       // Once a region is opened, record its last assignment right away.
1125       if (serverName != null && state == State.OPEN) {
1126         ServerName last = lastAssignments.get(encodedName);
1127         if (!serverName.equals(last)) {
1128           lastAssignments.put(encodedName, serverName);
1129           if (last != null && isServerDeadAndNotProcessed(last)) {
1130             LOG.warn(encodedName + " moved to " + serverName
1131               + ", while its previous host " + last
1132               + " is dead but not processed yet");
1133           }
1134         }
1135       }
1136
1137       // notify the change
1138       this.notifyAll();
1139     }
1140     return regionState;
1141   }
1142 }