View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.master;
19  
20  import java.io.IOException;
21  import java.util.ArrayList;
22  import java.util.Collection;
23  import java.util.Collections;
24  import java.util.HashMap;
25  import java.util.HashSet;
26  import java.util.Iterator;
27  import java.util.List;
28  import java.util.Map;
29  import java.util.Set;
30  import java.util.TreeMap;
31  
32  import com.google.common.annotations.VisibleForTesting;
33  import com.google.common.base.Preconditions;
34  import org.apache.commons.logging.Log;
35  import org.apache.commons.logging.LogFactory;
36  import org.apache.hadoop.hbase.classification.InterfaceAudience;
37  import org.apache.hadoop.conf.Configuration;
38  import org.apache.hadoop.hbase.HConstants;
39  import org.apache.hadoop.hbase.HRegionInfo;
40  import org.apache.hadoop.hbase.MetaTableAccessor;
41  import org.apache.hadoop.hbase.Server;
42  import org.apache.hadoop.hbase.ServerLoad;
43  import org.apache.hadoop.hbase.ServerName;
44  import org.apache.hadoop.hbase.TableName;
45  import org.apache.hadoop.hbase.client.RegionReplicaUtil;
46  import org.apache.hadoop.hbase.master.RegionState.State;
47  import org.apache.hadoop.hbase.client.TableState;
48  import org.apache.hadoop.hbase.util.Bytes;
49  import org.apache.hadoop.hbase.util.Pair;
50  
51  /**
52   * Region state accountant. It holds the states of all regions in memory.
53   * Under normal conditions, it should match the meta table and the true region states.
54   *
55   * This map is used by AssignmentManager to track region states.
56   */
57  @InterfaceAudience.Private
58  public class RegionStates {
59    private static final Log LOG = LogFactory.getLog(RegionStates.class);
60  
61    /**
62     * Regions currently in transition.
63     */
64    final HashMap<String, RegionState> regionsInTransition =
65      new HashMap<String, RegionState>();
66  
67    /**
68     * Region encoded name to state map.
69     * All the regions should be in this map.
70     */
71    private final Map<String, RegionState> regionStates =
72      new HashMap<String, RegionState>();
73  
74    /**
75     * Server to regions assignment map.
76     * Contains the set of regions currently assigned to a given server.
77     */
78    private final Map<ServerName, Set<HRegionInfo>> serverHoldings =
79      new HashMap<ServerName, Set<HRegionInfo>>();
80  
81    /**
82     * Maintains the mapping from the default region to the replica regions.
83     */
84    private final Map<HRegionInfo, Set<HRegionInfo>> defaultReplicaToOtherReplicas =
85      new HashMap<HRegionInfo, Set<HRegionInfo>>();
86  
87    /**
88     * Region to server assignment map.
89     * Contains the server a given region is currently assigned to.
90     */
91    private final TreeMap<HRegionInfo, ServerName> regionAssignments =
92      new TreeMap<HRegionInfo, ServerName>();
93  
94    /**
95     * Encoded region name to server assignment map for re-assignment
96     * purpose. Contains the server a given region is last known assigned
97     * to, which has not completed log splitting, so not assignable.
98     * If a region is currently assigned, this server info in this
99     * map should be the same as that in regionAssignments.
100    * However the info in regionAssignments is cleared when the region
101    * is offline while the info in lastAssignments is cleared when
102    * the region is closed or the server is dead and processed.
103    */
104   private final HashMap<String, ServerName> lastAssignments =
105     new HashMap<String, ServerName>();
106 
107   /**
108    * Encoded region name to server assignment map for the
109    * purpose to clean up serverHoldings when a region is online
110    * on a new server. When the region is offline from the previous
111    * server, we cleaned up regionAssignments so that it has the
112    * latest assignment map. But we didn't clean up serverHoldings
113    * to match the meta. We need this map to find out the old server
114    * whose serverHoldings needs cleanup, given a moved region.
115    */
116   private final HashMap<String, ServerName> oldAssignments =
117     new HashMap<String, ServerName>();
118 
119   /**
120    * Map a host port pair string to the latest start code
121    * of a region server which is known to be dead. It is dead
122    * to us, but server manager may not know it yet.
123    */
124   private final HashMap<String, Long> deadServers =
125     new HashMap<String, Long>();
126 
127   /**
128    * Maps each dead server to the time when its log split was done.
129    * Since log splitting is not ordered, we have to remember
130    * all processed instances. The map is cleaned up based
131    * on a configured time. By default, we assume a dead
132    * server should be done with log splitting in two hours.
133    */
134   private final HashMap<ServerName, Long> processedServers =
135     new HashMap<ServerName, Long>();
136   private long lastProcessedServerCleanTime;
137 
138   private final TableStateManager tableStateManager;
139   private final RegionStateStore regionStateStore;
140   private final ServerManager serverManager;
141   private final Server server;
142 
143   // The maximum time to keep a log split info in region states map
144   static final String LOG_SPLIT_TIME = "hbase.master.maximum.logsplit.keeptime";
145   static final long DEFAULT_LOG_SPLIT_TIME = 7200000L; // 2 hours
146 
147   RegionStates(final Server master, final TableStateManager tableStateManager,
148       final ServerManager serverManager, final RegionStateStore regionStateStore) {
149     this.tableStateManager = tableStateManager;
150     this.regionStateStore = regionStateStore;
151     this.serverManager = serverManager;
152     this.server = master;
153   }
154 
155   /**
156    * @return an unmodifiable the region assignment map
157    */
158   public synchronized Map<HRegionInfo, ServerName> getRegionAssignments() {
159     return Collections.unmodifiableMap(regionAssignments);
160   }
161 
162   /**
163    * Return the replicas (including default) for the regions grouped by ServerName
164    * @param regions
165    * @return a pair containing the groupings as a map
166    */
167   synchronized Map<ServerName, List<HRegionInfo>> getRegionAssignments(
168     Collection<HRegionInfo> regions) {
169     Map<ServerName, List<HRegionInfo>> map = new HashMap<ServerName, List<HRegionInfo>>();
170     for (HRegionInfo region : regions) {
171       HRegionInfo defaultReplica = RegionReplicaUtil.getRegionInfoForDefaultReplica(region);
172       Set<HRegionInfo> allReplicas = defaultReplicaToOtherReplicas.get(defaultReplica);
173       if (allReplicas != null) {
174         for (HRegionInfo hri : allReplicas) {
175           ServerName server = regionAssignments.get(hri);
176           if (server != null) {
177             List<HRegionInfo> regionsOnServer = map.get(server);
178             if (regionsOnServer == null) {
179               regionsOnServer = new ArrayList<HRegionInfo>(1);
180               map.put(server, regionsOnServer);
181             }
182             regionsOnServer.add(hri);
183           }
184         }
185       }
186     }
187     return map;
188   }
189 
190   public synchronized ServerName getRegionServerOfRegion(HRegionInfo hri) {
191     return regionAssignments.get(hri);
192   }
193 
194   /**
195    * Get regions in transition and their states
196    */
197   @SuppressWarnings("unchecked")
198   public synchronized Map<String, RegionState> getRegionsInTransition() {
199     return (Map<String, RegionState>)regionsInTransition.clone();
200   }
201 
202   /**
203    * @return True if specified region in transition.
204    */
205   public synchronized boolean isRegionInTransition(final HRegionInfo hri) {
206     return regionsInTransition.containsKey(hri.getEncodedName());
207   }
208 
209   /**
210    * @return True if specified region in transition.
211    */
212   public synchronized boolean isRegionInTransition(final String encodedName) {
213     return regionsInTransition.containsKey(encodedName);
214   }
215 
216   /**
217    * @return True if any region in transition.
218    */
219   public synchronized boolean isRegionsInTransition() {
220     return !regionsInTransition.isEmpty();
221   }
222 
223   /**
224    * @return True if specified region assigned, and not in transition.
225    */
226   public synchronized boolean isRegionOnline(final HRegionInfo hri) {
227     return !isRegionInTransition(hri) && regionAssignments.containsKey(hri);
228   }
229 
230   /**
231    * @return True if specified region offline/closed, but not in transition.
232    * If the region is not in the map, it is offline to us too.
233    */
234   public synchronized boolean isRegionOffline(final HRegionInfo hri) {
235     return getRegionState(hri) == null || (!isRegionInTransition(hri)
236       && isRegionInState(hri, State.OFFLINE, State.CLOSED));
237   }
238 
239   /**
240    * @return True if specified region is in one of the specified states.
241    */
242   public boolean isRegionInState(
243       final HRegionInfo hri, final State... states) {
244     return isRegionInState(hri.getEncodedName(), states);
245   }
246 
247   /**
248    * @return True if specified region is in one of the specified states.
249    */
250   public boolean isRegionInState(
251       final String encodedName, final State... states) {
252     RegionState regionState = getRegionState(encodedName);
253     return isOneOfStates(regionState, states);
254   }
255 
256   /**
257    * Wait for the state map to be updated by assignment manager.
258    */
259   public synchronized void waitForUpdate(
260       final long timeout) throws InterruptedException {
261     this.wait(timeout);
262   }
263 
264   /**
265    * Get region transition state
266    */
267   public RegionState getRegionTransitionState(final HRegionInfo hri) {
268     return getRegionTransitionState(hri.getEncodedName());
269   }
270 
271   /**
272    * Get region transition state
273    */
274   public synchronized RegionState
275       getRegionTransitionState(final String encodedName) {
276     return regionsInTransition.get(encodedName);
277   }
278 
279   /**
280    * Add a list of regions to RegionStates. If a region is split
281    * and offline, its state will be SPLIT. Otherwise, its state will
282    * be OFFLINE. Region already in RegionStates will be skipped.
283    */
284   public void createRegionStates(
285       final List<HRegionInfo> hris) {
286     for (HRegionInfo hri: hris) {
287       createRegionState(hri);
288     }
289   }
290 
291   /**
292    * Add a region to RegionStates. If the region is split
293    * and offline, its state will be SPLIT. Otherwise, its state will
294    * be OFFLINE. If it is already in RegionStates, this call has
295    * no effect, and the original state is returned.
296    */
297   public RegionState createRegionState(final HRegionInfo hri) {
298     return createRegionState(hri, null, null, null);
299   }
300 
301   /**
302    * Add a region to RegionStates with the specified state.
303    * If the region is already in RegionStates, this call has
304    * no effect, and the original state is returned.
305    *
306    * @param hri the region info to create a state for
307    * @param newState the state to the region in set to
308    * @param serverName the server the region is transitioning on
309    * @param lastHost the last server that hosts the region
310    * @return the current state
311    */
312   public synchronized RegionState createRegionState(final HRegionInfo hri,
313       State newState, ServerName serverName, ServerName lastHost) {
314     if (newState == null || (newState == State.OPEN && serverName == null)) {
315       newState =  State.OFFLINE;
316     }
317     if (hri.isOffline() && hri.isSplit()) {
318       newState = State.SPLIT;
319       serverName = null;
320     }
321     String encodedName = hri.getEncodedName();
322     RegionState regionState = regionStates.get(encodedName);
323     if (regionState != null) {
324       LOG.warn("Tried to create a state for a region already in RegionStates, "
325         + "used existing: " + regionState + ", ignored new: " + newState);
326     } else {
327       regionState = new RegionState(hri, newState, serverName);
328       regionStates.put(encodedName, regionState);
329       if (newState == State.OPEN) {
330         if (!serverName.equals(lastHost)) {
331           LOG.warn("Open region's last host " + lastHost
332             + " should be the same as the current one " + serverName
333             + ", ignored the last and used the current one");
334           lastHost = serverName;
335         }
336         lastAssignments.put(encodedName, lastHost);
337         regionAssignments.put(hri, lastHost);
338       } else if (!isOneOfStates(regionState, State.MERGED, State.SPLIT, State.OFFLINE)) {
339         regionsInTransition.put(encodedName, regionState);
340       }
341       if (lastHost != null && newState != State.SPLIT) {
342         addToServerHoldings(lastHost, hri);
343         if (newState != State.OPEN) {
344           oldAssignments.put(encodedName, lastHost);
345         }
346       }
347     }
348     return regionState;
349   }
350 
351   /**
352    * Update a region state. It will be put in transition if not already there.
353    */
354   public RegionState updateRegionState(
355       final HRegionInfo hri, final State state) {
356     RegionState regionState = getRegionState(hri.getEncodedName());
357     return updateRegionState(hri, state,
358       regionState == null ? null : regionState.getServerName());
359   }
360 
361   /**
362    * Update a region state. It will be put in transition if not already there.
363    */
364   public RegionState updateRegionState(
365       final HRegionInfo hri, final State state, final ServerName serverName) {
366     return updateRegionState(hri, state, serverName, HConstants.NO_SEQNUM);
367   }
368 
369   public void regionOnline(
370       final HRegionInfo hri, final ServerName serverName) {
371     regionOnline(hri, serverName, HConstants.NO_SEQNUM);
372   }
373 
374   /**
375    * A region is online, won't be in transition any more.
376    * We can't confirm it is really online on specified region server
377    * because it hasn't been put in region server's online region list yet.
378    */
379   public void regionOnline(final HRegionInfo hri,
380       final ServerName serverName, long openSeqNum) {
381     String encodedName = hri.getEncodedName();
382     if (!serverManager.isServerOnline(serverName)) {
383       // This is possible if the region server dies before master gets a
384       // chance to handle ZK event in time. At this time, if the dead server
385       // is already processed by SSH, we should ignore this event.
386       // If not processed yet, ignore and let SSH deal with it.
387       LOG.warn("Ignored, " + encodedName
388         + " was opened on a dead server: " + serverName);
389       return;
390     }
391     updateRegionState(hri, State.OPEN, serverName, openSeqNum);
392 
393     synchronized (this) {
394       regionsInTransition.remove(encodedName);
395       ServerName oldServerName = regionAssignments.put(hri, serverName);
396       if (!serverName.equals(oldServerName)) {
397         if (LOG.isDebugEnabled()) {
398           LOG.debug("Onlined " + hri.getShortNameToLog() + " on " + serverName + " " + hri);
399         } else {
400           LOG.debug("Onlined " + hri.getShortNameToLog() + " on " + serverName);
401         }
402         addToServerHoldings(serverName, hri);
403         addToReplicaMapping(hri);
404         if (oldServerName == null) {
405           oldServerName = oldAssignments.remove(encodedName);
406         }
407         if (oldServerName != null && serverHoldings.containsKey(oldServerName)) {
408           LOG.info("Offlined " + hri.getShortNameToLog() + " from " + oldServerName);
409           removeFromServerHoldings(oldServerName, hri);
410         }
411       }
412     }
413   }
414 
415   private void addToServerHoldings(ServerName serverName, HRegionInfo hri) {
416     Set<HRegionInfo> regions = serverHoldings.get(serverName);
417     if (regions == null) {
418       regions = new HashSet<HRegionInfo>();
419       serverHoldings.put(serverName, regions);
420     }
421     regions.add(hri);
422   }
423 
424   private void addToReplicaMapping(HRegionInfo hri) {
425     HRegionInfo defaultReplica = RegionReplicaUtil.getRegionInfoForDefaultReplica(hri);
426     Set<HRegionInfo> replicas =
427         defaultReplicaToOtherReplicas.get(defaultReplica);
428     if (replicas == null) {
429       replicas = new HashSet<HRegionInfo>();
430       defaultReplicaToOtherReplicas.put(defaultReplica, replicas);
431     }
432     replicas.add(hri);
433   }
434 
435   private void removeFromServerHoldings(ServerName serverName, HRegionInfo hri) {
436     Set<HRegionInfo> oldRegions = serverHoldings.get(serverName);
437     oldRegions.remove(hri);
438     if (oldRegions.isEmpty()) {
439       serverHoldings.remove(serverName);
440     }
441   }
442 
443   private void removeFromReplicaMapping(HRegionInfo hri) {
444     HRegionInfo defaultReplica = RegionReplicaUtil.getRegionInfoForDefaultReplica(hri);
445     Set<HRegionInfo> replicas = defaultReplicaToOtherReplicas.get(defaultReplica);
446     if (replicas != null) {
447       replicas.remove(hri);
448       if (replicas.isEmpty()) {
449         defaultReplicaToOtherReplicas.remove(defaultReplica);
450       }
451     }
452   }
453 
454   /**
455    * A dead server's wals have been split so that all the regions
456    * used to be open on it can be safely assigned now. Mark them assignable.
457    */
458   public synchronized void logSplit(final ServerName serverName) {
459     for (Iterator<Map.Entry<String, ServerName>> it
460         = lastAssignments.entrySet().iterator(); it.hasNext();) {
461       Map.Entry<String, ServerName> e = it.next();
462       if (e.getValue().equals(serverName)) {
463         it.remove();
464       }
465     }
466     long now = System.currentTimeMillis();
467     if (LOG.isDebugEnabled()) {
468       LOG.debug("Adding to processed servers " + serverName);
469     }
470     processedServers.put(serverName, Long.valueOf(now));
471     Configuration conf = server.getConfiguration();
472     long obsoleteTime = conf.getLong(LOG_SPLIT_TIME, DEFAULT_LOG_SPLIT_TIME);
473     // Doesn't have to be very accurate about the clean up time
474     if (now > lastProcessedServerCleanTime + obsoleteTime) {
475       lastProcessedServerCleanTime = now;
476       long cutoff = now - obsoleteTime;
477       for (Iterator<Map.Entry<ServerName, Long>> it
478           = processedServers.entrySet().iterator(); it.hasNext();) {
479         Map.Entry<ServerName, Long> e = it.next();
480         if (e.getValue().longValue() < cutoff) {
481           if (LOG.isDebugEnabled()) {
482             LOG.debug("Removed from processed servers " + e.getKey());
483           }
484           it.remove();
485         }
486       }
487     }
488   }
489 
490   /**
491    * Log split is done for a given region, so it is assignable now.
492    */
493   public void logSplit(final HRegionInfo region) {
494     clearLastAssignment(region);
495   }
496 
497   public synchronized void clearLastAssignment(final HRegionInfo region) {
498     lastAssignments.remove(region.getEncodedName());
499   }
500 
501   /**
502    * A region is offline, won't be in transition any more.
503    */
504   public void regionOffline(final HRegionInfo hri) {
505     regionOffline(hri, null);
506   }
507 
508   /**
509    * A region is offline, won't be in transition any more. Its state
510    * should be the specified expected state, which can only be
511    * Split/Merged/Offline/null(=Offline)/SplittingNew/MergingNew.
512    */
513   public void regionOffline(
514       final HRegionInfo hri, final State expectedState) {
515     Preconditions.checkArgument(expectedState == null
516       || RegionState.isUnassignable(expectedState),
517         "Offlined region should not be " + expectedState);
518     if (isRegionInState(hri, State.SPLITTING_NEW, State.MERGING_NEW)) {
519       // Remove it from all region maps
520       deleteRegion(hri);
521       return;
522     }
523     State newState =
524       expectedState == null ? State.OFFLINE : expectedState;
525     updateRegionState(hri, newState);
526     String encodedName = hri.getEncodedName();
527     synchronized (this) {
528       regionsInTransition.remove(encodedName);
529       ServerName oldServerName = regionAssignments.remove(hri);
530       if (oldServerName != null && serverHoldings.containsKey(oldServerName)) {
531         if (newState == State.MERGED || newState == State.SPLIT
532             || hri.isMetaRegion() || tableStateManager.isTableState(hri.getTable(),
533               TableState.State.DISABLED, TableState.State.DISABLING)) {
534           // Offline the region only if it's merged/split, or the table is disabled/disabling.
535           // Otherwise, offline it from this server only when it is online on a different server.
536           LOG.info("Offlined " + hri.getShortNameToLog() + " from " + oldServerName);
537           removeFromServerHoldings(oldServerName, hri);
538           removeFromReplicaMapping(hri);
539         } else {
540           // Need to remember it so that we can offline it from this
541           // server when it is online on a different server.
542           oldAssignments.put(encodedName, oldServerName);
543         }
544       }
545     }
546   }
547 
548   /**
549    * A server is offline, all regions on it are dead.
550    */
551   public synchronized List<HRegionInfo> serverOffline(final ServerName sn) {
552     // Offline all regions on this server not already in transition.
553     List<HRegionInfo> rits = new ArrayList<HRegionInfo>();
554     Set<HRegionInfo> assignedRegions = serverHoldings.get(sn);
555     if (assignedRegions == null) {
556       assignedRegions = new HashSet<HRegionInfo>();
557     }
558 
559     // Offline regions outside the loop to avoid ConcurrentModificationException
560     Set<HRegionInfo> regionsToOffline = new HashSet<HRegionInfo>();
561     for (HRegionInfo region : assignedRegions) {
562       // Offline open regions, no need to offline if SPLIT/MERGED/OFFLINE
563       if (isRegionOnline(region)) {
564         regionsToOffline.add(region);
565       } else if (isRegionInState(region, State.SPLITTING, State.MERGING)) {
566         LOG.debug("Offline splitting/merging region " + getRegionState(region));
567         regionsToOffline.add(region);
568       }
569     }
570 
571     for (HRegionInfo hri : regionsToOffline) {
572       regionOffline(hri);
573     }
574 
575     for (RegionState state : regionsInTransition.values()) {
576       HRegionInfo hri = state.getRegion();
577       if (assignedRegions.contains(hri)) {
578         // Region is open on this region server, but in transition.
579         // This region must be moving away from this server, or splitting/merging.
580         // SSH will handle it, either skip assigning, or re-assign.
581         LOG.info("Transitioning " + state + " will be handled by SSH for " + sn);
582       } else if (sn.equals(state.getServerName())) {
583         // Region is in transition on this region server, and this
584         // region is not open on this server. So the region must be
585         // moving to this server from another one (i.e. opening or
586         // pending open on this server, was open on another one.
587         // Offline state is also kind of pending open if the region is in
588         // transition. The region could be in failed_close state too if we have
589         // tried several times to open it while this region server is not reachable)
590         if (isOneOfStates(state, State.OPENING, State.PENDING_OPEN,
591             State.FAILED_OPEN, State.FAILED_CLOSE, State.OFFLINE)) {
592           LOG.info("Found region in " + state + " to be reassigned by SSH for " + sn);
593           rits.add(hri);
594         } else {
595           LOG.warn("THIS SHOULD NOT HAPPEN: unexpected " + state);
596         }
597       }
598     }
599 
600     this.notifyAll();
601     return rits;
602   }
603 
604   /**
605    * Gets the online regions of the specified table.
606    * This method looks at the in-memory state.  It does not go to <code>hbase:meta</code>.
607    * Only returns <em>online</em> regions.  If a region on this table has been
608    * closed during a disable, etc., it will be included in the returned list.
609    * So, the returned list may not necessarily be ALL regions in this table, its
610    * all the ONLINE regions in the table.
611    * @param tableName
612    * @return Online regions from <code>tableName</code>
613    */
614   public synchronized List<HRegionInfo> getRegionsOfTable(TableName tableName) {
615     List<HRegionInfo> tableRegions = new ArrayList<HRegionInfo>();
616     // boundary needs to have table's name but regionID 0 so that it is sorted
617     // before all table's regions.
618     HRegionInfo boundary = new HRegionInfo(tableName, null, null, false, 0L);
619     for (HRegionInfo hri: regionAssignments.tailMap(boundary).keySet()) {
620       if(!hri.getTable().equals(tableName)) break;
621       tableRegions.add(hri);
622     }
623     return tableRegions;
624   }
625 
626 
627   /**
628    * Wait on region to clear regions-in-transition.
629    * <p>
630    * If the region isn't in transition, returns immediately.  Otherwise, method
631    * blocks until the region is out of transition.
632    */
633   public synchronized void waitOnRegionToClearRegionsInTransition(
634       final HRegionInfo hri) throws InterruptedException {
635     if (!isRegionInTransition(hri)) return;
636 
637     while(!server.isStopped() && isRegionInTransition(hri)) {
638       RegionState rs = getRegionState(hri);
639       LOG.info("Waiting on " + rs + " to clear regions-in-transition");
640       waitForUpdate(100);
641     }
642 
643     if (server.isStopped()) {
644       LOG.info("Giving up wait on region in " +
645         "transition because stoppable.isStopped is set");
646     }
647   }
648 
649   /**
650    * A table is deleted. Remove its regions from all internal maps.
651    * We loop through all regions assuming we don't delete tables too much.
652    */
653   public void tableDeleted(final TableName tableName) {
654     Set<HRegionInfo> regionsToDelete = new HashSet<HRegionInfo>();
655     synchronized (this) {
656       for (RegionState state: regionStates.values()) {
657         HRegionInfo region = state.getRegion();
658         if (region.getTable().equals(tableName)) {
659           regionsToDelete.add(region);
660         }
661       }
662     }
663     for (HRegionInfo region: regionsToDelete) {
664       deleteRegion(region);
665     }
666   }
667 
668   /**
669    * Get a copy of all regions assigned to a server
670    */
671   public synchronized Set<HRegionInfo> getServerRegions(ServerName serverName) {
672     Set<HRegionInfo> regions = serverHoldings.get(serverName);
673     if (regions == null) return null;
674     return new HashSet<HRegionInfo>(regions);
675   }
676 
677   /**
678    * Remove a region from all state maps.
679    */
680   @VisibleForTesting
681   public synchronized void deleteRegion(final HRegionInfo hri) {
682     String encodedName = hri.getEncodedName();
683     regionsInTransition.remove(encodedName);
684     regionStates.remove(encodedName);
685     lastAssignments.remove(encodedName);
686     ServerName sn = regionAssignments.remove(hri);
687     if (sn != null) {
688       Set<HRegionInfo> regions = serverHoldings.get(sn);
689       regions.remove(hri);
690     }
691   }
692 
693   /**
694    * Checking if a region was assigned to a server which is not online now.
695    * If so, we should hold re-assign this region till SSH has split its wals.
696    * Once logs are split, the last assignment of this region will be reset,
697    * which means a null last assignment server is ok for re-assigning.
698    *
699    * A region server could be dead but we don't know it yet. We may
700    * think it's online falsely. Therefore if a server is online, we still
701    * need to confirm it reachable and having the expected start code.
702    */
703   synchronized boolean wasRegionOnDeadServer(final String encodedName) {
704     ServerName server = lastAssignments.get(encodedName);
705     return isServerDeadAndNotProcessed(server);
706   }
707 
708   synchronized boolean isServerDeadAndNotProcessed(ServerName server) {
709     if (server == null) return false;
710     if (serverManager.isServerOnline(server)) {
711       String hostAndPort = server.getHostAndPort();
712       long startCode = server.getStartcode();
713       Long deadCode = deadServers.get(hostAndPort);
714       if (deadCode == null || startCode > deadCode.longValue()) {
715         if (serverManager.isServerReachable(server)) {
716           return false;
717         }
718         // The size of deadServers won't grow unbounded.
719         deadServers.put(hostAndPort, Long.valueOf(startCode));
720       }
721       // Watch out! If the server is not dead, the region could
722       // remain unassigned. That's why ServerManager#isServerReachable
723       // should use some retry.
724       //
725       // We cache this info since it is very unlikely for that
726       // instance to come back up later on. We don't want to expire
727       // the server since we prefer to let it die naturally.
728       LOG.warn("Couldn't reach online server " + server);
729     }
730     // Now, we know it's dead. Check if it's processed
731     return !processedServers.containsKey(server);
732   }
733 
734  /**
735    * Get the last region server a region was on for purpose of re-assignment,
736    * i.e. should the re-assignment be held back till log split is done?
737    */
738   synchronized ServerName getLastRegionServerOfRegion(final String encodedName) {
739     return lastAssignments.get(encodedName);
740   }
741 
742   synchronized void setLastRegionServerOfRegions(
743       final ServerName serverName, final List<HRegionInfo> regionInfos) {
744     for (HRegionInfo hri: regionInfos) {
745       setLastRegionServerOfRegion(serverName, hri.getEncodedName());
746     }
747   }
748 
749   synchronized void setLastRegionServerOfRegion(
750       final ServerName serverName, final String encodedName) {
751     lastAssignments.put(encodedName, serverName);
752   }
753 
754   synchronized boolean isRegionOnServer(
755       final HRegionInfo hri, final ServerName serverName) {
756     Set<HRegionInfo> regions = serverHoldings.get(serverName);
757     return regions == null ? false : regions.contains(hri);
758   }
759 
760   void splitRegion(HRegionInfo p,
761       HRegionInfo a, HRegionInfo b, ServerName sn) throws IOException {
762     regionStateStore.splitRegion(p, a, b, sn);
763     synchronized (this) {
764       // After PONR, split is considered to be done.
765       // Update server holdings to be aligned with the meta.
766       Set<HRegionInfo> regions = serverHoldings.get(sn);
767       if (regions == null) {
768         throw new IllegalStateException(sn + " should host some regions");
769       }
770       regions.remove(p);
771       regions.add(a);
772       regions.add(b);
773     }
774   }
775 
776   void mergeRegions(HRegionInfo p,
777       HRegionInfo a, HRegionInfo b, ServerName sn) throws IOException {
778     regionStateStore.mergeRegions(p, a, b, sn);
779     synchronized (this) {
780       // After PONR, merge is considered to be done.
781       // Update server holdings to be aligned with the meta.
782       Set<HRegionInfo> regions = serverHoldings.get(sn);
783       if (regions == null) {
784         throw new IllegalStateException(sn + " should host some regions");
785       }
786       regions.remove(a);
787       regions.remove(b);
788       regions.add(p);
789     }
790   }
791 
792   /**
793    * At cluster clean re/start, mark all user regions closed except those of tables
794    * that are excluded, such as disabled/disabling/enabling tables. All user regions
795    * and their previous locations are returned.
796    */
797   synchronized Map<HRegionInfo, ServerName> closeAllUserRegions(Set<TableName> excludedTables) {
798     boolean noExcludeTables = excludedTables == null || excludedTables.isEmpty();
799     Set<HRegionInfo> toBeClosed = new HashSet<HRegionInfo>(regionStates.size());
800     for(RegionState state: regionStates.values()) {
801       HRegionInfo hri = state.getRegion();
802       if (state.isSplit() || hri.isSplit()) {
803         continue;
804       }
805       TableName tableName = hri.getTable();
806       if (!TableName.META_TABLE_NAME.equals(tableName)
807           && (noExcludeTables || !excludedTables.contains(tableName))) {
808         toBeClosed.add(hri);
809       }
810     }
811     Map<HRegionInfo, ServerName> allUserRegions =
812       new HashMap<HRegionInfo, ServerName>(toBeClosed.size());
813     for (HRegionInfo hri: toBeClosed) {
814       RegionState regionState = updateRegionState(hri, State.CLOSED);
815       allUserRegions.put(hri, regionState.getServerName());
816     }
817     return allUserRegions;
818   }
819 
820   /**
821    * Compute the average load across all region servers.
822    * Currently, this uses a very naive computation - just uses the number of
823    * regions being served, ignoring stats about number of requests.
824    * @return the average load
825    */
826   protected synchronized double getAverageLoad() {
827     int numServers = 0, totalLoad = 0;
828     for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
829       Set<HRegionInfo> regions = e.getValue();
830       ServerName serverName = e.getKey();
831       int regionCount = regions.size();
832       if (serverManager.isServerOnline(serverName)) {
833         totalLoad += regionCount;
834         numServers++;
835       }
836     }
837     if (numServers > 1) {
838       // The master region server holds only a couple regions.
839       // Don't consider this server in calculating the average load
840       // if there are other region servers to avoid possible confusion.
841       Set<HRegionInfo> hris = serverHoldings.get(server.getServerName());
842       if (hris != null) {
843         totalLoad -= hris.size();
844         numServers--;
845       }
846     }
847     return numServers == 0 ? 0.0 :
848       (double)totalLoad / (double)numServers;
849   }
850 
851   /**
852    * This is an EXPENSIVE clone.  Cloning though is the safest thing to do.
853    * Can't let out original since it can change and at least the load balancer
854    * wants to iterate this exported list.  We need to synchronize on regions
855    * since all access to this.servers is under a lock on this.regions.
856    *
857    * @return A clone of current assignments by table.
858    */
859   protected Map<TableName, Map<ServerName, List<HRegionInfo>>>
860       getAssignmentsByTable() {
861     Map<TableName, Map<ServerName, List<HRegionInfo>>> result =
862       new HashMap<TableName, Map<ServerName,List<HRegionInfo>>>();
863     synchronized (this) {
864       if (!server.getConfiguration().getBoolean("hbase.master.loadbalance.bytable", false)) {
865         Map<ServerName, List<HRegionInfo>> svrToRegions =
866           new HashMap<ServerName, List<HRegionInfo>>(serverHoldings.size());
867         for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
868           svrToRegions.put(e.getKey(), new ArrayList<HRegionInfo>(e.getValue()));
869         }
870         result.put(TableName.valueOf("ensemble"), svrToRegions);
871       } else {
872         for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
873           for (HRegionInfo hri: e.getValue()) {
874             if (hri.isMetaRegion()) continue;
875             TableName tablename = hri.getTable();
876             Map<ServerName, List<HRegionInfo>> svrToRegions = result.get(tablename);
877             if (svrToRegions == null) {
878               svrToRegions = new HashMap<ServerName, List<HRegionInfo>>(serverHoldings.size());
879               result.put(tablename, svrToRegions);
880             }
881             List<HRegionInfo> regions = svrToRegions.get(e.getKey());
882             if (regions == null) {
883               regions = new ArrayList<HRegionInfo>();
884               svrToRegions.put(e.getKey(), regions);
885             }
886             regions.add(hri);
887           }
888         }
889       }
890     }
891 
892     Map<ServerName, ServerLoad>
893       onlineSvrs = serverManager.getOnlineServers();
894     // Take care of servers w/o assignments.
895     for (Map<ServerName, List<HRegionInfo>> map: result.values()) {
896       for (ServerName svr: onlineSvrs.keySet()) {
897         if (!map.containsKey(svr)) {
898           map.put(svr, new ArrayList<HRegionInfo>());
899         }
900       }
901     }
902     return result;
903   }
904 
905   protected RegionState getRegionState(final HRegionInfo hri) {
906     return getRegionState(hri.getEncodedName());
907   }
908 
909   /**
910    * Returns a clone of region assignments per server
911    * @return a Map of ServerName to a List of HRegionInfo's
912    */
913   protected synchronized Map<ServerName, List<HRegionInfo>> getRegionAssignmentsByServer() {
914     Map<ServerName, List<HRegionInfo>> regionsByServer =
915         new HashMap<ServerName, List<HRegionInfo>>(serverHoldings.size());
916     for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
917       regionsByServer.put(e.getKey(), new ArrayList<HRegionInfo>(e.getValue()));
918     }
919     return regionsByServer;
920   }
921 
922   protected synchronized RegionState getRegionState(final String encodedName) {
923     return regionStates.get(encodedName);
924   }
925 
926   /**
927    * Get the HRegionInfo from cache, if not there, from the hbase:meta table
928    * @param  regionName
929    * @return HRegionInfo for the region
930    */
931   @SuppressWarnings("deprecation")
932   protected HRegionInfo getRegionInfo(final byte [] regionName) {
933     String encodedName = HRegionInfo.encodeRegionName(regionName);
934     RegionState regionState = getRegionState(encodedName);
935     if (regionState != null) {
936       return regionState.getRegion();
937     }
938 
939     try {
940       Pair<HRegionInfo, ServerName> p =
941         MetaTableAccessor.getRegion(server.getConnection(), regionName);
942       HRegionInfo hri = p == null ? null : p.getFirst();
943       if (hri != null) {
944         createRegionState(hri);
945       }
946       return hri;
947     } catch (IOException e) {
948       server.abort("Aborting because error occoured while reading "
949         + Bytes.toStringBinary(regionName) + " from hbase:meta", e);
950       return null;
951     }
952   }
953 
954   static boolean isOneOfStates(RegionState regionState, State... states) {
955     State s = regionState != null ? regionState.getState() : null;
956     for (State state: states) {
957       if (s == state) return true;
958     }
959     return false;
960   }
961 
962   /**
963    * Update a region state. It will be put in transition if not already there.
964    */
965   private RegionState updateRegionState(final HRegionInfo hri,
966       final RegionState.State state, final ServerName serverName, long openSeqNum) {
967     if (state == RegionState.State.FAILED_CLOSE || state == RegionState.State.FAILED_OPEN) {
968       LOG.warn("Failed to open/close " + hri.getShortNameToLog()
969         + " on " + serverName + ", set to " + state);
970     }
971 
972     String encodedName = hri.getEncodedName();
973     RegionState regionState = new RegionState(
974       hri, state, System.currentTimeMillis(), serverName);
975     RegionState oldState = getRegionState(encodedName);
976     if (!regionState.equals(oldState)) {
977       LOG.info("Transition " + oldState + " to " + regionState);
978       // Persist region state before updating in-memory info, if needed
979       regionStateStore.updateRegionState(openSeqNum, regionState, oldState);
980     }
981 
982     synchronized (this) {
983       regionsInTransition.put(encodedName, regionState);
984       regionStates.put(encodedName, regionState);
985 
986       // For these states, region should be properly closed.
987       // There should be no log splitting issue.
988       if ((state == State.CLOSED || state == State.MERGED
989           || state == State.SPLIT) && lastAssignments.containsKey(encodedName)) {
990         ServerName last = lastAssignments.get(encodedName);
991         if (last.equals(serverName)) {
992           lastAssignments.remove(encodedName);
993         } else {
994           LOG.warn(encodedName + " moved to " + state + " on "
995             + serverName + ", expected " + last);
996         }
997       }
998 
999       // Once a region is opened, record its last assignment right away.
1000       if (serverName != null && state == State.OPEN) {
1001         ServerName last = lastAssignments.get(encodedName);
1002         if (!serverName.equals(last)) {
1003           lastAssignments.put(encodedName, serverName);
1004           if (last != null && isServerDeadAndNotProcessed(last)) {
1005             LOG.warn(encodedName + " moved to " + serverName
1006               + ", while it's previous host " + last
1007               + " is dead but not processed yet");
1008           }
1009         }
1010       }
1011 
1012       // notify the change
1013       this.notifyAll();
1014     }
1015     return regionState;
1016   }
1017 }