View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.master.balancer;
19  
20  import java.util.ArrayList;
21  import java.util.Arrays;
22  import java.util.Collection;
23  import java.util.Collections;
24  import java.util.Comparator;
25  import java.util.Deque;
26  import java.util.HashMap;
27  import java.util.HashSet;
28  import java.util.Iterator;
29  import java.util.List;
30  import java.util.Map;
31  import java.util.Map.Entry;
32  import java.util.NavigableMap;
33  import java.util.Random;
34  import java.util.Set;
35  import java.util.TreeMap;
36  
37  import org.apache.commons.logging.Log;
38  import org.apache.commons.logging.LogFactory;
39  import org.apache.hadoop.conf.Configuration;
40  import org.apache.hadoop.hbase.ClusterStatus;
41  import org.apache.hadoop.hbase.HBaseIOException;
42  import org.apache.hadoop.hbase.HRegionInfo;
43  import org.apache.hadoop.hbase.RegionLoad;
44  import org.apache.hadoop.hbase.ServerName;
45  import org.apache.hadoop.hbase.TableName;
46  import org.apache.hadoop.hbase.master.AssignmentManager;
47  import org.apache.hadoop.hbase.master.LoadBalancer;
48  import org.apache.hadoop.hbase.master.MasterServices;
49  import org.apache.hadoop.hbase.master.RegionPlan;
50  import org.apache.hadoop.hbase.security.access.AccessControlLists;
51  
52  import com.google.common.base.Joiner;
53  import com.google.common.collect.ArrayListMultimap;
54  import com.google.common.collect.Sets;
55  
56  /**
57   * The base class for load balancers. It provides the the functions used to by
58   * {@link AssignmentManager} to assign regions in the edge cases. It doesn't
59   * provide an implementation of the actual balancing algorithm.
60   *
61   */
62  public abstract class BaseLoadBalancer implements LoadBalancer {
63    private static final int MIN_SERVER_BALANCE = 2;
64    private volatile boolean stopped = false;
65  
66    /**
67     * An efficient array based implementation similar to ClusterState for keeping
68     * the status of the cluster in terms of region assignment and distribution.
69     * To be used by LoadBalancers.
70     */
71    protected static class Cluster {
72      ServerName masterServerName;
73      Set<String> tablesOnMaster;
74      ServerName[] servers;
75      ArrayList<String> tables;
76      HRegionInfo[] regions;
77      Deque<RegionLoad>[] regionLoads;
78      boolean[] backupMasterFlags;
79      int activeMasterIndex = -1;
80      int[][] regionLocations; //regionIndex -> list of serverIndex sorted by locality
81  
82      int[][] regionsPerServer;            //serverIndex -> region list
83      int[]   regionIndexToServerIndex;    //regionIndex -> serverIndex
84      int[]   initialRegionIndexToServerIndex;    //regionIndex -> serverIndex (initial cluster state)
85      int[]   regionIndexToTableIndex;     //regionIndex -> tableIndex
86      int[][] numRegionsPerServerPerTable; //serverIndex -> tableIndex -> # regions
87      int[]   numMaxRegionsPerTable;       //tableIndex -> max number of regions in a single RS
88      int     numUserRegionsOnMaster;      //number of user regions on the active master
89  
90      Integer[] serverIndicesSortedByRegionCount;
91  
92      Map<String, Integer> serversToIndex;
93      Map<String, Integer> tablesToIndex;
94  
95      int numRegions;
96      int numServers;
97      int numTables;
98  
99      int numMovedRegions = 0; //num moved regions from the initial configuration
100     // num of moved regions away from master that should be on the master
101     int numMovedMasterHostedRegions = 0;
102 
103     @SuppressWarnings("unchecked")
104     protected Cluster(ServerName masterServerName,
105         Map<ServerName, List<HRegionInfo>> clusterState,
106         Map<String, Deque<RegionLoad>> loads,
107         RegionLocationFinder regionFinder,
108         Collection<ServerName> backupMasters,
109         Set<String> tablesOnMaster) {
110 
111       this.tablesOnMaster = tablesOnMaster;
112       this.masterServerName = masterServerName;
113       serversToIndex = new HashMap<String, Integer>();
114       tablesToIndex = new HashMap<String, Integer>();
115       //regionsToIndex = new HashMap<HRegionInfo, Integer>();
116 
117       //TODO: We should get the list of tables from master
118       tables = new ArrayList<String>();
119 
120       numRegions = 0;
121 
122       int serverIndex = 0;
123 
124       // Use servername and port as there can be dead servers in this list. We want everything with
125       // a matching hostname and port to have the same index.
126       for (ServerName sn:clusterState.keySet()) {
127         if (serversToIndex.get(sn.getHostAndPort()) == null) {
128           serversToIndex.put(sn.getHostAndPort(), serverIndex++);
129         }
130       }
131 
132       // Count how many regions there are.
133       for (Entry<ServerName, List<HRegionInfo>> entry : clusterState.entrySet()) {
134         numRegions += entry.getValue().size();
135       }
136 
137       numServers = serversToIndex.size();
138       regionsPerServer = new int[serversToIndex.size()][];
139 
140       servers = new ServerName[numServers];
141       regions = new HRegionInfo[numRegions];
142       regionIndexToServerIndex = new int[numRegions];
143       initialRegionIndexToServerIndex = new int[numRegions];
144       regionIndexToTableIndex = new int[numRegions];
145       regionLoads = new Deque[numRegions];
146       regionLocations = new int[numRegions][];
147       serverIndicesSortedByRegionCount = new Integer[numServers];
148       backupMasterFlags = new boolean[numServers];
149 
150       int tableIndex = 0, regionIndex = 0, regionPerServerIndex = 0;
151 
152       for (Entry<ServerName, List<HRegionInfo>> entry : clusterState.entrySet()) {
153         serverIndex = serversToIndex.get(entry.getKey().getHostAndPort());
154 
155         // keep the servername if this is the first server name for this hostname
156         // or this servername has the newest startcode.
157         if (servers[serverIndex] == null ||
158             servers[serverIndex].getStartcode() < entry.getKey().getStartcode()) {
159           servers[serverIndex] = entry.getKey();
160           backupMasterFlags[serverIndex] = backupMasters != null
161             && backupMasters.contains(servers[serverIndex]);
162         }
163 
164         if (regionsPerServer[serverIndex] != null) {
165           // there is another server with the same hostAndPort in ClusterState.
166           // allocate the array for the total size
167           regionsPerServer[serverIndex] = new int[entry.getValue().size() + regionsPerServer[serverIndex].length];
168         } else {
169           regionsPerServer[serverIndex] = new int[entry.getValue().size()];
170         }
171         serverIndicesSortedByRegionCount[serverIndex] = serverIndex;
172 
173         if (servers[serverIndex].equals(masterServerName)) {
174           activeMasterIndex = serverIndex;
175           for (HRegionInfo hri: entry.getValue()) {
176             if (!shouldBeOnMaster(hri)) {
177               numUserRegionsOnMaster++;
178             }
179           }
180         }
181       }
182 
183       for (Entry<ServerName, List<HRegionInfo>> entry : clusterState.entrySet()) {
184         serverIndex = serversToIndex.get(entry.getKey().getHostAndPort());
185         regionPerServerIndex = 0;
186 
187         for (HRegionInfo region : entry.getValue()) {
188           String tableName = region.getTable().getNameAsString();
189           Integer idx = tablesToIndex.get(tableName);
190           if (idx == null) {
191             tables.add(tableName);
192             idx = tableIndex;
193             tablesToIndex.put(tableName, tableIndex++);
194           }
195 
196           regions[regionIndex] = region;
197           regionIndexToServerIndex[regionIndex] = serverIndex;
198           initialRegionIndexToServerIndex[regionIndex] = serverIndex;
199           regionIndexToTableIndex[regionIndex] = idx;
200           regionsPerServer[serverIndex][regionPerServerIndex++] = regionIndex;
201 
202           // region load
203           if (loads != null) {
204             Deque<RegionLoad> rl = loads.get(region.getRegionNameAsString());
205             // That could have failed if the RegionLoad is using the other regionName
206             if (rl == null) {
207               // Try getting the region load using encoded name.
208               rl = loads.get(region.getEncodedName());
209             }
210             regionLoads[regionIndex] = rl;
211           }
212 
213           if (regionFinder != null) {
214             //region location
215             List<ServerName> loc = regionFinder.getTopBlockLocations(region);
216             regionLocations[regionIndex] = new int[loc.size()];
217             for (int i=0; i < loc.size(); i++) {
218               regionLocations[regionIndex][i] =
219                   loc.get(i) == null ? -1 :
220                     (serversToIndex.get(loc.get(i).getHostAndPort()) == null ? -1 : serversToIndex.get(loc.get(i).getHostAndPort()));
221             }
222           }
223 
224           regionIndex++;
225         }
226       }
227 
228       numTables = tables.size();
229       numRegionsPerServerPerTable = new int[numServers][numTables];
230 
231       for (int i = 0; i < numServers; i++) {
232         for (int j = 0; j < numTables; j++) {
233           numRegionsPerServerPerTable[i][j] = 0;
234         }
235       }
236 
237       for (int i=0; i < regionIndexToServerIndex.length; i++) {
238         numRegionsPerServerPerTable[regionIndexToServerIndex[i]][regionIndexToTableIndex[i]]++;
239       }
240 
241       numMaxRegionsPerTable = new int[numTables];
242       for (serverIndex = 0 ; serverIndex < numRegionsPerServerPerTable.length; serverIndex++) {
243         for (tableIndex = 0 ; tableIndex < numRegionsPerServerPerTable[serverIndex].length; tableIndex++) {
244           if (numRegionsPerServerPerTable[serverIndex][tableIndex] > numMaxRegionsPerTable[tableIndex]) {
245             numMaxRegionsPerTable[tableIndex] = numRegionsPerServerPerTable[serverIndex][tableIndex];
246           }
247         }
248       }
249     }
250 
251     public void moveOrSwapRegion(int lServer, int rServer, int lRegion, int rRegion) {
252       if (servers[lServer].equals(masterServerName)) {
253         if (lRegion >= 0 && !shouldBeOnMaster(regions[lRegion])) {
254           numUserRegionsOnMaster--;
255         }
256         if (rRegion >= 0 && !shouldBeOnMaster(regions[rRegion])) {
257           numUserRegionsOnMaster++;
258         }
259       } else if (servers[rServer].equals(masterServerName)) {
260         if (lRegion >= 0 && !shouldBeOnMaster(regions[lRegion])) {
261           numUserRegionsOnMaster++;
262         }
263         if (rRegion >= 0 && !shouldBeOnMaster(regions[rRegion])) {
264           numUserRegionsOnMaster--;
265         }
266       }
267       //swap
268       if (rRegion >= 0 && lRegion >= 0) {
269         regionMoved(rRegion, rServer, lServer);
270         regionsPerServer[rServer] = replaceRegion(regionsPerServer[rServer], rRegion, lRegion);
271         regionMoved(lRegion, lServer, rServer);
272         regionsPerServer[lServer] = replaceRegion(regionsPerServer[lServer], lRegion, rRegion);
273       } else if (rRegion >= 0) { //move rRegion
274         regionMoved(rRegion, rServer, lServer);
275         regionsPerServer[rServer] = removeRegion(regionsPerServer[rServer], rRegion);
276         regionsPerServer[lServer] = addRegion(regionsPerServer[lServer], rRegion);
277       } else if (lRegion >= 0) { //move lRegion
278         regionMoved(lRegion, lServer, rServer);
279         regionsPerServer[lServer] = removeRegion(regionsPerServer[lServer], lRegion);
280         regionsPerServer[rServer] = addRegion(regionsPerServer[rServer], lRegion);
281       }
282     }
283 
284     /** Region moved out of the server */
285     void regionMoved(int regionIndex, int oldServerIndex, int newServerIndex) {
286       regionIndexToServerIndex[regionIndex] = newServerIndex;
287       if (initialRegionIndexToServerIndex[regionIndex] == newServerIndex) {
288         numMovedRegions--; //region moved back to original location
289         if (shouldBeOnMaster(regions[regionIndex]) && isActiveMaster(newServerIndex)) {
290           // Master hosted region moved back to the active master
291           numMovedMasterHostedRegions--;
292         }
293       } else if (initialRegionIndexToServerIndex[regionIndex] == oldServerIndex) {
294         numMovedRegions++; //region moved from original location
295         if (shouldBeOnMaster(regions[regionIndex]) && isActiveMaster(oldServerIndex)) {
296           // Master hosted region moved away from active the master
297           numMovedMasterHostedRegions++;
298         }
299       }
300       int tableIndex = regionIndexToTableIndex[regionIndex];
301       numRegionsPerServerPerTable[oldServerIndex][tableIndex]--;
302       numRegionsPerServerPerTable[newServerIndex][tableIndex]++;
303 
304       //check whether this caused maxRegionsPerTable in the new Server to be updated
305       if (numRegionsPerServerPerTable[newServerIndex][tableIndex] > numMaxRegionsPerTable[tableIndex]) {
306         numRegionsPerServerPerTable[newServerIndex][tableIndex] = numMaxRegionsPerTable[tableIndex];
307       } else if ((numRegionsPerServerPerTable[oldServerIndex][tableIndex] + 1)
308           == numMaxRegionsPerTable[tableIndex]) {
309         //recompute maxRegionsPerTable since the previous value was coming from the old server
310         for (int serverIndex = 0 ; serverIndex < numRegionsPerServerPerTable.length; serverIndex++) {
311           if (numRegionsPerServerPerTable[serverIndex][tableIndex] > numMaxRegionsPerTable[tableIndex]) {
312             numMaxRegionsPerTable[tableIndex] = numRegionsPerServerPerTable[serverIndex][tableIndex];
313           }
314         }
315       }
316     }
317 
318     int[] removeRegion(int[] regions, int regionIndex) {
319       //TODO: this maybe costly. Consider using linked lists
320       int[] newRegions = new int[regions.length - 1];
321       int i = 0;
322       for (i = 0; i < regions.length; i++) {
323         if (regions[i] == regionIndex) {
324           break;
325         }
326         newRegions[i] = regions[i];
327       }
328       System.arraycopy(regions, i+1, newRegions, i, newRegions.length - i);
329       return newRegions;
330     }
331 
332     int[] addRegion(int[] regions, int regionIndex) {
333       int[] newRegions = new int[regions.length + 1];
334       System.arraycopy(regions, 0, newRegions, 0, regions.length);
335       newRegions[newRegions.length - 1] = regionIndex;
336       return newRegions;
337     }
338 
339     int[] replaceRegion(int[] regions, int regionIndex, int newRegionIndex) {
340       int i = 0;
341       for (i = 0; i < regions.length; i++) {
342         if (regions[i] == regionIndex) {
343           regions[i] = newRegionIndex;
344           break;
345         }
346       }
347       return regions;
348     }
349 
350     void sortServersByRegionCount() {
351       Arrays.sort(serverIndicesSortedByRegionCount, numRegionsComparator);
352     }
353 
354     int getNumRegions(int server) {
355       return regionsPerServer[server].length;
356     }
357 
358     boolean isBackupMaster(int server) {
359       return backupMasterFlags[server];
360     }
361 
362     boolean isActiveMaster(int server) {
363       return activeMasterIndex == server;
364     }
365 
366     boolean shouldBeOnMaster(HRegionInfo region) {
367       return tablesOnMaster != null && tablesOnMaster.contains(
368         region.getTable().getNameAsString());
369     }
370 
371     private Comparator<Integer> numRegionsComparator = new Comparator<Integer>() {
372       @Override
373       public int compare(Integer integer, Integer integer2) {
374         return Integer.valueOf(getNumRegions(integer)).compareTo(getNumRegions(integer2));
375       }
376     };
377 
378     @Override
379     public String toString() {
380       String desc = "Cluster{" +
381           "servers=[";
382           for(ServerName sn:servers) {
383              desc += sn.getHostAndPort() + ", ";
384           }
385           desc +=
386           ", serverIndicesSortedByRegionCount="+
387           Arrays.toString(serverIndicesSortedByRegionCount) +
388           ", regionsPerServer=[";
389 
390           for (int[]r:regionsPerServer) {
391             desc += Arrays.toString(r);
392           }
393           desc += "]" +
394           ", numMaxRegionsPerTable=" +
395           Arrays.toString(numMaxRegionsPerTable) +
396           ", numRegions=" +
397           numRegions +
398           ", numServers=" +
399           numServers +
400           ", numTables=" +
401           numTables +
402           ", numMovedRegions=" +
403           numMovedRegions +
404           ", numMovedMasterHostedRegions=" +
405           numMovedMasterHostedRegions +
406           '}';
407       return desc;
408     }
409   }
410 
411   // slop for regions
412   protected float slop;
413   protected Configuration config;
414   private static final Random RANDOM = new Random(System.currentTimeMillis());
415   private static final Log LOG = LogFactory.getLog(BaseLoadBalancer.class);
416 
417   // The weight means that each region on the active/backup master is
418   // equal to that many regions on a normal regionserver, in calculating
419   // the region load by the load balancer. So that the active/backup master
420   // can host less (or equal if weight = 1) regions than normal regionservers.
421   //
422   // The weight can be used to control the number of regions on backup
423   // masters, which shouldn't host as many regions as normal regionservers.
424   // So that we don't need to move around too many regions when a
425   // backup master becomes the active one.
426   //
427   // Currently, the active master weight is used only by StockasticLoadBalancer.
428   // Generally, we don't put any user regions on the active master, which
429   // only hosts regions of tables defined in TABLES_ON_MASTER.
430   // That's why the default activeMasterWeight is high.
431   public static final String BACKUP_MASTER_WEIGHT_KEY =
432     "hbase.balancer.backupMasterWeight";
433   public static final int DEFAULT_BACKUP_MASTER_WEIGHT = 1;
434 
435   private static final String ACTIVE_MASTER_WEIGHT_KEY =
436     "hbase.balancer.activeMasterWeight";
437   private static final int DEFAULT_ACTIVE_MASTER_WEIGHT = 200;
438 
439   // Regions of these tables are put on the master by default.
440   private static final String[] DEFAULT_TABLES_ON_MASTER =
441     new String[] {AccessControlLists.ACL_TABLE_NAME.getNameAsString(),
442       TableName.NAMESPACE_TABLE_NAME.getNameAsString(),
443       TableName.META_TABLE_NAME.getNameAsString()};
444 
445   protected int activeMasterWeight;
446   protected int backupMasterWeight;
447 
448   // a flag to indicate if assigning regions to backup masters
449   protected boolean usingBackupMasters = true;
450   protected final Set<ServerName> excludedServers =
451     Collections.synchronizedSet(new HashSet<ServerName>());
452 
453   protected final Set<String> tablesOnMaster = new HashSet<String>();
454   protected final MetricsBalancer metricsBalancer = new MetricsBalancer();
455   protected ClusterStatus clusterStatus = null;
456   protected ServerName masterServerName;
457   protected MasterServices services;
458 
459   @Override
460   public void setConf(Configuration conf) {
461     setSlop(conf);
462     if (slop < 0) slop = 0;
463     else if (slop > 1) slop = 1;
464 
465     this.config = conf;
466     activeMasterWeight = conf.getInt(
467       ACTIVE_MASTER_WEIGHT_KEY, DEFAULT_ACTIVE_MASTER_WEIGHT);
468     backupMasterWeight = conf.getInt(
469       BACKUP_MASTER_WEIGHT_KEY, DEFAULT_BACKUP_MASTER_WEIGHT);
470     if (backupMasterWeight < 1) {
471       usingBackupMasters = false;
472       LOG.info("Backup master won't host any region since "
473         + BACKUP_MASTER_WEIGHT_KEY + " is " + backupMasterWeight
474         + "(<1)");
475     }
476     String[] tables = conf.getStrings(
477       "hbase.balancer.tablesOnMaster", DEFAULT_TABLES_ON_MASTER);
478     if (tables != null) {
479       for (String table: tables) {
480         tablesOnMaster.add(table);
481       }
482     }
483   }
484 
485   protected void setSlop(Configuration conf) {
486     this.slop = conf.getFloat("hbase.regions.slop", (float) 0.2);
487   }
488 
489   /**
490    * If there is any server excluded, filter it out from the cluster map so
491    * we won't assign any region to it, assuming none's already assigned there.
492    */
493   protected void filterExcludedServers(Map<ServerName, List<HRegionInfo>> clusterMap) {
494     if (excludedServers.isEmpty()) { // No server to filter out
495       return;
496     }
497     Iterator<Map.Entry<ServerName, List<HRegionInfo>>> it = clusterMap.entrySet().iterator();
498     while (it.hasNext()) {
499       Map.Entry<ServerName, List<HRegionInfo>> en = it.next();
500       if (excludedServers.contains(en.getKey()) && en.getValue().isEmpty()) {
501         it.remove();
502       }
503     }
504   }
505 
506   /**
507    * Check if a region belongs to some small system table.
508    * If so, it may be expected to be put on the master regionserver.
509    */
510   protected boolean shouldBeOnMaster(HRegionInfo region) {
511     return tablesOnMaster.contains(region.getTable().getNameAsString());
512   }
513 
514   /**
515    * Balance the regions that should be on master regionserver.
516    */
517   protected List<RegionPlan> balanceMasterRegions(
518       Map<ServerName, List<HRegionInfo>> clusterMap) {
519     if (services == null || clusterMap.size() <= 1) return null;
520     List<RegionPlan> plans = null;
521     List<HRegionInfo> regions = clusterMap.get(masterServerName);
522     if (regions != null) {
523       Iterator<ServerName> keyIt = null;
524       for (HRegionInfo region: regions) {
525         if (shouldBeOnMaster(region)) continue;
526 
527         // Find a non-master regionserver to host the region
528         if (keyIt == null || !keyIt.hasNext()) {
529           keyIt = clusterMap.keySet().iterator();
530         }
531         ServerName dest = keyIt.next();
532         if (masterServerName.equals(dest)) {
533           dest = keyIt.next();
534         }
535 
536         // Move this region away from the master regionserver
537         RegionPlan plan = new RegionPlan(region, masterServerName, dest);
538         if (plans == null) {
539           plans = new ArrayList<RegionPlan>();
540         }
541         plans.add(plan);
542       }
543     }
544     for (Map.Entry<ServerName, List<HRegionInfo>> server: clusterMap.entrySet()) {
545       if (masterServerName.equals(server.getKey())) continue;
546       for (HRegionInfo region: server.getValue()) {
547         if (!shouldBeOnMaster(region)) continue;
548 
549         // Move this region to the master regionserver
550         RegionPlan plan = new RegionPlan(region, server.getKey(), masterServerName);
551         if (plans == null) {
552           plans = new ArrayList<RegionPlan>();
553         }
554         plans.add(plan);
555       }
556     }
557     return plans;
558   }
559 
560   public void excludeServer(ServerName serverName) {
561     if (!usingBackupMasters) excludedServers.add(serverName);
562   }
563 
564   public Set<ServerName> getExcludedServers() {
565     return excludedServers;
566   }
567 
568   @Override
569   public Configuration getConf() {
570     return this.config;
571   }
572 
573   @Override
574   public void setClusterStatus(ClusterStatus st) {
575     this.clusterStatus = st;
576     if (st == null || usingBackupMasters) return;
577 
578     // Not assign any region to backup masters.
579     // Put them on the excluded server list.
580     // Assume there won't be too much backup masters
581     // re/starting, so this won't leak much memory.
582     excludedServers.addAll(st.getBackupMasters());
583   }
584 
585   @Override
586   public void setMasterServices(MasterServices masterServices) {
587     masterServerName = masterServices.getServerName();
588     excludedServers.remove(masterServerName);
589     this.services = masterServices;
590   }
591 
592   protected Collection<ServerName> getBackupMasters() {
593     return clusterStatus == null ? null : clusterStatus.getBackupMasters();
594   }
595 
596   protected boolean needsBalance(ClusterLoadState cs) {
597     if (cs.getNumServers() < MIN_SERVER_BALANCE) {
598       if (LOG.isDebugEnabled()) {
599         LOG.debug("Not running balancer because only " + cs.getNumServers()
600             + " active regionserver(s)");
601       }
602       return false;
603     }
604     // Check if we even need to do any load balancing
605     // HBASE-3681 check sloppiness first
606     float average = cs.getLoadAverage(); // for logging
607     int floor = (int) Math.floor(average * (1 - slop));
608     int ceiling = (int) Math.ceil(average * (1 + slop));
609     if (!(cs.getMaxLoad() > ceiling || cs.getMinLoad() < floor)) {
610       NavigableMap<ServerAndLoad, List<HRegionInfo>> serversByLoad = cs.getServersByLoad();
611       if (LOG.isTraceEnabled()) {
612         // If nothing to balance, then don't say anything unless trace-level logging.
613         LOG.trace("Skipping load balancing because balanced cluster; " +
614           "servers=" + cs.getNumServers() + "(backupMasters=" + cs.getNumBackupMasters() +
615           ") regions=" + cs.getNumRegions() + " average=" + average + " " +
616           "mostloaded=" + serversByLoad.lastKey().getLoad() +
617           " leastloaded=" + serversByLoad.firstKey().getLoad());
618       }
619       return false;
620     }
621     return true;
622   }
623 
624   /**
625    * Generates a bulk assignment plan to be used on cluster startup using a
626    * simple round-robin assignment.
627    * <p>
628    * Takes a list of all the regions and all the servers in the cluster and
629    * returns a map of each server to the regions that it should be assigned.
630    * <p>
631    * Currently implemented as a round-robin assignment. Same invariant as load
632    * balancing, all servers holding floor(avg) or ceiling(avg).
633    *
634    * TODO: Use block locations from HDFS to place regions with their blocks
635    *
636    * @param regions all regions
637    * @param servers all servers
638    * @return map of server to the regions it should take, or null if no
639    *         assignment is possible (ie. no regions or no servers)
640    */
641   @Override
642   public Map<ServerName, List<HRegionInfo>> roundRobinAssignment(List<HRegionInfo> regions,
643       List<ServerName> servers) {
644     metricsBalancer.incrMiscInvocations();
645     if (regions == null || regions.isEmpty()) {
646       return null;
647     }
648 
649     List<ServerName> backupMasters = normalizeServers(servers);
650     int numServers = servers == null ? 0 : servers.size();
651     int numBackupMasters = backupMasters == null ? 0 : backupMasters.size();
652     if (numServers == 0 && numBackupMasters == 0) {
653       LOG.warn("Wanted to do round robin assignment but no servers to assign to");
654       return null;
655     }
656     Map<ServerName, List<HRegionInfo>> assignments = new TreeMap<ServerName, List<HRegionInfo>>();
657     if (numServers + numBackupMasters == 1) { // Only one server, nothing fancy we can do here
658       ServerName server = numServers > 0 ? servers.get(0) : backupMasters.get(0);
659       assignments.put(server, new ArrayList<HRegionInfo>(regions));
660       return assignments;
661     }
662     List<HRegionInfo> masterRegions = null;
663     if (numServers > 0 && servers.contains(masterServerName)) {
664       masterRegions = new ArrayList<HRegionInfo>();
665       if (numServers == 1) {
666         // The only server in servers is the master,
667         // Assign all regions to backup masters
668         numServers = 0;
669       }
670     }
671     int total = regions.size();
672     // Get the number of regions to be assigned
673     // to backup masters based on the weight
674     int numRegions = total * numBackupMasters
675       / (numServers * backupMasterWeight + numBackupMasters);
676     if (numRegions > 0) {
677       // backupMasters can't be null, according to the formula, numBackupMasters != 0
678       roundRobinAssignment(regions, 0,
679         numRegions, backupMasters, masterRegions, assignments);
680     }
681     int remainder = total - numRegions;
682     if (remainder > 0) {
683       // servers can't be null, or contains the master only since numServers != 0
684       roundRobinAssignment(regions, numRegions, remainder,
685         servers, masterRegions, assignments);
686     }
687     if (masterRegions != null && !masterRegions.isEmpty()) {
688       assignments.put(masterServerName, masterRegions);
689     }
690     return assignments;
691   }
692 
693   /**
694    * Generates an immediate assignment plan to be used by a new master for
695    * regions in transition that do not have an already known destination.
696    *
697    * Takes a list of regions that need immediate assignment and a list of all
698    * available servers. Returns a map of regions to the server they should be
699    * assigned to.
700    *
701    * This method will return quickly and does not do any intelligent balancing.
702    * The goal is to make a fast decision not the best decision possible.
703    *
704    * Currently this is random.
705    *
706    * @param regions
707    * @param servers
708    * @return map of regions to the server it should be assigned to
709    */
710   @Override
711   public Map<HRegionInfo, ServerName> immediateAssignment(List<HRegionInfo> regions,
712       List<ServerName> servers) {
713     metricsBalancer.incrMiscInvocations();
714     if (servers == null || servers.isEmpty()) {
715       LOG.warn("Wanted to do random assignment but no servers to assign to");
716       return null;
717     }
718 
719     Map<HRegionInfo, ServerName> assignments = new TreeMap<HRegionInfo, ServerName>();
720     List<ServerName> backupMasters = normalizeServers(servers);
721     for (HRegionInfo region : regions) {
722       assignments.put(region, randomAssignment(region, servers, backupMasters));
723     }
724     return assignments;
725   }
726 
727   /**
728    * Used to assign a single region to a random server.
729    */
730   @Override
731   public ServerName randomAssignment(HRegionInfo regionInfo, List<ServerName> servers) {
732     metricsBalancer.incrMiscInvocations();
733     if (servers == null || servers.isEmpty()) {
734       LOG.warn("Wanted to do random assignment but no servers to assign to");
735       return null;
736     }
737     return randomAssignment(regionInfo, servers,
738       normalizeServers(servers));
739   }
740 
741   /**
742    * Generates a bulk assignment startup plan, attempting to reuse the existing
743    * assignment information from META, but adjusting for the specified list of
744    * available/online servers available for assignment.
745    * <p>
746    * Takes a map of all regions to their existing assignment from META. Also
747    * takes a list of online servers for regions to be assigned to. Attempts to
748    * retain all assignment, so in some instances initial assignment will not be
749    * completely balanced.
750    * <p>
751    * Any leftover regions without an existing server to be assigned to will be
752    * assigned randomly to available servers.
753    *
754    * @param regions regions and existing assignment from meta
755    * @param servers available servers
756    * @return map of servers and regions to be assigned to them
757    */
758   @Override
759   public Map<ServerName, List<HRegionInfo>> retainAssignment(Map<HRegionInfo, ServerName> regions,
760       List<ServerName> servers) {
761     // Update metrics
762     metricsBalancer.incrMiscInvocations();
763     if (regions == null || regions.isEmpty()) {
764       return null;
765     }
766 
767     List<ServerName> backupMasters = normalizeServers(servers);
768     int numServers = servers == null ? 0 : servers.size();
769     int numBackupMasters = backupMasters == null ? 0 : backupMasters.size();
770     if (numServers == 0 && numBackupMasters == 0) {
771       LOG.warn("Wanted to do retain assignment but no servers to assign to");
772       return null;
773     }
774     Map<ServerName, List<HRegionInfo>> assignments = new TreeMap<ServerName, List<HRegionInfo>>();
775     if (numServers + numBackupMasters == 1) { // Only one server, nothing fancy we can do here
776       ServerName server = numServers > 0 ? servers.get(0) : backupMasters.get(0);
777       assignments.put(server, new ArrayList<HRegionInfo>(regions.keySet()));
778       return assignments;
779     }
780 
781     // Group all of the old assignments by their hostname.
782     // We can't group directly by ServerName since the servers all have
783     // new start-codes.
784 
785     // Group the servers by their hostname. It's possible we have multiple
786     // servers on the same host on different ports.
787     ArrayListMultimap<String, ServerName> serversByHostname = ArrayListMultimap.create();
788     for (ServerName server : servers) {
789       assignments.put(server, new ArrayList<HRegionInfo>());
790       if (!server.equals(masterServerName)) {
791         serversByHostname.put(server.getHostname(), server);
792       }
793     }
794     if (numBackupMasters > 0) {
795       for (ServerName server : backupMasters) {
796         assignments.put(server, new ArrayList<HRegionInfo>());
797       }
798     }
799 
800     // Collection of the hostnames that used to have regions
801     // assigned, but for which we no longer have any RS running
802     // after the cluster restart.
803     Set<String> oldHostsNoLongerPresent = Sets.newTreeSet();
804 
805     // Master regionserver is in the server list.
806     boolean masterIncluded = servers.contains(masterServerName);
807 
808     int numRandomAssignments = 0;
809     int numRetainedAssigments = 0;
810     for (Map.Entry<HRegionInfo, ServerName> entry : regions.entrySet()) {
811       HRegionInfo region = entry.getKey();
812       ServerName oldServerName = entry.getValue();
813       List<ServerName> localServers = new ArrayList<ServerName>();
814       if (oldServerName != null) {
815         localServers = serversByHostname.get(oldServerName.getHostname());
816       }
817       if (masterIncluded && shouldBeOnMaster(region)) {
818         assignments.get(masterServerName).add(region);
819         if (localServers.contains(masterServerName)) {
820           numRetainedAssigments++;
821         } else {
822           numRandomAssignments++;
823         }
824       } else if (localServers.isEmpty()) {
825         // No servers on the new cluster match up with this hostname,
826         // assign randomly.
827         ServerName randomServer = randomAssignment(region, servers, backupMasters);
828         assignments.get(randomServer).add(region);
829         numRandomAssignments++;
830         if (oldServerName != null) oldHostsNoLongerPresent.add(oldServerName.getHostname());
831       } else if (localServers.size() == 1) {
832         // the usual case - one new server on same host
833         assignments.get(localServers.get(0)).add(region);
834         numRetainedAssigments++;
835       } else {
836         // multiple new servers in the cluster on this same host
837         ServerName target = null;
838         for (ServerName tmp: localServers) {
839           if (tmp.getPort() == oldServerName.getPort()) {
840             target = tmp;
841             break;
842           }
843         }
844         if (target == null) {
845           int size = localServers.size();
846           target = localServers.get(RANDOM.nextInt(size));
847         }
848         assignments.get(target).add(region);
849         numRetainedAssigments++;
850       }
851     }
852 
853     String randomAssignMsg = "";
854     if (numRandomAssignments > 0) {
855       randomAssignMsg =
856           numRandomAssignments + " regions were assigned "
857               + "to random hosts, since the old hosts for these regions are no "
858               + "longer present in the cluster. These hosts were:\n  "
859               + Joiner.on("\n  ").join(oldHostsNoLongerPresent);
860     }
861 
862     LOG.info("Reassigned " + regions.size() + " regions. " + numRetainedAssigments
863         + " retained the pre-restart assignment. " + randomAssignMsg);
864     return assignments;
865   }
866 
867   @Override
868   public void initialize() throws HBaseIOException{
869   }
870 
871   @Override
872   public void regionOnline(HRegionInfo regionInfo, ServerName sn) {
873   }
874 
875   @Override
876   public void regionOffline(HRegionInfo regionInfo) {
877   }
878 
879   @Override
880   public boolean isStopped() {
881     return stopped;
882   }
883 
884   @Override
885   public void stop(String why) {
886     LOG.info("Load Balancer stop requested: "+why);
887     stopped = true;
888   }
889 
890   /**
891    * Prepare the list of target regionservers so that it doesn't
892    * contain any excluded server, or backup master. Those backup masters
893    * used to be in the original list are returned.
894    */
895   private List<ServerName> normalizeServers(List<ServerName> servers) {
896     if (servers == null) {
897       return null;
898     }
899     if (!excludedServers.isEmpty()) {
900       servers.removeAll(excludedServers);
901     }
902     Collection<ServerName> allBackupMasters = getBackupMasters();
903     List<ServerName> backupMasters = null;
904     if (allBackupMasters != null && !allBackupMasters.isEmpty()) {
905       for (ServerName server: allBackupMasters) {
906         if (!servers.contains(server)) {
907           // Ignore backup masters not included
908           continue;
909         }
910         servers.remove(server);
911         if (backupMasters == null) {
912           backupMasters = new ArrayList<ServerName>();
913         }
914         backupMasters.add(server);
915       }
916     }
917     return backupMasters;
918   }
919 
920   /**
921    * Used to assign a single region to a random server. The input should
922    * have been already normalized: 1) servers doesn't include any exclude sever,
923    * 2) servers doesn't include any backup master, 3) backupMasters contains
924    * only backup masters that are intended to host this region, i.e, it
925    * may not have all the backup masters.
926    */
927   private ServerName randomAssignment(HRegionInfo regionInfo,
928       List<ServerName> servers, List<ServerName> backupMasters) {
929     int numServers = servers == null ? 0 : servers.size();
930     int numBackupMasters = backupMasters == null ? 0 : backupMasters.size();
931     if (numServers == 0 && numBackupMasters == 0) {
932       LOG.warn("Wanted to do random assignment but no servers to assign to");
933       return null;
934     }
935     if (servers != null && shouldBeOnMaster(regionInfo)
936         && servers.contains(masterServerName)) {
937       return masterServerName;
938     }
939     // Generate a random number weighted more towards
940     // regular regionservers instead of backup masters.
941     // This formula is chosen for simplicity.
942     int i = RANDOM.nextInt(
943       numBackupMasters + numServers * backupMasterWeight);
944     if (i < numBackupMasters) {
945       return backupMasters.get(i);
946     }
947     i = (i - numBackupMasters)/backupMasterWeight;
948     ServerName sn = servers.get(i);
949     if (sn.equals(masterServerName)) {
950       // Try to avoid master for a user region
951       if (numServers > 1) {
952         i = (i == 0 ? 1 : i - 1);
953         sn = servers.get(i);
954       } else if (numBackupMasters > 0) {
955         sn = backupMasters.get(0);
956       }
957     }
958     return sn;
959   }
960 
961   /**
962    * Round robin a chunk of a list of regions to a list of servers
963    */
964   private void roundRobinAssignment(List<HRegionInfo> regions, int offset,
965       int numRegions, List<ServerName> servers, List<HRegionInfo> masterRegions,
966       Map<ServerName, List<HRegionInfo>> assignments) {
967     boolean masterIncluded = servers.contains(masterServerName);
968     int numServers = servers.size();
969     int skipServers = numServers;
970     if (masterIncluded) {
971       skipServers--;
972     }
973     int max = (int) Math.ceil((float) numRegions / skipServers);
974     int serverIdx = RANDOM.nextInt(numServers);
975     int regionIdx = 0;
976     for (int j = 0; j < numServers; j++) {
977       ServerName server = servers.get((j + serverIdx) % numServers);
978       if (masterIncluded && server.equals(masterServerName)) {
979         // Don't put non-special region on the master regionserver,
980         // So that it is not overloaded.
981         continue;
982       }
983       List<HRegionInfo> serverRegions = new ArrayList<HRegionInfo>(max);
984       for (int i = regionIdx; i < numRegions; i += skipServers) {
985         HRegionInfo region = regions.get(offset + i % numRegions);
986         if (masterRegions == null || !shouldBeOnMaster(region)) {
987           serverRegions.add(region);
988           continue;
989         }
990         // Master is in the list and this is a special region
991         masterRegions.add(region);
992       }
993       assignments.put(server, serverRegions);
994       regionIdx++;
995     }
996   }
997 }