/**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hbase.favored;

import static org.apache.hadoop.hbase.ServerName.NON_STARTCODE;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Random;
import java.util.Set;

import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell.Type;
import org.apache.hadoop.hbase.CellBuilderFactory;
import org.apache.hadoop.hbase.CellBuilderType;
import org.apache.hadoop.hbase.HBaseIOException;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.MetaTableAccessor;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.master.RackManager;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hbase.thirdparty.com.google.common.collect.Lists;
import org.apache.hbase.thirdparty.com.google.common.collect.Sets;
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.shaded.protobuf.generated.HBaseProtos;
import org.apache.hadoop.hbase.shaded.protobuf.generated.HBaseProtos.FavoredNodes;

/**
 * Helper class for {@link FavoredNodeLoadBalancer} that has all the intelligence for racks,
 * meta scans, etc. Instantiated by the {@link FavoredNodeLoadBalancer} when needed (from
 * within calls like {@link FavoredNodeLoadBalancer#randomAssignment(RegionInfo, List)}).
 * All updates to favored nodes should only be done from {@link FavoredNodesManager} and not
 * through this helper class (except for tests).
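 * <p>
 * A minimal usage sketch (illustrative only; the server list, configuration and region are
 * assumed to be supplied by the caller):
 * <pre>{@code
 * FavoredNodeAssignmentHelper helper = new FavoredNodeAssignmentHelper(servers, conf);
 * helper.initialize(); // always initialize() before using the helper
 * List<ServerName> favoredNodes = helper.generateFavoredNodes(regionInfo);
 * }</pre>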
 */
@InterfaceAudience.Private
public class FavoredNodeAssignmentHelper {
  private static final Logger LOG = LoggerFactory.getLogger(FavoredNodeAssignmentHelper.class);
  private RackManager rackManager;
  private Map<String, List<ServerName>> rackToRegionServerMap;
  private List<String> uniqueRackList;
  // This map serves as a cache for rack to server lookups. The number of
  // region server entries might not match what is in servers.
  private Map<String, String> regionServerToRackMap;
  private Random random;
  private List<ServerName> servers;
  public static final byte [] FAVOREDNODES_QUALIFIER = Bytes.toBytes("fn");
  public final static short FAVORED_NODES_NUM = 3;
  public final static short MAX_ATTEMPTS_FN_GENERATION = 10;

  public FavoredNodeAssignmentHelper(final List<ServerName> servers, Configuration conf) {
    this(servers, new RackManager(conf));
  }

  public FavoredNodeAssignmentHelper(final List<ServerName> servers,
      final RackManager rackManager) {
    this.servers = servers;
    this.rackManager = rackManager;
    this.rackToRegionServerMap = new HashMap<>();
    this.regionServerToRackMap = new HashMap<>();
    this.uniqueRackList = new ArrayList<>();
    this.random = new Random();
  }

  // Always initialize() when FavoredNodeAssignmentHelper is constructed.
  public void initialize() {
    for (ServerName sn : this.servers) {
      String rackName = getRackOfServer(sn);
      List<ServerName> serverList = this.rackToRegionServerMap.get(rackName);
      if (serverList == null) {
        serverList = Lists.newArrayList();
        // Add the current rack to the unique rack list
        this.uniqueRackList.add(rackName);
        this.rackToRegionServerMap.put(rackName, serverList);
      }
      boolean alreadyPresent = false;
      for (ServerName serverName : serverList) {
        if (ServerName.isSameAddress(sn, serverName)) {
          // The server is already present, ignore.
          alreadyPresent = true;
          break;
        }
      }
      if (!alreadyPresent) {
        serverList.add(sn);
      }
      this.regionServerToRackMap.put(sn.getHostname(), rackName);
    }
  }

  /**
   * Update meta table with favored nodes info
   * @param regionToFavoredNodes map of RegionInfo's to their favored nodes
   * @param connection connection to be used
   * @throws IOException if a remote or network exception occurs while writing to meta
   */
  public static void updateMetaWithFavoredNodesInfo(
      Map<RegionInfo, List<ServerName>> regionToFavoredNodes,
      Connection connection) throws IOException {
    List<Put> puts = new ArrayList<>();
    for (Map.Entry<RegionInfo, List<ServerName>> entry : regionToFavoredNodes.entrySet()) {
      Put put = makePutFromRegionInfo(entry.getKey(), entry.getValue());
      if (put != null) {
        puts.add(put);
      }
    }
    MetaTableAccessor.putsToMetaTable(connection, puts);
    LOG.info("Added " + puts.size() + " regions in META");
  }

  /**
   * Update meta table with favored nodes info
   * @param regionToFavoredNodes map of RegionInfo's to their favored nodes
   * @param conf configuration from which the connection to meta is created
   * @throws IOException if a remote or network exception occurs while writing to meta
   */
  public static void updateMetaWithFavoredNodesInfo(
      Map<RegionInfo, List<ServerName>> regionToFavoredNodes,
      Configuration conf) throws IOException {
    List<Put> puts = new ArrayList<>();
    for (Map.Entry<RegionInfo, List<ServerName>> entry : regionToFavoredNodes.entrySet()) {
      Put put = makePutFromRegionInfo(entry.getKey(), entry.getValue());
      if (put != null) {
        puts.add(put);
      }
    }
    // Write the region assignments to the meta table.
    // TODO: The overload above takes a Connection rather than a Configuration, but only a
    // short-circuit Connection. That is not going to be good in all cases, e.g. when master
    // and meta are not colocated. Fix when this favored nodes feature is actually used someday.
    try (Connection connection = ConnectionFactory.createConnection(conf)) {
      try (Table metaTable = connection.getTable(TableName.META_TABLE_NAME)) {
        metaTable.put(puts);
      }
    }
    LOG.info("Added " + puts.size() + " regions in META");
  }

  /**
   * Generates and returns a Put containing the region info for the catalog table and the servers
   * @return Put object, or null if favoredNodeList is null
   */
  private static Put makePutFromRegionInfo(RegionInfo regionInfo, List<ServerName> favoredNodeList)
      throws IOException {
    Put put = null;
    if (favoredNodeList != null) {
      long time = EnvironmentEdgeManager.currentTime();
      put = MetaTableAccessor.makePutFromRegionInfo(regionInfo, time);
      byte[] favoredNodes = getFavoredNodes(favoredNodeList);
      put.add(CellBuilderFactory.create(CellBuilderType.SHALLOW_COPY)
          .setRow(put.getRow())
          .setFamily(HConstants.CATALOG_FAMILY)
          .setQualifier(FAVOREDNODES_QUALIFIER)
          .setTimestamp(time)
          .setType(Type.Put)
          .setValue(favoredNodes)
          .build());
      LOG.debug("Create the region {} with favored nodes {}", regionInfo.getRegionNameAsString(),
        favoredNodeList);
    }
    return put;
  }

  /**
   * @param favoredNodes The PB'ed bytes of favored nodes
   * @return the array of {@link ServerName} for the byte array of favored nodes.
   * @throws IOException if the byte array cannot be parsed
   */
  public static ServerName[] getFavoredNodesList(byte[] favoredNodes) throws IOException {
    FavoredNodes f = FavoredNodes.parseFrom(favoredNodes);
    List<HBaseProtos.ServerName> protoNodes = f.getFavoredNodeList();
    ServerName[] servers = new ServerName[protoNodes.size()];
    int i = 0;
    for (HBaseProtos.ServerName node : protoNodes) {
      servers[i++] = ProtobufUtil.toServerName(node);
    }
    return servers;
  }

  /**
   * @param serverAddrList the list of servers from which to build the favored nodes
   * @return PB'ed bytes of {@link FavoredNodes} generated by the server list.
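   * <p>
   * An illustrative round trip with {@link #getFavoredNodesList(byte[])}; note that start codes
   * are not preserved, since entries are serialized with {@link ServerName#NON_STARTCODE}:
   * <pre>{@code
   * byte[] bytes = FavoredNodeAssignmentHelper.getFavoredNodes(favoredNodeList);
   * ServerName[] restored = FavoredNodeAssignmentHelper.getFavoredNodesList(bytes);
   * }</pre>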
   */
  public static byte[] getFavoredNodes(List<ServerName> serverAddrList) {
    FavoredNodes.Builder f = FavoredNodes.newBuilder();
    for (ServerName s : serverAddrList) {
      HBaseProtos.ServerName.Builder b = HBaseProtos.ServerName.newBuilder();
      b.setHostName(s.getHostname());
      b.setPort(s.getPort());
      b.setStartCode(ServerName.NON_STARTCODE);
      f.addFavoredNode(b.build());
    }
    return f.build().toByteArray();
  }

  // Place the regions round-robin across the racks, picking one server from each
  // rack at a time. Start with a random rack, and a random server from every rack.
  // If a rack doesn't have enough servers, move on to the next rack, and so on,
  // for choosing a primary.
  // For example, with 4 racks (r1..r4) of 8 servers (s1..s8) each, one possible
  // placement could be r2:s5, r3:s5, r4:s5, r1:s5, r2:s6, r3:s6..
  // If one rack had fewer servers, say r3 with only 3 servers, one possible
  // placement could be r2:s5, <skip-r3>, r4:s5, r1:s5, r2:s6, <skip-r3> ...
  // The regions should be distributed proportionally to the rack sizes.
  public void placePrimaryRSAsRoundRobin(Map<ServerName, List<RegionInfo>> assignmentMap,
      Map<RegionInfo, ServerName> primaryRSMap, List<RegionInfo> regions) {
    List<String> rackList = new ArrayList<>(rackToRegionServerMap.size());
    rackList.addAll(rackToRegionServerMap.keySet());
    int rackIndex = random.nextInt(rackList.size());
    int maxRackSize = 0;
    for (Map.Entry<String, List<ServerName>> r : rackToRegionServerMap.entrySet()) {
      if (r.getValue().size() > maxRackSize) {
        maxRackSize = r.getValue().size();
      }
    }
    int numIterations = 0;
    // Initialize the current processing host index.
    int serverIndex = random.nextInt(maxRackSize);
    for (RegionInfo regionInfo : regions) {
      List<ServerName> currentServerList;
      String rackName;
      while (true) {
        rackName = rackList.get(rackIndex);
        numIterations++;
        // Get the server list for the current rack
        currentServerList = rackToRegionServerMap.get(rackName);

        if (serverIndex >= currentServerList.size()) {
          // Not enough machines in this rack
          if (numIterations % rackList.size() == 0) {
            if (++serverIndex >= maxRackSize) {
              serverIndex = 0;
            }
          }
          if ((++rackIndex) >= rackList.size()) {
            rackIndex = 0; // reset the rack index to 0
          }
        } else {
          break;
        }
      }

      // Get the current process region server
      ServerName currentServer = currentServerList.get(serverIndex);

      // Place the current region with the current primary region server
      primaryRSMap.put(regionInfo, currentServer);
      if (assignmentMap != null) {
        List<RegionInfo> regionsForServer = assignmentMap.get(currentServer);
        if (regionsForServer == null) {
          regionsForServer = new ArrayList<>();
          assignmentMap.put(currentServer, regionsForServer);
        }
        regionsForServer.add(regionInfo);
      }

      // Set the next processing index
      if (numIterations % rackList.size() == 0) {
        ++serverIndex;
      }
      if ((++rackIndex) >= rackList.size()) {
        rackIndex = 0; // reset the rack index to 0
      }
    }
  }

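  /**
   * For each region in primaryRSMap, pick a secondary and tertiary region server and return the
   * pair as a two-element array keyed by region. Regions whose favored nodes cannot be placed
   * are omitted from the returned map (a warning is logged).
   */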
  public Map<RegionInfo, ServerName[]> placeSecondaryAndTertiaryRS(
      Map<RegionInfo, ServerName> primaryRSMap) {
    Map<RegionInfo, ServerName[]> secondaryAndTertiaryMap = new HashMap<>();
    for (Map.Entry<RegionInfo, ServerName> entry : primaryRSMap.entrySet()) {
      // Get the target region and its primary region server rack
      RegionInfo regionInfo = entry.getKey();
      ServerName primaryRS = entry.getValue();
      try {
        // Create the secondary and tertiary region server pair object.
        ServerName[] favoredNodes = getSecondaryAndTertiary(regionInfo, primaryRS);
        if (favoredNodes != null) {
          secondaryAndTertiaryMap.put(regionInfo, favoredNodes);
          LOG.debug("Place the secondary and tertiary region server for region "
              + regionInfo.getRegionNameAsString());
        }
      } catch (Exception e) {
        LOG.warn("Cannot place the favored nodes for region " +
            regionInfo.getRegionNameAsString() + " because " + e, e);
      }
    }
    return secondaryAndTertiaryMap;
  }

  public ServerName[] getSecondaryAndTertiary(RegionInfo regionInfo, ServerName primaryRS)
      throws IOException {
    ServerName[] favoredNodes;
    // Get the rack for the primary region server
    String primaryRack = getRackOfServer(primaryRS);

    if (getTotalNumberOfRacks() == 1) {
      favoredNodes = singleRackCase(regionInfo, primaryRS, primaryRack);
    } else {
      favoredNodes = multiRackCase(regionInfo, primaryRS, primaryRack);
    }
    return favoredNodes;
  }

  private Map<ServerName, Set<RegionInfo>> mapRSToPrimaries(
      Map<RegionInfo, ServerName> primaryRSMap) {
    Map<ServerName, Set<RegionInfo>> primaryServerMap = new HashMap<>();
    for (Entry<RegionInfo, ServerName> e : primaryRSMap.entrySet()) {
      Set<RegionInfo> currentSet = primaryServerMap.get(e.getValue());
      if (currentSet == null) {
        currentSet = new HashSet<>();
      }
      currentSet.add(e.getKey());
      primaryServerMap.put(e.getValue(), currentSet);
    }
    return primaryServerMap;
  }

  /**
   * For regions that share the primary, avoid placing the secondary and tertiary
   * on the same RS. Used for generating new assignments for the
   * primary/secondary/tertiary RegionServers.
   * @param primaryRSMap map of regions to their primary region servers
   * @return the map of regions to the servers the region-files should be hosted on
   */
  public Map<RegionInfo, ServerName[]> placeSecondaryAndTertiaryWithRestrictions(
      Map<RegionInfo, ServerName> primaryRSMap) {
    Map<ServerName, Set<RegionInfo>> serverToPrimaries =
        mapRSToPrimaries(primaryRSMap);
    Map<RegionInfo, ServerName[]> secondaryAndTertiaryMap = new HashMap<>();

    for (Entry<RegionInfo, ServerName> entry : primaryRSMap.entrySet()) {
      // Get the target region and its primary region server rack
      RegionInfo regionInfo = entry.getKey();
      ServerName primaryRS = entry.getValue();
      try {
        // Get the rack for the primary region server
        String primaryRack = getRackOfServer(primaryRS);
        ServerName[] favoredNodes = null;
        if (getTotalNumberOfRacks() == 1) {
          // Single rack case: have to pick the secondary and tertiary
          // from the same rack
          favoredNodes = singleRackCase(regionInfo, primaryRS, primaryRack);
        } else {
          favoredNodes = multiRackCaseWithRestrictions(serverToPrimaries,
              secondaryAndTertiaryMap, primaryRack, primaryRS, regionInfo);
        }
        if (favoredNodes != null) {
          secondaryAndTertiaryMap.put(regionInfo, favoredNodes);
          LOG.debug("Place the secondary and tertiary region server for region "
              + regionInfo.getRegionNameAsString());
        }
      } catch (Exception e) {
        LOG.warn("Cannot place the favored nodes for region "
            + regionInfo.getRegionNameAsString() + " because " + e, e);
      }
    }
    return secondaryAndTertiaryMap;
  }

  private ServerName[] multiRackCaseWithRestrictions(
      Map<ServerName, Set<RegionInfo>> serverToPrimaries,
      Map<RegionInfo, ServerName[]> secondaryAndTertiaryMap,
      String primaryRack, ServerName primaryRS, RegionInfo regionInfo) throws IOException {
    // Randomly pick the secondary and tertiary region servers
    // from another rack to place the secondary and tertiary:
    // randomly choose one rack except for the current rack.
    Set<String> rackSkipSet = new HashSet<>();
    rackSkipSet.add(primaryRack);
    String secondaryRack = getOneRandomRack(rackSkipSet);
    List<ServerName> serverList = getServersFromRack(secondaryRack);
    Set<ServerName> serverSet = new HashSet<>();
    serverSet.addAll(serverList);
    ServerName[] favoredNodes;
    if (serverList.size() >= 2) {
      // Randomly pick up two servers from this secondary rack.
      // Skip the secondary for the tertiary placement, and
      // skip the servers which share the primary already.
      Set<RegionInfo> primaries = serverToPrimaries.get(primaryRS);
      Set<ServerName> skipServerSet = new HashSet<>();
      while (true) {
        ServerName[] secondaryAndTertiary = null;
        if (primaries.size() > 1) {
          // Check where the secondary and tertiary of the sharing regions are
          for (RegionInfo primary : primaries) {
            secondaryAndTertiary = secondaryAndTertiaryMap.get(primary);
            if (secondaryAndTertiary != null) {
              if (getRackOfServer(secondaryAndTertiary[0]).equals(secondaryRack)) {
                skipServerSet.add(secondaryAndTertiary[0]);
              }
              if (getRackOfServer(secondaryAndTertiary[1]).equals(secondaryRack)) {
                skipServerSet.add(secondaryAndTertiary[1]);
              }
            }
          }
        }
        if (skipServerSet.size() + 2 <= serverSet.size()) {
          break;
        }
        skipServerSet.clear();
        rackSkipSet.add(secondaryRack);
        // We used all racks
        if (rackSkipSet.size() == getTotalNumberOfRacks()) {
          // Remove the last two added and break
          if (secondaryAndTertiary != null) {
            skipServerSet.remove(secondaryAndTertiary[0]);
            skipServerSet.remove(secondaryAndTertiary[1]);
          }
          break;
        }
        secondaryRack = getOneRandomRack(rackSkipSet);
        serverList = getServersFromRack(secondaryRack);
        serverSet = new HashSet<>();
        serverSet.addAll(serverList);
      }

      // Place the secondary RS
      ServerName secondaryRS = getOneRandomServer(secondaryRack, skipServerSet);
      skipServerSet.add(secondaryRS);
      // Place the tertiary RS
      ServerName tertiaryRS = getOneRandomServer(secondaryRack, skipServerSet);

      if (secondaryRS == null || tertiaryRS == null) {
        LOG.error("Cannot place the secondary and tertiary"
            + " region server for region "
            + regionInfo.getRegionNameAsString());
      }
      // Create the secondary and tertiary pair
      favoredNodes = new ServerName[2];
      favoredNodes[0] = secondaryRS;
      favoredNodes[1] = tertiaryRS;
    } else {
      // Pick the secondary RS from this secondary rack
      // and pick the tertiary from another random rack
      favoredNodes = new ServerName[2];
      ServerName secondary = getOneRandomServer(secondaryRack);
      favoredNodes[0] = secondary;

      // Pick the tertiary
      if (getTotalNumberOfRacks() == 2) {
        // Pick the tertiary from the same rack as the primary RS
        Set<ServerName> serverSkipSet = new HashSet<>();
        serverSkipSet.add(primaryRS);
        favoredNodes[1] = getOneRandomServer(primaryRack, serverSkipSet);
      } else {
        // Pick the tertiary from another rack
        rackSkipSet.add(secondaryRack);
        String tertiaryRandomRack = getOneRandomRack(rackSkipSet);
        favoredNodes[1] = getOneRandomServer(tertiaryRandomRack);
      }
    }
    return favoredNodes;
  }

  private ServerName[] singleRackCase(RegionInfo regionInfo,
      ServerName primaryRS,
      String primaryRack) throws IOException {
    // Single rack case: have to pick the secondary and tertiary
    // from the same rack
    List<ServerName> serverList = getServersFromRack(primaryRack);
    if ((serverList == null) || (serverList.size() <= 2)) {
      // Single region server case: cannot place the favored nodes
      // on any server
      return null;
    } else {
      // Randomly select two region servers from the server list and make sure
      // they do not overlap with the primary region server
      Set<ServerName> serverSkipSet = new HashSet<>();
      serverSkipSet.add(primaryRS);

      // Place the secondary RS
      ServerName secondaryRS = getOneRandomServer(primaryRack, serverSkipSet);
      // Skip the secondary for the tertiary placement
      serverSkipSet.add(secondaryRS);
      ServerName tertiaryRS = getOneRandomServer(primaryRack, serverSkipSet);

      if (secondaryRS == null || tertiaryRS == null) {
        LOG.error("Cannot place the secondary, tertiary favored node for region " +
            regionInfo.getRegionNameAsString());
      }
      // Create the secondary and tertiary pair
      ServerName[] favoredNodes = new ServerName[2];
      favoredNodes[0] = secondaryRS;
      favoredNodes[1] = tertiaryRS;
      return favoredNodes;
    }
  }

  /**
   * Place secondary and tertiary nodes in a multi rack case.
   * If there are only two racks, then we try to place the secondary
   * and tertiary on a different rack than the primary. But if the other rack has
   * only one region server, then we place primary and tertiary on one rack
   * and secondary on another. The aim is to distribute the three favored nodes
   * across >= 2 racks.
   * TODO: see how we can use generateMissingFavoredNodeMultiRack API here
   * @param regionInfo Region for which we are trying to generate FN
   * @param primaryRS The primary favored node.
   * @param primaryRack The rack of the primary favored node.
   * @return Array containing secondary and tertiary favored nodes.
   * @throws IOException Signals that an I/O exception has occurred.
   */
  private ServerName[] multiRackCase(RegionInfo regionInfo, ServerName primaryRS,
      String primaryRack) throws IOException {

    List<ServerName> favoredNodes = Lists.newArrayList(primaryRS);
    // Create the secondary and tertiary pair
    ServerName secondaryRS = generateMissingFavoredNodeMultiRack(favoredNodes);
    favoredNodes.add(secondaryRS);
    String secondaryRack = getRackOfServer(secondaryRS);

    ServerName tertiaryRS;
    if (primaryRack.equals(secondaryRack)) {
      tertiaryRS = generateMissingFavoredNode(favoredNodes);
    } else {
      // Try to place tertiary in secondary RS rack else place on primary rack.
      tertiaryRS = getOneRandomServer(secondaryRack, Sets.newHashSet(secondaryRS));
      if (tertiaryRS == null) {
        tertiaryRS = getOneRandomServer(primaryRack, Sets.newHashSet(primaryRS));
      }
      // We couldn't find anything in the secondary rack, get any FN
      if (tertiaryRS == null) {
        tertiaryRS = generateMissingFavoredNode(Lists.newArrayList(primaryRS, secondaryRS));
      }
    }
    return new ServerName[] { secondaryRS, tertiaryRS };
  }

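  /**
   * @return true if the cluster has at least {@link #FAVORED_NODES_NUM} servers, i.e. enough
   *         to generate a full favored node set.
   */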
  public boolean canPlaceFavoredNodes() {
    return (this.servers.size() >= FAVORED_NODES_NUM);
  }

  private int getTotalNumberOfRacks() {
    return this.uniqueRackList.size();
  }

  private List<ServerName> getServersFromRack(String rack) {
    return this.rackToRegionServerMap.get(rack);
  }

  /**
   * Gets a random server from the specified rack and skips anything specified.
   * @param rack rack from which a server is needed
   * @param skipServerSet servers that should not be chosen
   */
  protected ServerName getOneRandomServer(String rack, Set<ServerName> skipServerSet) {
    // Is the rack valid? Do we recognize it?
    if (rack == null || getServersFromRack(rack) == null ||
        getServersFromRack(rack).isEmpty()) {
      return null;
    }

    // Let's use a set so we can eliminate duplicates
    Set<StartcodeAgnosticServerName> serversToChooseFrom = Sets.newHashSet();
    for (ServerName sn : getServersFromRack(rack)) {
      serversToChooseFrom.add(StartcodeAgnosticServerName.valueOf(sn));
    }

    if (skipServerSet != null && skipServerSet.size() > 0) {
      for (ServerName sn : skipServerSet) {
        serversToChooseFrom.remove(StartcodeAgnosticServerName.valueOf(sn));
      }
      // Do we have any servers left to choose from?
      if (serversToChooseFrom.isEmpty()) {
        return null;
      }
    }

    ServerName randomServer = null;
    int randomIndex = random.nextInt(serversToChooseFrom.size());
    int j = 0;
    for (StartcodeAgnosticServerName sn : serversToChooseFrom) {
      if (j == randomIndex) {
        randomServer = sn;
        break;
      }
      j++;
    }

    if (randomServer != null) {
      return ServerName.valueOf(randomServer.getHostAndPort(), randomServer.getStartcode());
    } else {
      return null;
    }
  }

  private ServerName getOneRandomServer(String rack) throws IOException {
    return this.getOneRandomServer(rack, null);
  }

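  /**
   * Gets a random rack that is not in the given skip set.
   * @param skipRackSet racks that must not be chosen
   * @return a random rack outside skipRackSet
   * @throws IOException if no rack outside skipRackSet is available
   */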
  protected String getOneRandomRack(Set<String> skipRackSet) throws IOException {
    if (skipRackSet == null || uniqueRackList.size() <= skipRackSet.size()) {
      throw new IOException("Cannot randomly pick another random rack");
    }

    String randomRack;
    do {
      int randomIndex = random.nextInt(this.uniqueRackList.size());
      randomRack = this.uniqueRackList.get(randomIndex);
    } while (skipRackSet.contains(randomRack));

    return randomRack;
  }

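  /**
   * Renders the given favored nodes as a semicolon-separated string of host:port pairs,
   * e.g. "host1:16020;host2:16020" (hostnames and port purely illustrative).
   */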
  public static String getFavoredNodesAsString(List<ServerName> nodes) {
    StringBuilder strBuf = new StringBuilder();
    int i = 0;
    for (ServerName node : nodes) {
      strBuf.append(node.getHostAndPort());
      if (++i != nodes.size()) {
        strBuf.append(";");
      }
    }
    return strBuf.toString();
  }

  /*
   * Generates a missing favored node based on the input favored nodes. This helps to generate
   * new FN when there are already 2 FN and we need a third one. For example, while generating
   * new FN for split daughters after inheriting 2 FN from the parent. If the cluster has only
   * one rack it generates from the same rack. If the cluster has multiple racks, then it
   * ensures the new FN respects the rack constraints similar to HDFS. For example, if there
   * are 3 FN, they will be spread across 2 racks.
   */
  public ServerName generateMissingFavoredNode(List<ServerName> favoredNodes) throws IOException {
    if (this.uniqueRackList.size() == 1) {
      return generateMissingFavoredNodeSingleRack(favoredNodes, null);
    } else {
      return generateMissingFavoredNodeMultiRack(favoredNodes, null);
    }
  }

  public ServerName generateMissingFavoredNode(List<ServerName> favoredNodes,
      List<ServerName> excludeNodes) throws IOException {
    if (this.uniqueRackList.size() == 1) {
      return generateMissingFavoredNodeSingleRack(favoredNodes, excludeNodes);
    } else {
      return generateMissingFavoredNodeMultiRack(favoredNodes, excludeNodes);
    }
  }

  /*
   * Generate FN for a single rack scenario; don't generate from one of the excluded nodes.
   * Helps when we would like to find a replacement node.
   */
  private ServerName generateMissingFavoredNodeSingleRack(List<ServerName> favoredNodes,
      List<ServerName> excludeNodes) throws IOException {
    ServerName newServer = null;
    Set<ServerName> excludeFNSet = Sets.newHashSet(favoredNodes);
    if (excludeNodes != null && excludeNodes.size() > 0) {
      excludeFNSet.addAll(excludeNodes);
    }
    if (favoredNodes.size() < FAVORED_NODES_NUM) {
      newServer = this.getOneRandomServer(this.uniqueRackList.get(0), excludeFNSet);
    }
    return newServer;
  }

  private ServerName generateMissingFavoredNodeMultiRack(List<ServerName> favoredNodes)
      throws IOException {
    return generateMissingFavoredNodeMultiRack(favoredNodes, null);
  }

  /*
   * Generates a missing FN based on the input favoredNodes and also the nodes to be skipped.
   *
   * Get the current layout of favored nodes arrangement and nodes to be excluded and get a
   * random node that goes with HDFS block placement. E.g. if the existing nodes are on one
   * rack, generate one from another rack. We exclude as much as possible so the random
   * selection has more chance to generate a node within a few iterations, ideally 1.
   */
  private ServerName generateMissingFavoredNodeMultiRack(List<ServerName> favoredNodes,
      List<ServerName> excludeNodes) throws IOException {

    Set<String> racks = Sets.newHashSet();
    Map<String, Set<ServerName>> rackToFNMapping = new HashMap<>();

    // Let's understand the current rack distribution of the FN
    for (ServerName sn : favoredNodes) {
      String rack = getRackOfServer(sn);
      racks.add(rack);

      Set<ServerName> serversInRack = rackToFNMapping.get(rack);
      if (serversInRack == null) {
        serversInRack = Sets.newHashSet();
        rackToFNMapping.put(rack, serversInRack);
      }
      serversInRack.add(sn);
    }

    // What racks should be skipped while getting a FN?
    Set<String> skipRackSet = Sets.newHashSet();

    /*
     * If all the FN are from the same rack, then we don't want to generate another FN on the
     * same rack. If that rack fails, the region would be unavailable.
     */
    if (racks.size() == 1 && favoredNodes.size() > 1) {
      skipRackSet.add(racks.iterator().next());
    }

    /*
     * If there are no free nodes on the existing racks, we should skip those racks too. We can
     * reduce the number of iterations for FN selection.
     */
    for (String rack : racks) {
      if (getServersFromRack(rack) != null &&
        rackToFNMapping.get(rack).size() == getServersFromRack(rack).size()) {
        skipRackSet.add(rack);
      }
    }

    Set<ServerName> favoredNodeSet = Sets.newHashSet(favoredNodes);
    if (excludeNodes != null && excludeNodes.size() > 0) {
      favoredNodeSet.addAll(excludeNodes);
    }

    /*
     * Let's get a random rack by excluding skipRackSet and generate a random FN from that rack.
     */
    int i = 0;
    Set<String> randomRacks = Sets.newHashSet();
    ServerName newServer = null;
    do {
      String randomRack = this.getOneRandomRack(skipRackSet);
      newServer = this.getOneRandomServer(randomRack, favoredNodeSet);
      randomRacks.add(randomRack);
      i++;
    } while ((i < MAX_ATTEMPTS_FN_GENERATION) && (newServer == null));

    if (newServer == null) {
      if (LOG.isTraceEnabled()) {
        LOG.trace(String.format("Unable to generate additional favored nodes for %s after "
            + "considering racks %s and skip rack %s with a unique rack list of %s and rack "
            + "to RS map of %s and RS to rack map of %s",
          StringUtils.join(favoredNodes, ","), randomRacks, skipRackSet, uniqueRackList,
          rackToRegionServerMap, regionServerToRackMap));
      }
      throw new IOException("Unable to generate additional favored nodes for "
          + StringUtils.join(favoredNodes, ","));
    }
    return newServer;
  }

  /*
   * Generate favored nodes for a region.
   *
   * Choose a random server as primary and then choose secondary and tertiary FN so they are
   * spread across two racks.
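   *
   * An illustrative call ("hri" being a hypothetical RegionInfo in scope):
   *   List<ServerName> fn = helper.generateFavoredNodes(hri);
   *   // fn.get(0) is the primary; all entries carry ServerName.NON_STARTCODE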
   */
  public List<ServerName> generateFavoredNodes(RegionInfo hri) throws IOException {

    List<ServerName> favoredNodesForRegion = new ArrayList<>(FAVORED_NODES_NUM);
    ServerName primary = servers.get(random.nextInt(servers.size()));
    favoredNodesForRegion.add(ServerName.valueOf(primary.getHostAndPort(),
        ServerName.NON_STARTCODE));

    Map<RegionInfo, ServerName> primaryRSMap = new HashMap<>(1);
    primaryRSMap.put(hri, primary);
    Map<RegionInfo, ServerName[]> secondaryAndTertiaryRSMap =
        placeSecondaryAndTertiaryRS(primaryRSMap);
    ServerName[] secondaryAndTertiaryNodes = secondaryAndTertiaryRSMap.get(hri);
    if (secondaryAndTertiaryNodes != null && secondaryAndTertiaryNodes.length == 2) {
      for (ServerName sn : secondaryAndTertiaryNodes) {
        favoredNodesForRegion.add(ServerName.valueOf(sn.getHostAndPort(),
            ServerName.NON_STARTCODE));
      }
      return favoredNodesForRegion;
    } else {
      throw new HBaseIOException("Unable to generate secondary and tertiary favored nodes.");
    }
  }

  public Map<RegionInfo, List<ServerName>> generateFavoredNodesRoundRobin(
      Map<ServerName, List<RegionInfo>> assignmentMap, List<RegionInfo> regions)
      throws IOException {

    if (regions.size() > 0) {
      if (canPlaceFavoredNodes()) {
        Map<RegionInfo, ServerName> primaryRSMap = new HashMap<>();
        // Let's try to have an equal distribution for primary favored nodes
        placePrimaryRSAsRoundRobin(assignmentMap, primaryRSMap, regions);
        return generateFavoredNodes(primaryRSMap);
      } else {
        throw new HBaseIOException("Not enough nodes to generate favored nodes");
      }
    }
    return null;
  }

  /*
   * Generate favored nodes for a set of regions when we know where they are currently hosted.
   */
  private Map<RegionInfo, List<ServerName>> generateFavoredNodes(
      Map<RegionInfo, ServerName> primaryRSMap) {

    Map<RegionInfo, List<ServerName>> generatedFavNodes = new HashMap<>();
    Map<RegionInfo, ServerName[]> secondaryAndTertiaryRSMap =
      placeSecondaryAndTertiaryRS(primaryRSMap);

    for (Entry<RegionInfo, ServerName> entry : primaryRSMap.entrySet()) {
      List<ServerName> favoredNodesForRegion = new ArrayList<>(FAVORED_NODES_NUM);
      RegionInfo region = entry.getKey();
      ServerName primarySN = entry.getValue();
      favoredNodesForRegion.add(ServerName.valueOf(primarySN.getHostname(), primarySN.getPort(),
        NON_STARTCODE));
      ServerName[] secondaryAndTertiaryNodes = secondaryAndTertiaryRSMap.get(region);
      if (secondaryAndTertiaryNodes != null) {
        favoredNodesForRegion.add(ServerName.valueOf(
          secondaryAndTertiaryNodes[0].getHostname(), secondaryAndTertiaryNodes[0].getPort(),
          NON_STARTCODE));
        favoredNodesForRegion.add(ServerName.valueOf(
          secondaryAndTertiaryNodes[1].getHostname(), secondaryAndTertiaryNodes[1].getPort(),
          NON_STARTCODE));
      }
      generatedFavNodes.put(region, favoredNodesForRegion);
    }
    return generatedFavNodes;
  }

  /*
   * Get the rack of a server from the local mapping when present; saves a lookup by the
   * RackManager.
   */
  private String getRackOfServer(ServerName sn) {
    if (this.regionServerToRackMap.containsKey(sn.getHostname())) {
      return this.regionServerToRackMap.get(sn.getHostname());
    } else {
      String rack = this.rackManager.getRack(sn);
      this.regionServerToRackMap.put(sn.getHostname(), rack);
      return rack;
    }
  }
}