/**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hbase.favored;

import static org.apache.hadoop.hbase.ServerName.NON_STARTCODE;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Random;
import java.util.Set;

import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell.Type;
import org.apache.hadoop.hbase.CellBuilderFactory;
import org.apache.hadoop.hbase.CellBuilderType;
import org.apache.hadoop.hbase.HBaseIOException;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.MetaTableAccessor;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.master.RackManager;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hbase.thirdparty.com.google.common.collect.Lists;
import org.apache.hbase.thirdparty.com.google.common.collect.Sets;
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.shaded.protobuf.generated.HBaseProtos;
import org.apache.hadoop.hbase.shaded.protobuf.generated.HBaseProtos.FavoredNodes;

/**
 * Helper class for {@link FavoredNodeLoadBalancer} that has all the intelligence for racks,
 * meta scans, etc. Instantiated by the {@link FavoredNodeLoadBalancer} when needed (from
 * within calls like {@link FavoredNodeLoadBalancer#randomAssignment(RegionInfo, List)}).
 * All updates to favored nodes should only be done from {@link FavoredNodesManager} and not
 * through this helper class (except for tests).
 */
@InterfaceAudience.Private
public class FavoredNodeAssignmentHelper {
  private static final Logger LOG = LoggerFactory.getLogger(FavoredNodeAssignmentHelper.class);
  private RackManager rackManager;
  private Map<String, List<ServerName>> rackToRegionServerMap;
  private List<String> uniqueRackList;
  // This map serves as a cache for rack to region server lookups. The number of
  // region server entries might not match what is in servers.
  private Map<String, String> regionServerToRackMap;
  private Random random;
  private List<ServerName> servers;
  public static final byte[] FAVOREDNODES_QUALIFIER = Bytes.toBytes("fn");
  public final static short FAVORED_NODES_NUM = 3;
  public final static short MAX_ATTEMPTS_FN_GENERATION = 10;
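
  // Note (context, not a constraint enforced here): three favored nodes per region lines up
  // with the default HDFS replication factor, so each replica of a region's files can be kept
  // on one favored node. See the HDFS block placement discussion in the comments further down.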

  public FavoredNodeAssignmentHelper(final List<ServerName> servers, Configuration conf) {
    this(servers, new RackManager(conf));
  }

  public FavoredNodeAssignmentHelper(final List<ServerName> servers,
      final RackManager rackManager) {
    this.servers = servers;
    this.rackManager = rackManager;
    this.rackToRegionServerMap = new HashMap<>();
    this.regionServerToRackMap = new HashMap<>();
    this.uniqueRackList = new ArrayList<>();
    this.random = new Random();
  }

  // Always initialize() when FavoredNodeAssignmentHelper is constructed.
  public void initialize() {
    for (ServerName sn : this.servers) {
      String rackName = getRackOfServer(sn);
      List<ServerName> serverList = this.rackToRegionServerMap.get(rackName);
      if (serverList == null) {
        serverList = Lists.newArrayList();
        // Add the current rack to the unique rack list
        this.uniqueRackList.add(rackName);
        this.rackToRegionServerMap.put(rackName, serverList);
      }
      boolean alreadyPresent = false;
      for (ServerName serverName : serverList) {
        if (ServerName.isSameAddress(sn, serverName)) {
          alreadyPresent = true;
          break;
        }
      }
      if (!alreadyPresent) {
        // Only add the server if the same host and port is not already present.
        serverList.add(sn);
      }
      this.regionServerToRackMap.put(sn.getHostname(), rackName);
    }
  }
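
  // Illustrative sketch of what initialize() builds, assuming hypothetical servers s1..s3
  // where s1 and s2 are on rack /r1 and s3 is on rack /r2:
  //   rackToRegionServerMap = { "/r1" -> [s1, s2], "/r2" -> [s3] }
  //   uniqueRackList        = [ "/r1", "/r2" ]
  //   regionServerToRackMap = { "s1" -> "/r1", "s2" -> "/r1", "s3" -> "/r2" }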

  /**
   * Update meta table with favored nodes info
   * @param regionToFavoredNodes map of RegionInfo's to their favored nodes
   * @param connection connection to be used
   * @throws IOException if a remote or network exception occurs
   */
  public static void updateMetaWithFavoredNodesInfo(
      Map<RegionInfo, List<ServerName>> regionToFavoredNodes,
      Connection connection) throws IOException {
    List<Put> puts = new ArrayList<>();
    for (Map.Entry<RegionInfo, List<ServerName>> entry : regionToFavoredNodes.entrySet()) {
      Put put = makePutFromRegionInfo(entry.getKey(), entry.getValue());
      if (put != null) {
        puts.add(put);
      }
    }
    MetaTableAccessor.putsToMetaTable(connection, puts);
    LOG.info("Added " + puts.size() + " regions in META");
  }

  /**
   * Update meta table with favored nodes info
   * @param regionToFavoredNodes map of RegionInfo's to their favored nodes
   * @param conf configuration used to create a connection to the meta table
   * @throws IOException if a remote or network exception occurs
   */
  public static void updateMetaWithFavoredNodesInfo(
      Map<RegionInfo, List<ServerName>> regionToFavoredNodes,
      Configuration conf) throws IOException {
    List<Put> puts = new ArrayList<>();
    for (Map.Entry<RegionInfo, List<ServerName>> entry : regionToFavoredNodes.entrySet()) {
      Put put = makePutFromRegionInfo(entry.getKey(), entry.getValue());
      if (put != null) {
        puts.add(put);
      }
    }
    // Write the region assignments to the meta table.
    // TODO: The override above takes a Connection rather than a Configuration, but that
    // Connection is a short-circuit connection. That is not going to be good in all cases, when
    // master and meta are not colocated. Fix when this favored nodes feature is actually used
    // someday.
    try (Connection connection = ConnectionFactory.createConnection(conf)) {
      try (Table metaTable = connection.getTable(TableName.META_TABLE_NAME)) {
        metaTable.put(puts);
      }
    }
    LOG.info("Added " + puts.size() + " regions in META");
  }

  /**
   * Generates and returns a Put containing the region info for the catalog table and the servers
   * @return Put object
   */
  private static Put makePutFromRegionInfo(RegionInfo regionInfo, List<ServerName> favoredNodeList)
      throws IOException {
    Put put = null;
    if (favoredNodeList != null) {
      long time = EnvironmentEdgeManager.currentTime();
      put = MetaTableAccessor.makePutFromRegionInfo(regionInfo, time);
      byte[] favoredNodes = getFavoredNodes(favoredNodeList);
      put.add(CellBuilderFactory.create(CellBuilderType.SHALLOW_COPY)
          .setRow(put.getRow())
          .setFamily(HConstants.CATALOG_FAMILY)
          .setQualifier(FAVOREDNODES_QUALIFIER)
          .setTimestamp(time)
          .setType(Type.Put)
          .setValue(favoredNodes)
          .build());
      LOG.debug("Create the region {} with favored nodes {}", regionInfo.getRegionNameAsString(),
          favoredNodeList);
    }
    return put;
  }

  /**
   * @param favoredNodes The PB'ed bytes of favored nodes
   * @return the array of {@link ServerName} for the byte array of favored nodes.
   * @throws IOException if the byte array cannot be parsed
   */
  public static ServerName[] getFavoredNodesList(byte[] favoredNodes) throws IOException {
    FavoredNodes f = FavoredNodes.parseFrom(favoredNodes);
    List<HBaseProtos.ServerName> protoNodes = f.getFavoredNodeList();
    ServerName[] servers = new ServerName[protoNodes.size()];
    int i = 0;
    for (HBaseProtos.ServerName node : protoNodes) {
      servers[i++] = ProtobufUtil.toServerName(node);
    }
    return servers;
  }

  /**
   * @param serverAddrList a list of server addresses
   * @return PB'ed bytes of {@link FavoredNodes} generated by the server list.
   */
  public static byte[] getFavoredNodes(List<ServerName> serverAddrList) {
    FavoredNodes.Builder f = FavoredNodes.newBuilder();
    for (ServerName s : serverAddrList) {
      HBaseProtos.ServerName.Builder b = HBaseProtos.ServerName.newBuilder();
      b.setHostName(s.getHostname());
      b.setPort(s.getPort());
      b.setStartCode(ServerName.NON_STARTCODE);
      f.addFavoredNode(b.build());
    }
    return f.build().toByteArray();
  }
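
  // Illustrative round trip through the PB encoding above (hostnames are hypothetical). Note
  // that start codes are deliberately dropped to NON_STARTCODE on the way in:
  //   List<ServerName> fn = Lists.newArrayList(
  //       ServerName.valueOf("host1.example.com", 16020, ServerName.NON_STARTCODE),
  //       ServerName.valueOf("host2.example.com", 16020, ServerName.NON_STARTCODE),
  //       ServerName.valueOf("host3.example.com", 16020, ServerName.NON_STARTCODE));
  //   byte[] bytes = getFavoredNodes(fn);
  //   ServerName[] decoded = getFavoredNodesList(bytes); // the same three servers back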

  // Place the regions round-robin across the racks, picking one server from each
  // rack at a time. Start with a random rack, and a random server from every rack.
  // If a rack doesn't have enough servers, it will go to the next rack and so on
  // for choosing a primary.
  // For example, if 4 racks (r1 .. r4) with 8 servers (s1..s8) each, one possible
  // placement could be r2:s5, r3:s5, r4:s5, r1:s5, r2:s6, r3:s6 ..
  // If there were fewer servers in one rack, say r3, which had 3 servers, one possible
  // placement could be r2:s5, <skip-r3>, r4:s5, r1:s5, r2:s6, <skip-r3> ...
  // The regions should be distributed proportionately to the rack sizes.
  public void placePrimaryRSAsRoundRobin(Map<ServerName, List<RegionInfo>> assignmentMap,
      Map<RegionInfo, ServerName> primaryRSMap, List<RegionInfo> regions) {
    List<String> rackList = new ArrayList<>(rackToRegionServerMap.size());
    rackList.addAll(rackToRegionServerMap.keySet());
    int rackIndex = random.nextInt(rackList.size());
    int maxRackSize = 0;
    for (Map.Entry<String, List<ServerName>> r : rackToRegionServerMap.entrySet()) {
      if (r.getValue().size() > maxRackSize) {
        maxRackSize = r.getValue().size();
      }
    }
    int numIterations = 0;
    // Initialize the current processing host index.
    int serverIndex = random.nextInt(maxRackSize);
    for (RegionInfo regionInfo : regions) {
      List<ServerName> currentServerList;
      String rackName;
      while (true) {
        rackName = rackList.get(rackIndex);
        numIterations++;
        // Get the server list for the current rack
        currentServerList = rackToRegionServerMap.get(rackName);

        if (serverIndex >= currentServerList.size()) { // not enough machines in this rack
          if (numIterations % rackList.size() == 0) {
            if (++serverIndex >= maxRackSize) {
              serverIndex = 0;
            }
          }
          if ((++rackIndex) >= rackList.size()) {
            rackIndex = 0; // reset the rack index to 0
          }
        } else {
          break;
        }
      }

      // Get the current process region server
      ServerName currentServer = currentServerList.get(serverIndex);

      // Place the current region with the current primary region server
      primaryRSMap.put(regionInfo, currentServer);
      if (assignmentMap != null) {
        List<RegionInfo> regionsForServer = assignmentMap.get(currentServer);
        if (regionsForServer == null) {
          regionsForServer = new ArrayList<>();
          assignmentMap.put(currentServer, regionsForServer);
        }
        regionsForServer.add(regionInfo);
      }

      // Set the next processing index
      if (numIterations % rackList.size() == 0) {
        ++serverIndex;
      }
      if ((++rackIndex) >= rackList.size()) {
        rackIndex = 0; // reset the rack index to 0
      }
    }
  }

  public Map<RegionInfo, ServerName[]> placeSecondaryAndTertiaryRS(
      Map<RegionInfo, ServerName> primaryRSMap) {
    Map<RegionInfo, ServerName[]> secondaryAndTertiaryMap = new HashMap<>();
    for (Map.Entry<RegionInfo, ServerName> entry : primaryRSMap.entrySet()) {
      // Get the target region and its primary region server rack
      RegionInfo regionInfo = entry.getKey();
      ServerName primaryRS = entry.getValue();
      try {
        // Create the secondary and tertiary region server pair object.
        ServerName[] favoredNodes = getSecondaryAndTertiary(regionInfo, primaryRS);
        if (favoredNodes != null) {
          secondaryAndTertiaryMap.put(regionInfo, favoredNodes);
          LOG.debug("Place the secondary and tertiary region server for region "
              + regionInfo.getRegionNameAsString());
        }
      } catch (Exception e) {
        LOG.warn("Cannot place the favored nodes for region "
            + regionInfo.getRegionNameAsString() + " because " + e, e);
        continue;
      }
    }
    return secondaryAndTertiaryMap;
  }

  public ServerName[] getSecondaryAndTertiary(RegionInfo regionInfo, ServerName primaryRS)
      throws IOException {
    // Get the rack for the primary region server
    String primaryRack = getRackOfServer(primaryRS);

    ServerName[] favoredNodes;
    if (getTotalNumberOfRacks() == 1) {
      favoredNodes = singleRackCase(regionInfo, primaryRS, primaryRack);
    } else {
      favoredNodes = multiRackCase(regionInfo, primaryRS, primaryRack);
    }
    return favoredNodes;
  }

  private Map<ServerName, Set<RegionInfo>> mapRSToPrimaries(
      Map<RegionInfo, ServerName> primaryRSMap) {
    Map<ServerName, Set<RegionInfo>> primaryServerMap = new HashMap<>();
    for (Entry<RegionInfo, ServerName> e : primaryRSMap.entrySet()) {
      Set<RegionInfo> currentSet = primaryServerMap.get(e.getValue());
      if (currentSet == null) {
        currentSet = new HashSet<>();
      }
      currentSet.add(e.getKey());
      primaryServerMap.put(e.getValue(), currentSet);
    }
    return primaryServerMap;
  }

  /**
   * For regions that share the primary, avoid placing the secondary and tertiary
   * on the same RS. Used for generating new assignments for the
   * primary/secondary/tertiary RegionServers.
   * @param primaryRSMap the map of regions to their primary region servers
   * @return the map of regions to the servers the region-files should be hosted on
   */
  public Map<RegionInfo, ServerName[]> placeSecondaryAndTertiaryWithRestrictions(
      Map<RegionInfo, ServerName> primaryRSMap) {
    Map<ServerName, Set<RegionInfo>> serverToPrimaries = mapRSToPrimaries(primaryRSMap);
    Map<RegionInfo, ServerName[]> secondaryAndTertiaryMap = new HashMap<>();

    for (Entry<RegionInfo, ServerName> entry : primaryRSMap.entrySet()) {
      // Get the target region and its primary region server rack
      RegionInfo regionInfo = entry.getKey();
      ServerName primaryRS = entry.getValue();
      try {
        // Get the rack for the primary region server
        String primaryRack = getRackOfServer(primaryRS);
        ServerName[] favoredNodes = null;
        if (getTotalNumberOfRacks() == 1) {
          // Single rack case: have to pick the secondary and tertiary
          // from the same rack
          favoredNodes = singleRackCase(regionInfo, primaryRS, primaryRack);
        } else {
          favoredNodes = multiRackCaseWithRestrictions(serverToPrimaries,
              secondaryAndTertiaryMap, primaryRack, primaryRS, regionInfo);
        }
        if (favoredNodes != null) {
          secondaryAndTertiaryMap.put(regionInfo, favoredNodes);
          LOG.debug("Place the secondary and tertiary region server for region "
              + regionInfo.getRegionNameAsString());
        }
      } catch (Exception e) {
        LOG.warn("Cannot place the favored nodes for region "
            + regionInfo.getRegionNameAsString() + " because " + e, e);
        continue;
      }
    }
    return secondaryAndTertiaryMap;
  }
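
  // Worked example of the restriction above (server and rack names are hypothetical): if
  // regions R1 and R2 both have s1 as primary, and R1 was already given secondary/tertiary
  // s2/s3 on rack /r2, then while R2's secondary and tertiary are being picked from /r2 the
  // servers s2 and s3 are skipped. This spreads the secondaries and tertiaries of regions
  // that share a primary across different servers.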

  private ServerName[] multiRackCaseWithRestrictions(
      Map<ServerName, Set<RegionInfo>> serverToPrimaries,
      Map<RegionInfo, ServerName[]> secondaryAndTertiaryMap,
      String primaryRack, ServerName primaryRS, RegionInfo regionInfo) throws IOException {
    // Randomly choose the secondary and tertiary region server
    // from another rack to place the secondary and tertiary.
    // Randomly choose one rack except for the current rack.
    Set<String> rackSkipSet = new HashSet<>();
    rackSkipSet.add(primaryRack);
    String secondaryRack = getOneRandomRack(rackSkipSet);
    List<ServerName> serverList = getServersFromRack(secondaryRack);
    Set<ServerName> serverSet = new HashSet<>(serverList);
    ServerName[] favoredNodes;
    if (serverList.size() >= 2) {
      // Randomly pick up two servers from this secondary rack
      // Skip the secondary for the tertiary placement
      // Skip the servers which share the primary already
      Set<RegionInfo> primaries = serverToPrimaries.get(primaryRS);
      Set<ServerName> skipServerSet = new HashSet<>();
      while (true) {
        ServerName[] secondaryAndTertiary = null;
        if (primaries.size() > 1) {
          // Check where the secondaries and tertiaries of the other regions
          // sharing this primary have already been placed
          for (RegionInfo primary : primaries) {
            secondaryAndTertiary = secondaryAndTertiaryMap.get(primary);
            if (secondaryAndTertiary != null) {
              if (getRackOfServer(secondaryAndTertiary[0]).equals(secondaryRack)) {
                skipServerSet.add(secondaryAndTertiary[0]);
              }
              if (getRackOfServer(secondaryAndTertiary[1]).equals(secondaryRack)) {
                skipServerSet.add(secondaryAndTertiary[1]);
              }
            }
          }
        }
        if (skipServerSet.size() + 2 <= serverSet.size()) {
          break;
        }
        skipServerSet.clear();
        rackSkipSet.add(secondaryRack);
        // We used all racks
        if (rackSkipSet.size() == getTotalNumberOfRacks()) {
          // Remove the last two added and break
          if (secondaryAndTertiary != null) {
            skipServerSet.remove(secondaryAndTertiary[0]);
            skipServerSet.remove(secondaryAndTertiary[1]);
          }
          break;
        }
        secondaryRack = getOneRandomRack(rackSkipSet);
        serverList = getServersFromRack(secondaryRack);
        serverSet = new HashSet<>(serverList);
      }

      // Place the secondary RS
      ServerName secondaryRS = getOneRandomServer(secondaryRack, skipServerSet);
      skipServerSet.add(secondaryRS);
      // Place the tertiary RS
      ServerName tertiaryRS = getOneRandomServer(secondaryRack, skipServerSet);

      if (secondaryRS == null || tertiaryRS == null) {
        LOG.error("Cannot place the secondary and tertiary"
            + " region server for region "
            + regionInfo.getRegionNameAsString());
      }
      // Create the secondary and tertiary pair
      favoredNodes = new ServerName[2];
      favoredNodes[0] = secondaryRS;
      favoredNodes[1] = tertiaryRS;
    } else {
      // Pick the secondary RS from this secondary rack
      // and pick the tertiary from another random rack
      favoredNodes = new ServerName[2];
      ServerName secondary = getOneRandomServer(secondaryRack);
      favoredNodes[0] = secondary;

      // Pick the tertiary
      if (getTotalNumberOfRacks() == 2) {
        // Pick the tertiary from the same rack as the primary RS
        Set<ServerName> serverSkipSet = new HashSet<>();
        serverSkipSet.add(primaryRS);
        favoredNodes[1] = getOneRandomServer(primaryRack, serverSkipSet);
      } else {
        // Pick the tertiary from another rack
        rackSkipSet.add(secondaryRack);
        String tertiaryRandomRack = getOneRandomRack(rackSkipSet);
        favoredNodes[1] = getOneRandomServer(tertiaryRandomRack);
      }
    }
    return favoredNodes;
  }

  private ServerName[] singleRackCase(RegionInfo regionInfo,
      ServerName primaryRS,
      String primaryRack) throws IOException {
    // Single rack case: have to pick the secondary and tertiary
    // from the same rack
    List<ServerName> serverList = getServersFromRack(primaryRack);
    if ((serverList == null) || (serverList.size() <= 2)) {
      // Single region server case: cannot place the favored nodes
      // on any server;
      return null;
    } else {
      // Randomly select two region servers from the server list and make sure
      // they do not overlap with the primary region server;
      Set<ServerName> serverSkipSet = new HashSet<>();
      serverSkipSet.add(primaryRS);

      // Place the secondary RS
      ServerName secondaryRS = getOneRandomServer(primaryRack, serverSkipSet);
      // Skip the secondary for the tertiary placement
      serverSkipSet.add(secondaryRS);
      ServerName tertiaryRS = getOneRandomServer(primaryRack, serverSkipSet);

      if (secondaryRS == null || tertiaryRS == null) {
        LOG.error("Cannot place the secondary, tertiary favored node for region "
            + regionInfo.getRegionNameAsString());
      }
      // Create the secondary and tertiary pair
      ServerName[] favoredNodes = new ServerName[2];
      favoredNodes[0] = secondaryRS;
      favoredNodes[1] = tertiaryRS;
      return favoredNodes;
    }
  }

  /**
   * Place secondary and tertiary nodes in a multi rack case.
   * If there are only two racks, then we try to place the secondary
   * and tertiary on a different rack than the primary. But if the other rack has
   * only one region server, then we place primary and tertiary on one rack
   * and secondary on another. The aim is to distribute the three favored nodes
   * across >= 2 racks.
   * TODO: see how we can use generateMissingFavoredNodeMultiRack API here
   * @param regionInfo Region for which we are trying to generate FN
   * @param primaryRS The primary favored node.
   * @param primaryRack The rack of the primary favored node.
   * @return Array containing secondary and tertiary favored nodes.
   * @throws IOException Signals that an I/O exception has occurred.
   */
  private ServerName[] multiRackCase(RegionInfo regionInfo, ServerName primaryRS,
      String primaryRack) throws IOException {
    List<ServerName> favoredNodes = Lists.newArrayList(primaryRS);
    // Create the secondary and tertiary pair
    ServerName secondaryRS = generateMissingFavoredNodeMultiRack(favoredNodes);
    favoredNodes.add(secondaryRS);
    String secondaryRack = getRackOfServer(secondaryRS);

    ServerName tertiaryRS;
    if (primaryRack.equals(secondaryRack)) {
      tertiaryRS = generateMissingFavoredNode(favoredNodes);
    } else {
      // Try to place the tertiary in the secondary RS rack, else place it on the primary rack.
      tertiaryRS = getOneRandomServer(secondaryRack, Sets.newHashSet(secondaryRS));
      if (tertiaryRS == null) {
        tertiaryRS = getOneRandomServer(primaryRack, Sets.newHashSet(primaryRS));
      }
      // We couldn't find anything in the secondary rack, get any FN
      if (tertiaryRS == null) {
        tertiaryRS = generateMissingFavoredNode(Lists.newArrayList(primaryRS, secondaryRS));
      }
    }
    return new ServerName[]{ secondaryRS, tertiaryRS };
  }

  public boolean canPlaceFavoredNodes() {
    return (this.servers.size() >= FAVORED_NODES_NUM);
  }

  private int getTotalNumberOfRacks() {
    return this.uniqueRackList.size();
  }

  private List<ServerName> getServersFromRack(String rack) {
    return this.rackToRegionServerMap.get(rack);
  }
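
  // Example of the multi rack placement above with two hypothetical racks /r1 = {s1, s2} and
  // /r2 = {s3}, primary s1: the secondary may land on either rack. If it lands on s3 in /r2,
  // /r2 has no other server, so the tertiary falls back to the primary rack and becomes s2,
  // giving [s1, s3, s2]. If it lands on s2 in /r1 instead, the tertiary is generated from /r2,
  // giving [s1, s2, s3]. Either way the three favored nodes span both racks.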

  /**
   * Gets a random server from the specified rack and skips anything specified.
   * @param rack rack from which a server is needed
   * @param skipServerSet the returned server shouldn't belong to this set
   */
  protected ServerName getOneRandomServer(String rack, Set<ServerName> skipServerSet) {
    // Is the rack valid? Do we recognize it?
    if (rack == null || getServersFromRack(rack) == null ||
        getServersFromRack(rack).isEmpty()) {
      return null;
    }

    // Lets use a set so we can eliminate duplicates
    Set<StartcodeAgnosticServerName> serversToChooseFrom = Sets.newHashSet();
    for (ServerName sn : getServersFromRack(rack)) {
      serversToChooseFrom.add(StartcodeAgnosticServerName.valueOf(sn));
    }

    if (skipServerSet != null && skipServerSet.size() > 0) {
      for (ServerName sn : skipServerSet) {
        serversToChooseFrom.remove(StartcodeAgnosticServerName.valueOf(sn));
      }
      // Do we have any servers left to choose from?
      if (serversToChooseFrom.isEmpty()) {
        return null;
      }
    }

    ServerName randomServer = null;
    int randomIndex = random.nextInt(serversToChooseFrom.size());
    int j = 0;
    for (StartcodeAgnosticServerName sn : serversToChooseFrom) {
      if (j == randomIndex) {
        randomServer = sn;
        break;
      }
      j++;
    }

    if (randomServer != null) {
      return ServerName.valueOf(randomServer.getHostAndPort(), randomServer.getStartcode());
    } else {
      return null;
    }
  }

  private ServerName getOneRandomServer(String rack) throws IOException {
    return this.getOneRandomServer(rack, null);
  }

  protected String getOneRandomRack(Set<String> skipRackSet) throws IOException {
    if (skipRackSet == null || uniqueRackList.size() <= skipRackSet.size()) {
      throw new IOException("Cannot randomly pick another random rack");
    }

    String randomRack;
    do {
      int randomIndex = random.nextInt(this.uniqueRackList.size());
      randomRack = this.uniqueRackList.get(randomIndex);
    } while (skipRackSet.contains(randomRack));

    return randomRack;
  }

  public static String getFavoredNodesAsString(List<ServerName> nodes) {
    StringBuilder strBuf = new StringBuilder();
    int i = 0;
    for (ServerName node : nodes) {
      strBuf.append(node.getHostAndPort());
      if (++i != nodes.size()) {
        strBuf.append(";");
      }
    }
    return strBuf.toString();
  }
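
  // For example, getFavoredNodesAsString for three hypothetical favored nodes would return
  // "host1.example.com:16020;host2.example.com:16020;host3.example.com:16020".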

  /*
   * Generates a missing favored node based on the input favored nodes. This helps to generate
   * new FN when there are already 2 FN and we need a third one. For example, while generating
   * new FN for split daughters after inheriting 2 FN from the parent. If the cluster has only
   * one rack it generates from the same rack. If the cluster has multiple racks, then it
   * ensures the new FN respects the rack constraints similar to HDFS. For example: if there
   * are 3 FN, they will be spread across 2 racks.
   */
  public ServerName generateMissingFavoredNode(List<ServerName> favoredNodes) throws IOException {
    if (this.uniqueRackList.size() == 1) {
      return generateMissingFavoredNodeSingleRack(favoredNodes, null);
    } else {
      return generateMissingFavoredNodeMultiRack(favoredNodes, null);
    }
  }

  public ServerName generateMissingFavoredNode(List<ServerName> favoredNodes,
      List<ServerName> excludeNodes) throws IOException {
    if (this.uniqueRackList.size() == 1) {
      return generateMissingFavoredNodeSingleRack(favoredNodes, excludeNodes);
    } else {
      return generateMissingFavoredNodeMultiRack(favoredNodes, excludeNodes);
    }
  }

  /*
   * Generate FN for a single rack scenario, don't generate from one of the excluded nodes.
   * Helps when we would like to find a replacement node.
   */
  private ServerName generateMissingFavoredNodeSingleRack(List<ServerName> favoredNodes,
      List<ServerName> excludeNodes) throws IOException {
    ServerName newServer = null;
    Set<ServerName> excludeFNSet = Sets.newHashSet(favoredNodes);
    if (excludeNodes != null && excludeNodes.size() > 0) {
      excludeFNSet.addAll(excludeNodes);
    }
    if (favoredNodes.size() < FAVORED_NODES_NUM) {
      newServer = this.getOneRandomServer(this.uniqueRackList.get(0), excludeFNSet);
    }
    return newServer;
  }

  private ServerName generateMissingFavoredNodeMultiRack(List<ServerName> favoredNodes)
      throws IOException {
    return generateMissingFavoredNodeMultiRack(favoredNodes, null);
  }
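
  // Illustrative use for the split case mentioned above (variable names are hypothetical): a
  // daughter region keeps two favored nodes from the parent and gets a fresh third one:
  //   List<ServerName> inherited = Lists.newArrayList(parentFN.get(0), parentFN.get(1));
  //   ServerName third = helper.generateMissingFavoredNode(inherited);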

  /*
   * Generates a missing FN based on the input favoredNodes and also the nodes to be skipped.
   *
   * Get the current layout of favored nodes arrangement and nodes to be excluded and get a
   * random node that goes with HDFS block placement. For example: if the existing nodes are
   * on one rack, generate one from another rack. We exclude as much as possible so the random
   * selection has more chance to generate a node within a few iterations, ideally 1.
   */
  private ServerName generateMissingFavoredNodeMultiRack(List<ServerName> favoredNodes,
      List<ServerName> excludeNodes) throws IOException {
    Set<String> racks = Sets.newHashSet();
    Map<String, Set<ServerName>> rackToFNMapping = new HashMap<>();

    // Lets understand the current rack distribution of the FN
    for (ServerName sn : favoredNodes) {
      String rack = getRackOfServer(sn);
      racks.add(rack);

      Set<ServerName> serversInRack = rackToFNMapping.get(rack);
      if (serversInRack == null) {
        serversInRack = Sets.newHashSet();
        rackToFNMapping.put(rack, serversInRack);
      }
      serversInRack.add(sn);
    }

    // What racks should be skipped while getting a FN?
    Set<String> skipRackSet = Sets.newHashSet();

    /*
     * If both the FN are from the same rack, then we don't want to generate another FN on the
     * same rack. If that rack fails, the region would be unavailable.
     */
    if (racks.size() == 1 && favoredNodes.size() > 1) {
      skipRackSet.add(racks.iterator().next());
    }

    /*
     * If there are no free nodes on the existing racks, we should skip those racks too. We can
     * reduce the number of iterations for FN selection.
     */
    for (String rack : racks) {
      if (getServersFromRack(rack) != null &&
          rackToFNMapping.get(rack).size() == getServersFromRack(rack).size()) {
        skipRackSet.add(rack);
      }
    }

    Set<ServerName> favoredNodeSet = Sets.newHashSet(favoredNodes);
    if (excludeNodes != null && excludeNodes.size() > 0) {
      favoredNodeSet.addAll(excludeNodes);
    }

    /*
     * Lets get a random rack by excluding skipRackSet and generate a random FN from that rack.
     */
    int i = 0;
    Set<String> randomRacks = Sets.newHashSet();
    ServerName newServer = null;
    do {
      String randomRack = this.getOneRandomRack(skipRackSet);
      newServer = this.getOneRandomServer(randomRack, favoredNodeSet);
      randomRacks.add(randomRack);
      i++;
    } while ((i < MAX_ATTEMPTS_FN_GENERATION) && (newServer == null));

    if (newServer == null) {
      if (LOG.isTraceEnabled()) {
        LOG.trace(String.format("Unable to generate additional favored nodes for %s after "
            + "considering racks %s and skip rack %s with a unique rack list of %s and rack "
            + "to RS map of %s and RS to rack map of %s",
            StringUtils.join(favoredNodes, ","), randomRacks, skipRackSet, uniqueRackList,
            rackToRegionServerMap, regionServerToRackMap));
      }
      throw new IOException("Unable to generate additional favored nodes for "
          + StringUtils.join(favoredNodes, ","));
    }
    return newServer;
  }

  /*
   * Generate favored nodes for a region.
   *
   * Choose a random server as primary and then choose secondary and tertiary FN so they are
   * spread across two racks.
   */
  public List<ServerName> generateFavoredNodes(RegionInfo hri) throws IOException {
    List<ServerName> favoredNodesForRegion = new ArrayList<>(FAVORED_NODES_NUM);
    ServerName primary = servers.get(random.nextInt(servers.size()));
    favoredNodesForRegion.add(ServerName.valueOf(primary.getHostAndPort(),
        ServerName.NON_STARTCODE));

    Map<RegionInfo, ServerName> primaryRSMap = new HashMap<>(1);
    primaryRSMap.put(hri, primary);
    Map<RegionInfo, ServerName[]> secondaryAndTertiaryRSMap =
        placeSecondaryAndTertiaryRS(primaryRSMap);
    ServerName[] secondaryAndTertiaryNodes = secondaryAndTertiaryRSMap.get(hri);
    if (secondaryAndTertiaryNodes != null && secondaryAndTertiaryNodes.length == 2) {
      for (ServerName sn : secondaryAndTertiaryNodes) {
        favoredNodesForRegion.add(ServerName.valueOf(sn.getHostAndPort(),
            ServerName.NON_STARTCODE));
      }
      return favoredNodesForRegion;
    } else {
      throw new HBaseIOException("Unable to generate secondary and tertiary favored nodes.");
    }
  }

  public Map<RegionInfo, List<ServerName>> generateFavoredNodesRoundRobin(
      Map<ServerName, List<RegionInfo>> assignmentMap, List<RegionInfo> regions)
      throws IOException {
    if (regions.size() > 0) {
      if (canPlaceFavoredNodes()) {
        Map<RegionInfo, ServerName> primaryRSMap = new HashMap<>();
        // Lets try to have an equal distribution for primary favored node
        placePrimaryRSAsRoundRobin(assignmentMap, primaryRSMap, regions);
        return generateFavoredNodes(primaryRSMap);
      } else {
        throw new HBaseIOException("Not enough nodes to generate favored nodes");
      }
    }
    return null;
  }
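
  // Illustrative end-to-end sketch (the servers, conf and region list are assumed to exist):
  //   FavoredNodeAssignmentHelper helper = new FavoredNodeAssignmentHelper(servers, conf);
  //   helper.initialize(); // must be called before any placement method
  //   Map<RegionInfo, List<ServerName>> fn =
  //       helper.generateFavoredNodesRoundRobin(new HashMap<>(), regions);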

  /*
   * Generate favored nodes for a set of regions when we know where they are currently hosted.
   */
  private Map<RegionInfo, List<ServerName>> generateFavoredNodes(
      Map<RegionInfo, ServerName> primaryRSMap) {
    Map<RegionInfo, List<ServerName>> generatedFavNodes = new HashMap<>();
    Map<RegionInfo, ServerName[]> secondaryAndTertiaryRSMap =
        placeSecondaryAndTertiaryRS(primaryRSMap);

    for (Entry<RegionInfo, ServerName> entry : primaryRSMap.entrySet()) {
      List<ServerName> favoredNodesForRegion = new ArrayList<>(FAVORED_NODES_NUM);
      RegionInfo region = entry.getKey();
      ServerName primarySN = entry.getValue();
      favoredNodesForRegion.add(ServerName.valueOf(primarySN.getHostname(), primarySN.getPort(),
          NON_STARTCODE));
      ServerName[] secondaryAndTertiaryNodes = secondaryAndTertiaryRSMap.get(region);
      if (secondaryAndTertiaryNodes != null) {
        favoredNodesForRegion.add(ServerName.valueOf(
            secondaryAndTertiaryNodes[0].getHostname(), secondaryAndTertiaryNodes[0].getPort(),
            NON_STARTCODE));
        favoredNodesForRegion.add(ServerName.valueOf(
            secondaryAndTertiaryNodes[1].getHostname(), secondaryAndTertiaryNodes[1].getPort(),
            NON_STARTCODE));
      }
      generatedFavNodes.put(region, favoredNodesForRegion);
    }
    return generatedFavNodes;
  }

  /*
   * Get the rack of a server from the local mapping when present; this saves a lookup by the
   * RackManager.
   */
  private String getRackOfServer(ServerName sn) {
    if (this.regionServerToRackMap.containsKey(sn.getHostname())) {
      return this.regionServerToRackMap.get(sn.getHostname());
    } else {
      String rack = this.rackManager.getRack(sn);
      this.regionServerToRackMap.put(sn.getHostname(), rack);
      return rack;
    }
  }
}