/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.master;

import java.io.IOException;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.MetaTableAccessor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Durability;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.RegionLocator;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.testclassification.MasterTests;
import org.apache.hadoop.hbase.util.Bytes;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.ClassRule;
import org.junit.Ignore;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Test transitions of state across the master.  Sets up the cluster once and
 * then runs a couple of tests.
 *
 * <p>All three test methods in this class are currently marked {@link Ignore} and their bodies
 * are commented out; only the class-level setup/teardown and the start-key seeding helper are
 * live code.
 */
@Category({MasterTests.class, LargeTests.class})
public class TestMasterTransitions {

  @ClassRule
  public static final HBaseClassTestRule CLASS_RULE =
      HBaseClassTestRule.forClass(TestMasterTransitions.class);

  private static final Logger LOG = LoggerFactory.getLogger(TestMasterTransitions.class);
  private static final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
  private static final TableName TABLENAME = TableName.valueOf("master_transitions");
  private static final byte [][] FAMILIES = new byte [][] {Bytes.toBytes("a"),
    Bytes.toBytes("b"), Bytes.toBytes("c")};

  /**
   * Start up a mini cluster and put a small table of many empty regions into it.
   * Once every region is assigned, one row is written at each region's start key
   * (see {@link #addToEachStartKey(int)}).
   * @throws Exception if the cluster cannot be started or the table cannot be created/seeded
   */
  @BeforeClass
  public static void beforeAllTests() throws Exception {
    TEST_UTIL.startMiniCluster(2);
    // Create a table of three families.  This will assign a region.
    TEST_UTIL.createMultiRegionTable(TABLENAME, FAMILIES);
    int countOfRegions;
    try (RegionLocator r = TEST_UTIL.getConnection().getRegionLocator(TABLENAME)) {
      countOfRegions = r.getStartKeys().length;
    }
    TEST_UTIL.waitUntilAllRegionsAssigned(TABLENAME);
    // Asserts internally that exactly countOfRegions rows were written.
    addToEachStartKey(countOfRegions);
  }

  /**
   * Shut down the mini cluster started in {@link #beforeAllTests()}.
   * @throws Exception if shutdown fails
   */
  @AfterClass
  public static void afterAllTests() throws Exception {
    TEST_UTIL.shutdownMiniCluster();
  }

  /**
   * Make sure at least two region servers are available before each test.
   * @throws IOException if a region server cannot be brought up
   */
  @Before
  public void setup() throws IOException {
    TEST_UTIL.ensureSomeRegionServersAvailable(2);
  }

  /**
   * Listener for regionserver events testing hbase-2428 (Infinite loop of
   * region closes if hbase:meta region is offline).  In particular, listen
   * for the close of the 'metaServer' and when it comes in, requeue it with a
   * delay as though there were an issue processing the shutdown.  As part of
   * the requeuing,  send over a close of a region on 'otherServer' so it comes
   * into a master that has its meta region marked as offline.
   */
  /*
  static class HBase2428Listener implements RegionServerOperationListener {
    // Map of what we've delayed so we don't do do repeated delays.
    private final Set<RegionServerOperation> postponed =
      new CopyOnWriteArraySet<RegionServerOperation>();
    private boolean done = false;;
    private boolean metaShutdownReceived = false;
    private final HServerAddress metaAddress;
    private final MiniHBaseCluster cluster;
    private final int otherServerIndex;
    private final RegionInfo hri;
    private int closeCount = 0;
    static final int SERVER_DURATION = 3 * 1000;
    static final int CLOSE_DURATION = 1 * 1000;

    HBase2428Listener(final MiniHBaseCluster c, final HServerAddress metaAddress,
        final RegionInfo closingHRI, final int otherServerIndex) {
      this.cluster = c;
      this.metaAddress = metaAddress;
      this.hri = closingHRI;
      this.otherServerIndex = otherServerIndex;
    }

    @Override
    public boolean process(final RegionServerOperation op) throws IOException {
      // If a regionserver shutdown and its of the meta server, then we want to
      // delay the processing of the shutdown and send off a close of a region on
      // the 'otherServer.
      boolean result = true;
      if (op instanceof ProcessServerShutdown) {
        ProcessServerShutdown pss = (ProcessServerShutdown)op;
        if (pss.getDeadServerAddress().equals(this.metaAddress)) {
          // Don't postpone more than once.
          if (!this.postponed.contains(pss)) {
            // Close some region.
            this.cluster.addMessageToSendRegionServer(this.otherServerIndex,
              new HMsg(HMsg.Type.MSG_REGION_CLOSE, hri,
              Bytes.toBytes("Forcing close in test")));
            this.postponed.add(pss);
            // Put off the processing of the regionserver shutdown processing.
            pss.setDelay(SERVER_DURATION);
            this.metaShutdownReceived = true;
            // Return false.  This will add this op to the delayed queue.
            result = false;
          }
        }
      } else {
        // Have the close run frequently.
        if (isWantedCloseOperation(op) != null) {
          op.setDelay(CLOSE_DURATION);
          // Count how many times it comes through here.
          this.closeCount++;
        }
      }
      return result;
    }

    public void processed(final RegionServerOperation op) {
      if (isWantedCloseOperation(op) != null) return;
      this.done = true;
    }
*/
  /*
   * @param op
   * @return Null if not the wanted ProcessRegionClose, else <code>op</code>
   * cast as a ProcessRegionClose.
   */
  /*
    private ProcessRegionClose isWantedCloseOperation(final RegionServerOperation op) {
      // Count every time we get a close operation.
      if (op instanceof ProcessRegionClose) {
        ProcessRegionClose c = (ProcessRegionClose)op;
        if (c.regionInfo.equals(hri)) {
          return c;
        }
      }
      return null;
    }

    boolean isDone() {
      return this.done;
    }

    boolean isMetaShutdownReceived() {
      return metaShutdownReceived;
    }

    int getCloseCount() {
      return this.closeCount;
    }

    @Override
    public boolean process(HServerInfo serverInfo, HMsg incomingMsg) {
      return true;
    }
  }
*/
  /**
   * In 2428, the meta region has just been set offline and then a close comes
   * in.
   * @see <a href="https://issues.apache.org/jira/browse/HBASE-2428">HBASE-2428</a>
   */
  @Ignore @Test
  public void testRegionCloseWhenNoMetaHBase2428()
  throws Exception {
    /*
    LOG.info("Running testRegionCloseWhenNoMetaHBase2428");
    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
    final HMaster master = cluster.getMaster();
    int metaIndex = cluster.getServerWithMeta();
    // Figure the index of the server that is not server the hbase:meta
    int otherServerIndex = -1;
    for (int i = 0; i < cluster.getRegionServerThreads().size(); i++) {
      if (i == metaIndex) continue;
      otherServerIndex = i;
      break;
    }
    final HRegionServer otherServer = cluster.getRegionServer(otherServerIndex);
    final HRegionServer metaHRS = cluster.getRegionServer(metaIndex);

    // Get a region out on the otherServer.
    final RegionInfo hri =
      otherServer.getOnlineRegions().iterator().next().getRegionInfo();

    // Add our RegionServerOperationsListener
    HBase2428Listener listener = new HBase2428Listener(cluster,
      metaHRS.getHServerInfo().getServerAddress(), hri, otherServerIndex);
    master.getRegionServerOperationQueue().
      registerRegionServerOperationListener(listener);
    try {
      // Now close the server carrying meta.
      cluster.abortRegionServer(metaIndex);

      // First wait on receipt of meta server shutdown message.
      while(!listener.metaShutdownReceived) Threads.sleep(100);
      while(!listener.isDone()) Threads.sleep(10);
      // We should not have retried the close more times than it took for the
      // server shutdown message to exit the delay queue and get processed
      // (Multiple by two to add in some slop in case of GC or something).
      assertTrue(listener.getCloseCount() > 1);
      assertTrue(listener.getCloseCount() <
        ((HBase2428Listener.SERVER_DURATION/HBase2428Listener.CLOSE_DURATION) * 2));

      // Assert the closed region came back online
      assertRegionIsBackOnline(hri);
    } finally {
      master.getRegionServerOperationQueue().
        unregisterRegionServerOperationListener(listener);
    }
    */
  }

  /**
   * Test adding in a new server before old one on same host+port is dead.
   * Make the test more onerous by having the server under test carry the meta.
   * If confusion between old and new, purportedly meta never comes back.  Test
   * that meta gets redeployed.
   */
  @Ignore @Test
  public void testAddingServerBeforeOldIsDead2413()
  throws IOException {
    /*
    LOG.info("Running testAddingServerBeforeOldIsDead2413");
    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
    int count = count();
    int metaIndex = cluster.getServerWithMeta();
    MiniHBaseClusterRegionServer metaHRS =
      (MiniHBaseClusterRegionServer)cluster.getRegionServer(metaIndex);
    int port = metaHRS.getServerInfo().getServerAddress().getPort();
    Configuration c = TEST_UTIL.getConfiguration();
    String oldPort = c.get(HConstants.REGIONSERVER_PORT, "0");
    try {
      LOG.info("KILLED=" + metaHRS);
      metaHRS.kill();
      c.set(HConstants.REGIONSERVER_PORT, Integer.toString(port));
      // Try and start new regionserver.  It might clash with the old
      // regionserver port so keep trying to get past the BindException.
      HRegionServer hrs = null;
      while (true) {
        try {
          hrs = cluster.startRegionServer().getRegionServer();
          break;
        } catch (IOException e) {
          if (e.getCause() != null && e.getCause() instanceof InvocationTargetException) {
            InvocationTargetException ee = (InvocationTargetException)e.getCause();
            if (ee.getCause() != null && ee.getCause() instanceof BindException) {
              LOG.info("BindException; retrying: " + e.toString());
            }
          }
        }
      }
      LOG.info("STARTED=" + hrs);
      // Wait until he's been given at least 3 regions before we go on to try
      // and count rows in table.
      while (hrs.getOnlineRegions().size() < 3) Threads.sleep(100);
      LOG.info(hrs.toString() + " has " + hrs.getOnlineRegions().size() +
        " regions");
      assertEquals(count, count());
    } finally {
      c.set(HConstants.REGIONSERVER_PORT, oldPort);
    }
    */
  }

  /**
   * HBase2482 is about outstanding region openings.  If any are outstanding
   * when a regionserver goes down, then they'll never deploy.  They'll be
   * stuck in the regions-in-transition list for ever.  This listener looks
   * for a region opening HMsg and if its from the server passed on construction,
   * then we kill it.  It also looks out for a close message on the victim
   * server because that signifies start of the fireworks.
   */
  /*
  static class HBase2482Listener implements RegionServerOperationListener {
    private final HRegionServer victim;
    private boolean abortSent = false;
    // We closed regions on new server.
    private volatile boolean closed = false;
    // Copy of regions on new server
    private final Collection<HRegion> copyOfOnlineRegions;
    // This is the region that was in transition on the server we aborted. Test
    // passes if this region comes back online successfully.
    private RegionInfo regionToFind;

    HBase2482Listener(final HRegionServer victim) {
      this.victim = victim;
      // Copy regions currently open on this server so I can notice when
      // there is a close.
      this.copyOfOnlineRegions =
        this.victim.getCopyOfOnlineRegionsSortedBySize().values();
    }

    @Override
    public boolean process(HServerInfo serverInfo, HMsg incomingMsg) {
      if (!victim.getServerInfo().equals(serverInfo) ||
          this.abortSent || !this.closed) {
        return true;
      }
      if (!incomingMsg.isType(HMsg.Type.MSG_REPORT_PROCESS_OPEN)) return true;
      // Save the region that is in transition so can test later it came back.
      this.regionToFind = incomingMsg.getRegionInfo();
      String msg = "ABORTING " + this.victim + " because got a " +
        HMsg.Type.MSG_REPORT_PROCESS_OPEN + " on this server for " +
        incomingMsg.getRegionInfo().getRegionNameAsString();
      this.victim.abort(msg);
      this.abortSent = true;
      return true;
    }

    @Override
    public boolean process(RegionServerOperation op) throws IOException {
      return true;
    }

    @Override
    public void processed(RegionServerOperation op) {
      if (this.closed || !(op instanceof ProcessRegionClose)) return;
      ProcessRegionClose close = (ProcessRegionClose)op;
      for (HRegion r: this.copyOfOnlineRegions) {
        if (r.getRegionInfo().equals(close.regionInfo)) {
          // We've closed one of the regions that was on the victim server.
          // Now can start testing for when all regions are back online again
          LOG.info("Found close of " +
            r.getRegionInfo().getRegionNameAsString() +
            "; setting close happened flag");
          this.closed = true;
          break;
        }
      }
    }
  }
*/
  /**
   * In 2482, a RS with an opening region on it dies.  The said region is then
   * stuck in the master's regions-in-transition and never leaves it.  This
   * test works by bringing up a new regionserver, waiting for the load
   * balancer to give it some regions.  Then, we close all on the new server.
   * After sending all the close messages, we send the new regionserver the
   * special blocking message so it can not process any more messages.
   * Meantime reopening of the just-closed regions is backed up on the new
   * server.  Soon as master gets an opening region from the new regionserver,
   * we kill it.  We then wait on all regions to come back on line.  If bug
   * is fixed, this should happen soon as the processing of the killed server is
   * done.
   * @see <a href="https://issues.apache.org/jira/browse/HBASE-2482">HBASE-2482</a>
   */
  @Ignore @Test
  public void testKillRSWithOpeningRegion2482()
  throws Exception {
    /*
    LOG.info("Running testKillRSWithOpeningRegion2482");
    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
    if (cluster.getLiveRegionServerThreads().size() < 2) {
      // Need at least two servers.
      cluster.startRegionServer();
    }
    // Count how many regions are online.  They need to be all back online for
    // this test to succeed.
    int countOfMetaRegions = countOfMetaRegions();
    // Add a listener on the server.
    HMaster m = cluster.getMaster();
    // Start new regionserver.
    MiniHBaseClusterRegionServer hrs =
      (MiniHBaseClusterRegionServer)cluster.startRegionServer().getRegionServer();
    LOG.info("Started new regionserver: " + hrs.toString());
    // Wait until has some regions before proceeding.  Balancer will give it some.
    int minimumRegions =
      countOfMetaRegions/(cluster.getRegionServerThreads().size() * 2);
    while (hrs.getOnlineRegions().size() < minimumRegions) Threads.sleep(100);
    // Set the listener only after some regions have been opened on new server.
    HBase2482Listener listener = new HBase2482Listener(hrs);
    m.getRegionServerOperationQueue().
      registerRegionServerOperationListener(listener);
    try {
      // Go close all non-catalog regions on this new server
      closeAllNonCatalogRegions(cluster, hrs);
      // After all closes, add blocking message before the region opens start to
      // come in.
      cluster.addMessageToSendRegionServer(hrs,
        new HMsg(HMsg.Type.TESTING_BLOCK_REGIONSERVER));
      // Wait till one of the above close messages has an effect before we start
      // wait on all regions back online.
      while (!listener.closed) Threads.sleep(100);
      LOG.info("Past close");
      // Make sure the abort server message was sent.
      while(!listener.abortSent) Threads.sleep(100);
      LOG.info("Past abort send; waiting on all regions to redeploy");
      // Now wait for regions to come back online.
      assertRegionIsBackOnline(listener.regionToFind);
    } finally {
      m.getRegionServerOperationQueue().
        unregisterRegionServerOperationListener(listener);
    }
    */
  }

  /*
   * @return Count of all non-catalog regions on the designated server
   */
/*
  private int closeAllNonCatalogRegions(final MiniHBaseCluster cluster,
    final MiniHBaseCluster.MiniHBaseClusterRegionServer hrs)
  throws IOException {
    int countOfRegions = 0;
    for (HRegion r: hrs.getOnlineRegions()) {
      if (r.getRegionInfo().isMetaRegion()) continue;
      cluster.addMessageToSendRegionServer(hrs,
        new HMsg(HMsg.Type.MSG_REGION_CLOSE, r.getRegionInfo()));
      LOG.info("Sent close of " + r.getRegionInfo().getRegionNameAsString() +
        " on " + hrs.toString());
      countOfRegions++;
    }
    return countOfRegions;
  }

  private void assertRegionIsBackOnline(final RegionInfo hri)
  throws IOException {
    // Region should have an entry in its startkey because of addRowToEachRegion.
    byte [] row = getStartKey(hri);
    HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
    Get g =  new Get(row);
    assertTrue((t.get(g)).size() > 0);
  }

  /*
   * @return Count of regions in meta table.
   * @throws IOException
   */
  /*
  private static int countOfMetaRegions()
  throws IOException {
    HTable meta = new HTable(TEST_UTIL.getConfiguration(),
      HConstants.META_TABLE_NAME);
    int rows = 0;
    Scan scan = new Scan();
    scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
    ResultScanner s = meta.getScanner(scan);
    for (Result r = null; (r = s.next()) != null;) {
      byte [] b =
        r.getValue(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
      if (b == null || b.length <= 0) break;
      rows++;
    }
    s.close();
    return rows;
  }
*/
  /**
   * Write one row into the test table for every region of {@link #TABLENAME} listed in
   * hbase:meta.  The row key is the region's start key (except it is 'aaa' for the first
   * region, whose start key is empty); the cell value is the row key itself.  The write skips
   * the WAL.
   *
   * <p>Scanning stops at the first meta row that carries no region info; rows belonging to
   * other tables are skipped.  After the scan, asserts that the number of rows written
   * equals {@code expected}.
   *
   * @param expected the number of regions (and therefore rows written) the caller expects
   * @return the number of rows written
   * @throws IOException if scanning hbase:meta or writing to the test table fails
   */
  private static int addToEachStartKey(final int expected) throws IOException {
    int rows = 0;
    Scan scan = new Scan();
    scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
    // try-with-resources so both tables and the scanner are released even when a put
    // or the meta scan throws part way through.
    try (Table t = TEST_UTIL.getConnection().getTable(TABLENAME);
        Table meta = TEST_UTIL.getConnection().getTable(TableName.META_TABLE_NAME);
        ResultScanner s = meta.getScanner(scan)) {
      for (Result r = s.next(); r != null; r = s.next()) {
        RegionInfo hri = MetaTableAccessor.getRegionInfo(r);
        if (hri == null) {
          break;
        }
        if (!hri.getTable().equals(TABLENAME)) {
          continue;
        }

        // If the start key is empty, 'aaa' is used as the row instead.
        byte [] row = getStartKey(hri);
        Put p = new Put(row);
        p.setDurability(Durability.SKIP_WAL);
        p.addColumn(getTestFamily(), getTestQualifier(), row);
        t.put(p);
        rows++;
      }
    }
    Assert.assertEquals(expected, rows);
    return rows;
  }

  /**
   * @param hri region whose start key is wanted
   * @return Start key for hri (if the start key is '', then return 'aaa').
   */
  private static byte [] getStartKey(final RegionInfo hri) {
    return Bytes.equals(HConstants.EMPTY_START_ROW, hri.getStartKey())?
        Bytes.toBytes("aaa"): hri.getStartKey();
  }

  /**
   * @return the column family used for test writes: the first entry of {@link #FAMILIES}
   */
  private static byte [] getTestFamily() {
    return FAMILIES[0];
  }

  /**
   * @return the qualifier used for test writes; the same bytes as {@link #getTestFamily()}
   */
  private static byte [] getTestQualifier() {
    return getTestFamily();
  }
}