/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.master;

import java.io.IOException;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.MetaTableAccessor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Durability;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.RegionLocator;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.testclassification.MasterTests;
import org.apache.hadoop.hbase.util.Bytes;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.ClassRule;
import org.junit.Ignore;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Test transitions of state across the master. Sets up the cluster once and then runs a couple of
 * tests.
 */
@Category({ MasterTests.class, LargeTests.class })
public class TestMasterTransitions {

  @ClassRule
  public static final HBaseClassTestRule CLASS_RULE =
    HBaseClassTestRule.forClass(TestMasterTransitions.class);

  private static final Logger LOG = LoggerFactory.getLogger(TestMasterTransitions.class);
  private static final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
  private static final TableName TABLENAME = TableName.valueOf("master_transitions");
  private static final byte[][] FAMILIES =
    new byte[][] { Bytes.toBytes("a"), Bytes.toBytes("b"), Bytes.toBytes("c") };

  /**
   * Start up a mini cluster and put a small table of many empty regions into it.
   */
  @BeforeClass
  public static void beforeAllTests() throws Exception {
    TEST_UTIL.startMiniCluster(2);
    // Create a table of three families. This will assign regions.
    TEST_UTIL.createMultiRegionTable(TABLENAME, FAMILIES);
    int countOfRegions = -1;
    try (RegionLocator r = TEST_UTIL.getConnection().getRegionLocator(TABLENAME)) {
      countOfRegions = r.getStartKeys().length;
    }
    TEST_UTIL.waitUntilAllRegionsAssigned(TABLENAME);
    addToEachStartKey(countOfRegions);
  }

  @AfterClass
  public static void afterAllTests() throws Exception {
    TEST_UTIL.shutdownMiniCluster();
  }

  @Before
  public void setup() throws IOException {
    TEST_UTIL.ensureSomeRegionServersAvailable(2);
  }

  /**
   * Listener for regionserver events testing hbase-2428 (Infinite loop of region closes if
   * hbase:meta region is offline). In particular, listen for the close of the 'metaServer' and
   * when it comes in, requeue it with a delay as though there were an issue processing the
   * shutdown. As part of the requeuing, send over a close of a region on 'otherServer' so it
   * comes into a master that has its meta region marked as offline.
   */
  /*
   * static class HBase2428Listener implements RegionServerOperationListener {
   *   // Map of what we've delayed so we don't do repeated delays.
   *   private final Set<RegionServerOperation> postponed =
   *     new CopyOnWriteArraySet<RegionServerOperation>();
   *   private boolean done = false;
   *   private boolean metaShutdownReceived = false;
   *   private final HServerAddress metaAddress;
   *   private final MiniHBaseCluster cluster;
   *   private final int otherServerIndex;
   *   private final RegionInfo hri;
   *   private int closeCount = 0;
   *   static final int SERVER_DURATION = 3 * 1000;
   *   static final int CLOSE_DURATION = 1 * 1000;
   *
   *   HBase2428Listener(final MiniHBaseCluster c, final HServerAddress metaAddress,
   *       final RegionInfo closingHRI, final int otherServerIndex) {
   *     this.cluster = c;
   *     this.metaAddress = metaAddress;
   *     this.hri = closingHRI;
   *     this.otherServerIndex = otherServerIndex;
   *   }
   *
   *   @Override
   *   public boolean process(final RegionServerOperation op) throws IOException {
   *     // If a regionserver shutdown and it's of the meta server, then we want to delay the
   *     // processing of the shutdown and send off a close of a region on the 'otherServer'.
   *     boolean result = true;
   *     if (op instanceof ProcessServerShutdown) {
   *       ProcessServerShutdown pss = (ProcessServerShutdown) op;
   *       if (pss.getDeadServerAddress().equals(this.metaAddress)) {
   *         // Don't postpone more than once.
   *         if (!this.postponed.contains(pss)) {
   *           // Close some region.
   *           this.cluster.addMessageToSendRegionServer(this.otherServerIndex,
   *             new HMsg(HMsg.Type.MSG_REGION_CLOSE, hri, Bytes.toBytes("Forcing close in test")));
   *           this.postponed.add(pss);
   *           // Put off the processing of the regionserver shutdown.
   *           pss.setDelay(SERVER_DURATION);
   *           this.metaShutdownReceived = true;
   *           // Return false. This will add this op to the delayed queue.
   *           result = false;
   *         }
   *       }
   *     } else {
   *       // Have the close run frequently.
   *       if (isWantedCloseOperation(op) != null) {
   *         op.setDelay(CLOSE_DURATION);
   *         // Count how many times it comes through here.
   *         this.closeCount++;
   *       }
   *     }
   *     return result;
   *   }
   *
   *   public void processed(final RegionServerOperation op) {
   *     if (isWantedCloseOperation(op) != null) return;
   *     this.done = true;
   *   }
   *
   *   // @return Null if not the wanted ProcessRegionClose, else <code>op</code> cast as a
   *   // ProcessRegionClose.
   *   private ProcessRegionClose isWantedCloseOperation(final RegionServerOperation op) {
   *     // Count every time we get a close operation.
   *     if (op instanceof ProcessRegionClose) {
   *       ProcessRegionClose c = (ProcessRegionClose) op;
   *       if (c.regionInfo.equals(hri)) {
   *         return c;
   *       }
   *     }
   *     return null;
   *   }
   *
   *   boolean isDone() { return this.done; }
   *
   *   boolean isMetaShutdownReceived() { return metaShutdownReceived; }
   *
   *   int getCloseCount() { return this.closeCount; }
   *
   *   @Override
   *   public boolean process(HServerInfo serverInfo, HMsg incomingMsg) { return true; }
   * }
   */

  /**
   * In 2428, the meta region has just been set offline and then a close comes in.
   * @see <a href="https://issues.apache.org/jira/browse/HBASE-2428">HBASE-2428</a>
   */
  @Ignore
  @Test
  public void testRegionCloseWhenNoMetaHBase2428() throws Exception {
    /*
     * LOG.info("Running testRegionCloseWhenNoMetaHBase2428");
     * MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
     * final HMaster master = cluster.getMaster();
     * int metaIndex = cluster.getServerWithMeta();
     * // Figure the index of the server that is not serving hbase:meta.
     * int otherServerIndex = -1;
     * for (int i = 0; i < cluster.getRegionServerThreads().size(); i++) {
     *   if (i == metaIndex) continue;
     *   otherServerIndex = i;
     *   break;
     * }
     * final HRegionServer otherServer = cluster.getRegionServer(otherServerIndex);
     * final HRegionServer metaHRS = cluster.getRegionServer(metaIndex);
     * // Get a region out on the otherServer.
     * final RegionInfo hri = otherServer.getOnlineRegions().iterator().next().getRegionInfo();
     * // Add our RegionServerOperationsListener.
     * HBase2428Listener listener = new HBase2428Listener(cluster,
     *   metaHRS.getHServerInfo().getServerAddress(), hri, otherServerIndex);
     * master.getRegionServerOperationQueue().registerRegionServerOperationListener(listener);
     * try {
     *   // Now close the server carrying meta.
     *   cluster.abortRegionServer(metaIndex);
     *   // First wait on receipt of meta server shutdown message.
     *   while (!listener.metaShutdownReceived) Threads.sleep(100);
     *   while (!listener.isDone()) Threads.sleep(10);
     *   // We should not have retried the close more times than it took for the server shutdown
     *   // message to exit the delay queue and get processed (multiply by two to add in some slop
     *   // in case of GC or something).
     *   assertTrue(listener.getCloseCount() > 1);
     *   assertTrue(listener.getCloseCount() <
     *     ((HBase2428Listener.SERVER_DURATION / HBase2428Listener.CLOSE_DURATION) * 2));
     *   // Assert the closed region came back online.
     *   assertRegionIsBackOnline(hri);
     * } finally {
     *   master.getRegionServerOperationQueue().unregisterRegionServerOperationListener(listener);
     * }
     */
  }
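
  // The commented-out bodies of the ignored tests in this class call an assertRegionIsBackOnline
  // helper that only survives further down in a commented-out form built on the long-removed
  // HTable constructor. The following is a minimal modern-API sketch of that helper, assuming the
  // fixture seeding done by addToEachStartKey (one row at each region's start key); it is not
  // wired into any live test yet.
  private static void assertRegionIsBackOnline(final RegionInfo hri) throws IOException {
    // The region should hold the row seeded at its start key, so a successful single-row scan
    // from that key means the region is deployed and readable again.
    byte[] row = getStartKey(hri);
    try (Table t = TEST_UTIL.getConnection().getTable(TABLENAME);
      ResultScanner scanner = t.getScanner(new Scan().withStartRow(row).setLimit(1))) {
      Result r = scanner.next();
      Assert.assertNotNull("Region " + hri.getRegionNameAsString() + " not readable", r);
      Assert.assertTrue(Bytes.equals(row, r.getRow()));
    }
  }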

  /**
   * Test adding in a new server before the old one on the same host+port is dead. Make the test
   * more onerous by having the server under test carry the meta. If there is confusion between
   * old and new, purportedly meta never comes back. Test that meta gets redeployed.
   */
  @Ignore
  @Test
  public void testAddingServerBeforeOldIsDead2413() throws IOException {
    /*
     * LOG.info("Running testAddingServerBeforeOldIsDead2413");
     * MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
     * int count = count();
     * int metaIndex = cluster.getServerWithMeta();
     * MiniHBaseClusterRegionServer metaHRS =
     *   (MiniHBaseClusterRegionServer) cluster.getRegionServer(metaIndex);
     * int port = metaHRS.getServerInfo().getServerAddress().getPort();
     * Configuration c = TEST_UTIL.getConfiguration();
     * String oldPort = c.get(HConstants.REGIONSERVER_PORT, "0");
     * try {
     *   LOG.info("KILLED=" + metaHRS);
     *   metaHRS.kill();
     *   c.set(HConstants.REGIONSERVER_PORT, Integer.toString(port));
     *   // Try and start a new regionserver. It might clash with the old regionserver port so
     *   // keep trying to get past the BindException.
     *   HRegionServer hrs = null;
     *   while (true) {
     *     try {
     *       hrs = cluster.startRegionServer().getRegionServer();
     *       break;
     *     } catch (IOException e) {
     *       if (e.getCause() != null && e.getCause() instanceof InvocationTargetException) {
     *         InvocationTargetException ee = (InvocationTargetException) e.getCause();
     *         if (ee.getCause() != null && ee.getCause() instanceof BindException) {
     *           LOG.info("BindException; retrying: " + e.toString());
     *         }
     *       }
     *     }
     *   }
     *   LOG.info("STARTED=" + hrs);
     *   // Wait until it's been given at least 3 regions before we go on to try and count rows
     *   // in the table.
     *   while (hrs.getOnlineRegions().size() < 3) Threads.sleep(100);
     *   LOG.info(hrs.toString() + " has " + hrs.getOnlineRegions().size() + " regions");
     *   assertEquals(count, count());
     * } finally {
     *   c.set(HConstants.REGIONSERVER_PORT, oldPort);
     * }
     */
  }
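
  // The commented-out body above compares a before/after row count via a count() helper that no
  // longer exists anywhere in this file. A minimal sketch of what it presumably did, assuming a
  // plain full-table row count over TABLENAME; not wired into any live test yet.
  private static int count() throws IOException {
    int rows = 0;
    try (Table t = TEST_UTIL.getConnection().getTable(TABLENAME);
      ResultScanner s = t.getScanner(new Scan())) {
      // Count every row in the test table.
      while (s.next() != null) {
        rows++;
      }
    }
    return rows;
  }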

  /**
   * HBase2482 is about outstanding region openings. If any are outstanding when a regionserver
   * goes down, then they'll never deploy. They'll be stuck in the regions-in-transition list
   * forever. This listener looks for a region opening HMsg and if it's from the server passed on
   * construction, then we kill it. It also looks out for a close message on the victim server
   * because that signifies the start of the fireworks.
   */
  /*
   * static class HBase2482Listener implements RegionServerOperationListener {
   *   private final HRegionServer victim;
   *   private boolean abortSent = false;
   *   // We closed regions on the new server.
   *   private volatile boolean closed = false;
   *   // Copy of regions on the new server.
   *   private final Collection<HRegion> copyOfOnlineRegions;
   *   // This is the region that was in transition on the server we aborted. Test passes if this
   *   // region comes back online successfully.
   *   private RegionInfo regionToFind;
   *
   *   HBase2482Listener(final HRegionServer victim) {
   *     this.victim = victim;
   *     // Copy regions currently open on this server so I can notice when there is a close.
   *     this.copyOfOnlineRegions = this.victim.getCopyOfOnlineRegionsSortedBySize().values();
   *   }
   *
   *   @Override
   *   public boolean process(HServerInfo serverInfo, HMsg incomingMsg) {
   *     if (!victim.getServerInfo().equals(serverInfo) || this.abortSent || !this.closed) {
   *       return true;
   *     }
   *     if (!incomingMsg.isType(HMsg.Type.MSG_REPORT_PROCESS_OPEN)) return true;
   *     // Save the region that is in transition so we can test later that it came back.
   *     this.regionToFind = incomingMsg.getRegionInfo();
   *     String msg = "ABORTING " + this.victim + " because got a " +
   *       HMsg.Type.MSG_REPORT_PROCESS_OPEN + " on this server for " +
   *       incomingMsg.getRegionInfo().getRegionNameAsString();
   *     this.victim.abort(msg);
   *     this.abortSent = true;
   *     return true;
   *   }
   *
   *   @Override
   *   public boolean process(RegionServerOperation op) throws IOException { return true; }
   *
   *   @Override
   *   public void processed(RegionServerOperation op) {
   *     if (this.closed || !(op instanceof ProcessRegionClose)) return;
   *     ProcessRegionClose close = (ProcessRegionClose) op;
   *     for (HRegion r : this.copyOfOnlineRegions) {
   *       if (r.getRegionInfo().equals(close.regionInfo)) {
   *         // We've closed one of the regions that was on the victim server. Now we can start
   *         // testing for when all regions are back online again.
   *         LOG.info("Found close of " + r.getRegionInfo().getRegionNameAsString() +
   *           "; setting close happened flag");
   *         this.closed = true;
   *         break;
   *       }
   *     }
   *   }
   * }
   */

  /**
   * In 2482, a RS with an opening region on it dies. The said region is then stuck in the
   * master's regions-in-transition list and never leaves it. This test works by bringing up a new
   * regionserver and waiting for the load balancer to give it some regions. Then, we close all
   * regions on the new server. After sending all the close messages, we send the new regionserver
   * the special blocking message so it cannot process any more messages. Meantime, reopening of
   * the just-closed regions is backed up on the new server. As soon as the master gets an opening
   * region from the new regionserver, we kill it. We then wait on all regions to come back
   * online. If the bug is fixed, this should happen as soon as the processing of the killed
   * server is done.
   * @see <a href="https://issues.apache.org/jira/browse/HBASE-2482">HBASE-2482</a>
   */
  @Ignore
  @Test
  public void testKillRSWithOpeningRegion2482() throws Exception {
    /*
     * LOG.info("Running testKillRSWithOpeningRegion2482");
     * MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
     * if (cluster.getLiveRegionServerThreads().size() < 2) {
     *   // Need at least two servers.
     *   cluster.startRegionServer();
     * }
     * // Count how many regions are online. They need to be all back online for this test to
     * // succeed.
     * int countOfMetaRegions = countOfMetaRegions();
     * // Add a listener on the server.
     * HMaster m = cluster.getMaster();
     * // Start a new regionserver.
     * MiniHBaseClusterRegionServer hrs =
     *   (MiniHBaseClusterRegionServer) cluster.startRegionServer().getRegionServer();
     * LOG.info("Started new regionserver: " + hrs.toString());
     * // Wait until it has some regions before proceeding. Balancer will give it some.
     * int minimumRegions = countOfMetaRegions / (cluster.getRegionServerThreads().size() * 2);
     * while (hrs.getOnlineRegions().size() < minimumRegions) Threads.sleep(100);
     * // Set the listener only after some regions have been opened on the new server.
     * HBase2482Listener listener = new HBase2482Listener(hrs);
     * m.getRegionServerOperationQueue().registerRegionServerOperationListener(listener);
     * try {
     *   // Go close all non-catalog regions on this new server.
     *   closeAllNonCatalogRegions(cluster, hrs);
     *   // After all closes, add the blocking message before the region opens start to come in.
     *   cluster.addMessageToSendRegionServer(hrs,
     *     new HMsg(HMsg.Type.TESTING_BLOCK_REGIONSERVER));
     *   // Wait till one of the above close messages has an effect before we start waiting on
     *   // all regions to come back online.
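
  // The commented-out body above sizes its wait against a countOfMetaRegions() helper that only
  // survives below in a commented-out form built on the long-removed HTable constructor. A
  // minimal modern-API sketch of the same count (rows in hbase:meta that carry a server
  // qualifier), assuming the Connection/Table API; it is not wired into any live test yet.
  private static int countOfMetaRegions() throws IOException {
    int rows = 0;
    Scan scan = new Scan();
    scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
    try (Table meta = TEST_UTIL.getConnection().getTable(TableName.META_TABLE_NAME);
      ResultScanner s = meta.getScanner(scan)) {
      for (Result r; (r = s.next()) != null;) {
        // Stop at the first row with no server assignment, as the old helper did.
        byte[] b = r.getValue(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
        if (b == null || b.length <= 0) {
          break;
        }
        rows++;
      }
    }
    return rows;
  }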
     *   while (!listener.closed) Threads.sleep(100);
     *   LOG.info("Past close");
     *   // Make sure the abort server message was sent.
     *   while (!listener.abortSent) Threads.sleep(100);
     *   LOG.info("Past abort send; waiting on all regions to redeploy");
     *   // Now wait for regions to come back online.
     *   assertRegionIsBackOnline(listener.regionToFind);
     * } finally {
     *   m.getRegionServerOperationQueue().unregisterRegionServerOperationListener(listener);
     * }
     */
  }

  /*
   * @return Count of all non-catalog regions on the designated server
   */
  /*
   * private int closeAllNonCatalogRegions(final MiniHBaseCluster cluster,
   *     final MiniHBaseCluster.MiniHBaseClusterRegionServer hrs) throws IOException {
   *   int countOfRegions = 0;
   *   for (HRegion r : hrs.getOnlineRegions()) {
   *     if (r.getRegionInfo().isMetaRegion()) continue;
   *     cluster.addMessageToSendRegionServer(hrs,
   *       new HMsg(HMsg.Type.MSG_REGION_CLOSE, r.getRegionInfo()));
   *     LOG.info("Sent close of " + r.getRegionInfo().getRegionNameAsString() + " on " +
   *       hrs.toString());
   *     countOfRegions++;
   *   }
   *   return countOfRegions;
   * }
   *
   * private void assertRegionIsBackOnline(final RegionInfo hri) throws IOException {
   *   // Region should have an entry at its startkey because of addToEachStartKey.
   *   byte[] row = getStartKey(hri);
   *   HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
   *   Get g = new Get(row);
   *   assertTrue((t.get(g)).size() > 0);
   * }
   */
  /*
   * @return Count of regions in meta table.
   */
  /*
   * private static int countOfMetaRegions() throws IOException {
   *   HTable meta = new HTable(TEST_UTIL.getConfiguration(), HConstants.META_TABLE_NAME);
   *   int rows = 0;
   *   Scan scan = new Scan();
   *   scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
   *   ResultScanner s = meta.getScanner(scan);
   *   for (Result r = null; (r = s.next()) != null;) {
   *     byte[] b = r.getValue(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
   *     if (b == null || b.length <= 0) break;
   *     rows++;
   *   }
   *   s.close();
   *   return rows;
   * }
   */

  /*
   * Add to each of the regions in hbase:meta a value. Key is the startrow of the region (except
   * it's 'aaa' for the first region). Actual value is the row name.
   */
  private static int addToEachStartKey(final int expected) throws IOException {
    Table t = TEST_UTIL.getConnection().getTable(TABLENAME);
    Table meta = TEST_UTIL.getConnection().getTable(TableName.META_TABLE_NAME);
    int rows = 0;
    Scan scan = new Scan();
    scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
    ResultScanner s = meta.getScanner(scan);
    for (Result r = null; (r = s.next()) != null;) {
      RegionInfo hri = MetaTableAccessor.getRegionInfo(r);
      if (hri == null) break;
      if (!hri.getTable().equals(TABLENAME)) {
        continue;
      }
      // If the start key is empty, getStartKey substitutes 'aaa'.
      byte[] row = getStartKey(hri);
      Put p = new Put(row);
      p.setDurability(Durability.SKIP_WAL);
      p.addColumn(getTestFamily(), getTestQualifier(), row);
      t.put(p);
      rows++;
    }
    s.close();
    Assert.assertEquals(expected, rows);
    t.close();
    meta.close();
    return rows;
  }

  /*
   * @return Start key for hri (if start key is '', then return 'aaa').
   */
  private static byte[] getStartKey(final RegionInfo hri) {
    return Bytes.equals(HConstants.EMPTY_START_ROW, hri.getStartKey())
      ? Bytes.toBytes("aaa")
      : hri.getStartKey();
  }

  private static byte[] getTestFamily() {
    return FAMILIES[0];
  }

  private static byte[] getTestQualifier() {
    return getTestFamily();
  }
}