001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.master;
019
020import java.io.IOException;
021import org.apache.hadoop.hbase.CatalogFamilyFormat;
022import org.apache.hadoop.hbase.HBaseClassTestRule;
023import org.apache.hadoop.hbase.HBaseTestingUtil;
024import org.apache.hadoop.hbase.HConstants;
025import org.apache.hadoop.hbase.TableName;
026import org.apache.hadoop.hbase.client.Durability;
027import org.apache.hadoop.hbase.client.Put;
028import org.apache.hadoop.hbase.client.RegionInfo;
029import org.apache.hadoop.hbase.client.RegionLocator;
030import org.apache.hadoop.hbase.client.Result;
031import org.apache.hadoop.hbase.client.ResultScanner;
032import org.apache.hadoop.hbase.client.Scan;
033import org.apache.hadoop.hbase.client.Table;
034import org.apache.hadoop.hbase.testclassification.LargeTests;
035import org.apache.hadoop.hbase.testclassification.MasterTests;
036import org.apache.hadoop.hbase.util.Bytes;
037import org.junit.AfterClass;
038import org.junit.Assert;
039import org.junit.Before;
040import org.junit.BeforeClass;
041import org.junit.ClassRule;
042import org.junit.Ignore;
043import org.junit.Test;
044import org.junit.experimental.categories.Category;
045import org.slf4j.Logger;
046import org.slf4j.LoggerFactory;
047
048/**
049 * Test transitions of state across the master. Sets up the cluster once and then runs a couple of
050 * tests.
051 */
052@Category({ MasterTests.class, LargeTests.class })
053public class TestMasterTransitions {
054
055  @ClassRule
056  public static final HBaseClassTestRule CLASS_RULE =
057    HBaseClassTestRule.forClass(TestMasterTransitions.class);
058
059  private static final Logger LOG = LoggerFactory.getLogger(TestMasterTransitions.class);
060  private static final HBaseTestingUtil TEST_UTIL = new HBaseTestingUtil();
061  private static final TableName TABLENAME = TableName.valueOf("master_transitions");
062  private static final byte[][] FAMILIES =
063    new byte[][] { Bytes.toBytes("a"), Bytes.toBytes("b"), Bytes.toBytes("c") };
064
065  /**
066   * Start up a mini cluster and put a small table of many empty regions into it.
067   */
068  @BeforeClass
069  public static void beforeAllTests() throws Exception {
070    TEST_UTIL.startMiniCluster(2);
071    // Create a table of three families. This will assign a region.
072    TEST_UTIL.createMultiRegionTable(TABLENAME, FAMILIES);
073    Table t = TEST_UTIL.getConnection().getTable(TABLENAME);
074    int countOfRegions = -1;
075    try (RegionLocator r = TEST_UTIL.getConnection().getRegionLocator(TABLENAME)) {
076      countOfRegions = r.getStartKeys().length;
077    }
078    TEST_UTIL.waitUntilAllRegionsAssigned(TABLENAME);
079    addToEachStartKey(countOfRegions);
080    t.close();
081  }
082
083  @AfterClass
084  public static void afterAllTests() throws Exception {
085    TEST_UTIL.shutdownMiniCluster();
086  }
087
088  @Before
089  public void setup() throws IOException {
090    TEST_UTIL.ensureSomeRegionServersAvailable(2);
091  }
092
093  /**
094   * Listener for regionserver events testing hbase-2428 (Infinite loop of region closes if
095   * hbase:meta region is offline). In particular, listen for the close of the 'metaServer' and when
096   * it comes in, requeue it with a delay as though there were an issue processing the shutdown. As
097   * part of the requeuing, send over a close of a region on 'otherServer' so it comes into a master
098   * that has its meta region marked as offline.
099   */
100  /*
101   * static class HBase2428Listener implements RegionServerOperationListener { // Map of what we've
   * delayed so we don't do repeated delays. private final Set<RegionServerOperation> postponed =
103   * new CopyOnWriteArraySet<RegionServerOperation>(); private boolean done = false;; private
104   * boolean metaShutdownReceived = false; private final HServerAddress metaAddress; private final
105   * MiniHBaseCluster cluster; private final int otherServerIndex; private final RegionInfo hri;
106   * private int closeCount = 0; static final int SERVER_DURATION = 3 * 1000; static final int
107   * CLOSE_DURATION = 1 * 1000; HBase2428Listener(final MiniHBaseCluster c, final HServerAddress
108   * metaAddress, final RegionInfo closingHRI, final int otherServerIndex) { this.cluster = c;
109   * this.metaAddress = metaAddress; this.hri = closingHRI; this.otherServerIndex =
110   * otherServerIndex; }
111   * @Override public boolean process(final RegionServerOperation op) throws IOException { // If a
112   * regionserver shutdown and its of the meta server, then we want to // delay the processing of
113   * the shutdown and send off a close of a region on // the 'otherServer. boolean result = true; if
114   * (op instanceof ProcessServerShutdown) { ProcessServerShutdown pss = (ProcessServerShutdown)op;
115   * if (pss.getDeadServerAddress().equals(this.metaAddress)) { // Don't postpone more than once. if
116   * (!this.postponed.contains(pss)) { // Close some region.
117   * this.cluster.addMessageToSendRegionServer(this.otherServerIndex, new
118   * HMsg(HMsg.Type.MSG_REGION_CLOSE, hri, Bytes.toBytes("Forcing close in test")));
119   * this.postponed.add(pss); // Put off the processing of the regionserver shutdown processing.
120   * pss.setDelay(SERVER_DURATION); this.metaShutdownReceived = true; // Return false. This will add
121   * this op to the delayed queue. result = false; } } } else { // Have the close run frequently. if
122   * (isWantedCloseOperation(op) != null) { op.setDelay(CLOSE_DURATION); // Count how many times it
123   * comes through here. this.closeCount++; } } return result; } public void processed(final
124   * RegionServerOperation op) { if (isWantedCloseOperation(op) != null) return; this.done = true; }
125   */
126  /*
127   * @return Null if not the wanted ProcessRegionClose, else <code>op</code> cast as a
128   * ProcessRegionClose.
129   */
130  /*
131   * private ProcessRegionClose isWantedCloseOperation(final RegionServerOperation op) { // Count
132   * every time we get a close operation. if (op instanceof ProcessRegionClose) { ProcessRegionClose
133   * c = (ProcessRegionClose)op; if (c.regionInfo.equals(hri)) { return c; } } return null; }
134   * boolean isDone() { return this.done; } boolean isMetaShutdownReceived() { return
135   * metaShutdownReceived; } int getCloseCount() { return this.closeCount; }
136   * @Override public boolean process(HServerInfo serverInfo, HMsg incomingMsg) { return true; } }
137   */
138  /**
139   * In 2428, the meta region has just been set offline and then a close comes in.
140   * @see <a href="https://issues.apache.org/jira/browse/HBASE-2428">HBASE-2428</a>
141   */
142  @Ignore
143  @Test
144  public void testRegionCloseWhenNoMetaHBase2428() throws Exception {
145    /*
146     * LOG.info("Running testRegionCloseWhenNoMetaHBase2428"); MiniHBaseCluster cluster =
147     * TEST_UTIL.getHBaseCluster(); final HMaster master = cluster.getMaster(); int metaIndex =
148     * cluster.getServerWithMeta(); // Figure the index of the server that is not server the
149     * hbase:meta int otherServerIndex = -1; for (int i = 0; i <
150     * cluster.getRegionServerThreads().size(); i++) { if (i == metaIndex) continue;
151     * otherServerIndex = i; break; } final HRegionServer otherServer =
152     * cluster.getRegionServer(otherServerIndex); final HRegionServer metaHRS =
153     * cluster.getRegionServer(metaIndex); // Get a region out on the otherServer. final RegionInfo
154     * hri = otherServer.getOnlineRegions().iterator().next().getRegionInfo(); // Add our
155     * RegionServerOperationsListener HBase2428Listener listener = new HBase2428Listener(cluster,
156     * metaHRS.getHServerInfo().getServerAddress(), hri, otherServerIndex);
157     * master.getRegionServerOperationQueue(). registerRegionServerOperationListener(listener); try
158     * { // Now close the server carrying meta. cluster.abortRegionServer(metaIndex); // First wait
159     * on receipt of meta server shutdown message. while(!listener.metaShutdownReceived)
160     * Threads.sleep(100); while(!listener.isDone()) Threads.sleep(10); // We should not have
161     * retried the close more times than it took for the // server shutdown message to exit the
162     * delay queue and get processed // (Multiple by two to add in some slop in case of GC or
163     * something). assertTrue(listener.getCloseCount() > 1); assertTrue(listener.getCloseCount() <
164     * ((HBase2428Listener.SERVER_DURATION/HBase2428Listener.CLOSE_DURATION) * 2)); // Assert the
165     * closed region came back online assertRegionIsBackOnline(hri); } finally {
166     * master.getRegionServerOperationQueue(). unregisterRegionServerOperationListener(listener); }
167     */
168  }
169
170  /**
171   * Test adding in a new server before old one on same host+port is dead. Make the test more
172   * onerous by having the server under test carry the meta. If confusion between old and new,
173   * purportedly meta never comes back. Test that meta gets redeployed.
174   */
175  @Ignore
176  @Test
177  public void testAddingServerBeforeOldIsDead2413() throws IOException {
178    /*
179     * LOG.info("Running testAddingServerBeforeOldIsDead2413"); MiniHBaseCluster cluster =
180     * TEST_UTIL.getHBaseCluster(); int count = count(); int metaIndex =
181     * cluster.getServerWithMeta(); MiniHBaseClusterRegionServer metaHRS =
182     * (MiniHBaseClusterRegionServer)cluster.getRegionServer(metaIndex); int port =
183     * metaHRS.getServerInfo().getServerAddress().getPort(); Configuration c =
184     * TEST_UTIL.getConfiguration(); String oldPort = c.get(HConstants.REGIONSERVER_PORT, "0"); try
185     * { LOG.info("KILLED=" + metaHRS); metaHRS.kill(); c.set(HConstants.REGIONSERVER_PORT,
186     * Integer.toString(port)); // Try and start new regionserver. It might clash with the old //
187     * regionserver port so keep trying to get past the BindException. HRegionServer hrs = null;
188     * while (true) { try { hrs = cluster.startRegionServer().getRegionServer(); break; } catch
189     * (IOException e) { if (e.getCause() != null && e.getCause() instanceof
190     * InvocationTargetException) { InvocationTargetException ee =
191     * (InvocationTargetException)e.getCause(); if (ee.getCause() != null && ee.getCause()
192     * instanceof BindException) { LOG.info("BindException; retrying: " + e.toString()); } } } }
193     * LOG.info("STARTED=" + hrs); // Wait until he's been given at least 3 regions before we go on
194     * to try // and count rows in table. while (hrs.getOnlineRegions().size() < 3)
195     * Threads.sleep(100); LOG.info(hrs.toString() + " has " + hrs.getOnlineRegions().size() +
196     * " regions"); assertEquals(count, count()); } finally { c.set(HConstants.REGIONSERVER_PORT,
197     * oldPort); }
198     */
199  }
200
201  /**
202   * HBase2482 is about outstanding region openings. If any are outstanding when a regionserver goes
203   * down, then they'll never deploy. They'll be stuck in the regions-in-transition list for ever.
   * This listener looks for a region opening HMsg and if it's from the server passed on
205   * construction, then we kill it. It also looks out for a close message on the victim server
206   * because that signifies start of the fireworks.
207   */
208  /*
209   * static class HBase2482Listener implements RegionServerOperationListener { private final
210   * HRegionServer victim; private boolean abortSent = false; // We closed regions on new server.
211   * private volatile boolean closed = false; // Copy of regions on new server private final
212   * Collection<HRegion> copyOfOnlineRegions; // This is the region that was in transition on the
213   * server we aborted. Test // passes if this region comes back online successfully. private
214   * RegionInfo regionToFind; HBase2482Listener(final HRegionServer victim) { this.victim = victim;
215   * // Copy regions currently open on this server so I can notice when // there is a close.
216   * this.copyOfOnlineRegions = this.victim.getCopyOfOnlineRegionsSortedBySize().values(); }
217   * @Override public boolean process(HServerInfo serverInfo, HMsg incomingMsg) { if
218   * (!victim.getServerInfo().equals(serverInfo) || this.abortSent || !this.closed) { return true; }
219   * if (!incomingMsg.isType(HMsg.Type.MSG_REPORT_PROCESS_OPEN)) return true; // Save the region
220   * that is in transition so can test later it came back. this.regionToFind =
221   * incomingMsg.getRegionInfo(); String msg = "ABORTING " + this.victim + " because got a " +
222   * HMsg.Type.MSG_REPORT_PROCESS_OPEN + " on this server for " +
223   * incomingMsg.getRegionInfo().getRegionNameAsString(); this.victim.abort(msg); this.abortSent =
224   * true; return true; }
225   * @Override public boolean process(RegionServerOperation op) throws IOException { return true; }
226   * @Override public void processed(RegionServerOperation op) { if (this.closed || !(op instanceof
227   * ProcessRegionClose)) return; ProcessRegionClose close = (ProcessRegionClose)op; for (HRegion r:
228   * this.copyOfOnlineRegions) { if (r.getRegionInfo().equals(close.regionInfo)) { // We've closed
229   * one of the regions that was on the victim server. // Now can start testing for when all regions
230   * are back online again LOG.info("Found close of " + r.getRegionInfo().getRegionNameAsString() +
231   * "; setting close happened flag"); this.closed = true; break; } } } }
232   */
233  /**
234   * In 2482, a RS with an opening region on it dies. The said region is then stuck in the master's
235   * regions-in-transition and never leaves it. This test works by bringing up a new regionserver,
236   * waiting for the load balancer to give it some regions. Then, we close all on the new server.
237   * After sending all the close messages, we send the new regionserver the special blocking message
238   * so it can not process any more messages. Meantime reopening of the just-closed regions is
239   * backed up on the new server. Soon as master gets an opening region from the new regionserver,
240   * we kill it. We then wait on all regions to come back on line. If bug is fixed, this should
241   * happen soon as the processing of the killed server is done.
242   * @see <a href="https://issues.apache.org/jira/browse/HBASE-2482">HBASE-2482</a>
243   */
244  @Ignore
245  @Test
246  public void testKillRSWithOpeningRegion2482() throws Exception {
247    /*
248     * LOG.info("Running testKillRSWithOpeningRegion2482"); MiniHBaseCluster cluster =
249     * TEST_UTIL.getHBaseCluster(); if (cluster.getLiveRegionServerThreads().size() < 2) { // Need
250     * at least two servers. cluster.startRegionServer(); } // Count how many regions are online.
251     * They need to be all back online for // this test to succeed. int countOfMetaRegions =
252     * countOfMetaRegions(); // Add a listener on the server. HMaster m = cluster.getMaster(); //
253     * Start new regionserver. MiniHBaseClusterRegionServer hrs =
254     * (MiniHBaseClusterRegionServer)cluster.startRegionServer().getRegionServer();
255     * LOG.info("Started new regionserver: " + hrs.toString()); // Wait until has some regions
256     * before proceeding. Balancer will give it some. int minimumRegions =
257     * countOfMetaRegions/(cluster.getRegionServerThreads().size() * 2); while
258     * (hrs.getOnlineRegions().size() < minimumRegions) Threads.sleep(100); // Set the listener only
259     * after some regions have been opened on new server. HBase2482Listener listener = new
260     * HBase2482Listener(hrs); m.getRegionServerOperationQueue().
261     * registerRegionServerOperationListener(listener); try { // Go close all non-catalog regions on
262     * this new server closeAllNonCatalogRegions(cluster, hrs); // After all closes, add blocking
263     * message before the region opens start to // come in.
264     * cluster.addMessageToSendRegionServer(hrs, new HMsg(HMsg.Type.TESTING_BLOCK_REGIONSERVER)); //
265     * Wait till one of the above close messages has an effect before we start // wait on all
266     * regions back online. while (!listener.closed) Threads.sleep(100); LOG.info("Past close"); //
267     * Make sure the abort server message was sent. while(!listener.abortSent) Threads.sleep(100);
268     * LOG.info("Past abort send; waiting on all regions to redeploy"); // Now wait for regions to
269     * come back online. assertRegionIsBackOnline(listener.regionToFind); } finally {
270     * m.getRegionServerOperationQueue(). unregisterRegionServerOperationListener(listener); }
271     */
272  }
273
274  /*
275   * @return Count of all non-catalog regions on the designated server
276   */
277  /*
278   * private int closeAllNonCatalogRegions(final MiniHBaseCluster cluster, final
279   * MiniHBaseCluster.MiniHBaseClusterRegionServer hrs) throws IOException { int countOfRegions = 0;
280   * for (HRegion r: hrs.getOnlineRegions()) { if (r.getRegionInfo().isMetaRegion()) continue;
281   * cluster.addMessageToSendRegionServer(hrs, new HMsg(HMsg.Type.MSG_REGION_CLOSE,
282   * r.getRegionInfo())); LOG.info("Sent close of " + r.getRegionInfo().getRegionNameAsString() +
283   * " on " + hrs.toString()); countOfRegions++; } return countOfRegions; } private void
284   * assertRegionIsBackOnline(final RegionInfo hri) throws IOException { // Region should have an
285   * entry in its startkey because of addRowToEachRegion. byte [] row = getStartKey(hri); HTable t =
286   * new HTable(TEST_UTIL.getConfiguration(), TABLENAME); Get g = new Get(row);
287   * assertTrue((t.get(g)).size() > 0); } /*
288   * @return Count of regions in meta table.
289   */
290  /*
291   * private static int countOfMetaRegions() throws IOException { HTable meta = new
292   * HTable(TEST_UTIL.getConfiguration(), HConstants.META_TABLE_NAME); int rows = 0; Scan scan = new
293   * Scan(); scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER); ResultScanner s
294   * = meta.getScanner(scan); for (Result r = null; (r = s.next()) != null;) { byte [] b =
295   * r.getValue(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER); if (b == null || b.length
296   * <= 0) break; rows++; } s.close(); return rows; }
297   */
298  /*
299   * Add to each of the regions in hbase:meta a value. Key is the startrow of the region (except its
300   * 'aaa' for first region). Actual value is the row name.
301   */
302  private static int addToEachStartKey(final int expected) throws IOException {
303    Table t = TEST_UTIL.getConnection().getTable(TABLENAME);
304    Table meta = TEST_UTIL.getConnection().getTable(TableName.META_TABLE_NAME);
305    int rows = 0;
306    Scan scan = new Scan();
307    scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
308    ResultScanner s = meta.getScanner(scan);
309    for (Result r = null; (r = s.next()) != null;) {
310      RegionInfo hri = CatalogFamilyFormat.getRegionInfo(r);
311      if (hri == null) break;
312      if (!hri.getTable().equals(TABLENAME)) {
313        continue;
314      }
315
316      // If start key, add 'aaa'.
317      if (!hri.getTable().equals(TABLENAME)) {
318        continue;
319      }
320      byte[] row = getStartKey(hri);
321      Put p = new Put(row);
322      p.setDurability(Durability.SKIP_WAL);
323      p.addColumn(getTestFamily(), getTestQualifier(), row);
324      t.put(p);
325      rows++;
326    }
327    s.close();
328    Assert.assertEquals(expected, rows);
329    t.close();
330    meta.close();
331    return rows;
332  }
333
334  /*
335   * @return Start key for hri (If start key is '', then return 'aaa'.
336   */
337  private static byte[] getStartKey(final RegionInfo hri) {
338    return Bytes.equals(HConstants.EMPTY_START_ROW, hri.getStartKey())
339      ? Bytes.toBytes("aaa")
340      : hri.getStartKey();
341  }
342
343  private static byte[] getTestFamily() {
344    return FAMILIES[0];
345  }
346
347  private static byte[] getTestQualifier() {
348    return getTestFamily();
349  }
350}