001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.master;
019
020import java.io.IOException;
021import org.apache.hadoop.hbase.HBaseClassTestRule;
022import org.apache.hadoop.hbase.HBaseTestingUtility;
023import org.apache.hadoop.hbase.HConstants;
024import org.apache.hadoop.hbase.MetaTableAccessor;
025import org.apache.hadoop.hbase.TableName;
026import org.apache.hadoop.hbase.client.Durability;
027import org.apache.hadoop.hbase.client.Put;
028import org.apache.hadoop.hbase.client.RegionInfo;
029import org.apache.hadoop.hbase.client.RegionLocator;
030import org.apache.hadoop.hbase.client.Result;
031import org.apache.hadoop.hbase.client.ResultScanner;
032import org.apache.hadoop.hbase.client.Scan;
033import org.apache.hadoop.hbase.client.Table;
034import org.apache.hadoop.hbase.testclassification.LargeTests;
035import org.apache.hadoop.hbase.testclassification.MasterTests;
036import org.apache.hadoop.hbase.util.Bytes;
037import org.junit.AfterClass;
038import org.junit.Assert;
039import org.junit.Before;
040import org.junit.BeforeClass;
041import org.junit.ClassRule;
042import org.junit.Ignore;
043import org.junit.Test;
044import org.junit.experimental.categories.Category;
045import org.slf4j.Logger;
046import org.slf4j.LoggerFactory;
047
048/**
049 * Test transitions of state across the master.  Sets up the cluster once and
050 * then runs a couple of tests.
051 */
052@Category({MasterTests.class, LargeTests.class})
053public class TestMasterTransitions {
054
055  @ClassRule
056  public static final HBaseClassTestRule CLASS_RULE =
057      HBaseClassTestRule.forClass(TestMasterTransitions.class);
058
059  private static final Logger LOG = LoggerFactory.getLogger(TestMasterTransitions.class);
060  private static final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
061  private static final TableName TABLENAME = TableName.valueOf("master_transitions");
062  private static final byte [][] FAMILIES = new byte [][] {Bytes.toBytes("a"),
063    Bytes.toBytes("b"), Bytes.toBytes("c")};
064
065  /**
066   * Start up a mini cluster and put a small table of many empty regions into it.
067   * @throws Exception
068   */
069  @BeforeClass public static void beforeAllTests() throws Exception {
070    TEST_UTIL.startMiniCluster(2);
071    // Create a table of three families.  This will assign a region.
072    TEST_UTIL.createMultiRegionTable(TABLENAME, FAMILIES);
073    Table t = TEST_UTIL.getConnection().getTable(TABLENAME);
074    int countOfRegions = -1;
075    try (RegionLocator r = TEST_UTIL.getConnection().getRegionLocator(TABLENAME)) {
076      countOfRegions = r.getStartKeys().length;
077    }
078    TEST_UTIL.waitUntilAllRegionsAssigned(TABLENAME);
079    addToEachStartKey(countOfRegions);
080    t.close();
081  }
082
083  @AfterClass public static void afterAllTests() throws Exception {
084    TEST_UTIL.shutdownMiniCluster();
085  }
086
087  @Before public void setup() throws IOException {
088    TEST_UTIL.ensureSomeRegionServersAvailable(2);
089  }
090
091  /**
092   * Listener for regionserver events testing hbase-2428 (Infinite loop of
093   * region closes if hbase:meta region is offline).  In particular, listen
094   * for the close of the 'metaServer' and when it comes in, requeue it with a
095   * delay as though there were an issue processing the shutdown.  As part of
096   * the requeuing,  send over a close of a region on 'otherServer' so it comes
097   * into a master that has its meta region marked as offline.
098   */
099  /*
100  static class HBase2428Listener implements RegionServerOperationListener {
    // Set of what we've delayed so we don't do repeated delays.
102    private final Set<RegionServerOperation> postponed =
103      new CopyOnWriteArraySet<RegionServerOperation>();
104    private boolean done = false;;
105    private boolean metaShutdownReceived = false;
106    private final HServerAddress metaAddress;
107    private final MiniHBaseCluster cluster;
108    private final int otherServerIndex;
109    private final RegionInfo hri;
110    private int closeCount = 0;
111    static final int SERVER_DURATION = 3 * 1000;
112    static final int CLOSE_DURATION = 1 * 1000;
113
114    HBase2428Listener(final MiniHBaseCluster c, final HServerAddress metaAddress,
115        final RegionInfo closingHRI, final int otherServerIndex) {
116      this.cluster = c;
117      this.metaAddress = metaAddress;
118      this.hri = closingHRI;
119      this.otherServerIndex = otherServerIndex;
120    }
121
122    @Override
123    public boolean process(final RegionServerOperation op) throws IOException {
      // If this is a regionserver shutdown and it is that of the meta server, then
      // we want to delay the processing of the shutdown and send off a close of a
      // region on the 'otherServer'.
127      boolean result = true;
128      if (op instanceof ProcessServerShutdown) {
129        ProcessServerShutdown pss = (ProcessServerShutdown)op;
130        if (pss.getDeadServerAddress().equals(this.metaAddress)) {
131          // Don't postpone more than once.
132          if (!this.postponed.contains(pss)) {
133            // Close some region.
134            this.cluster.addMessageToSendRegionServer(this.otherServerIndex,
135              new HMsg(HMsg.Type.MSG_REGION_CLOSE, hri,
136              Bytes.toBytes("Forcing close in test")));
137            this.postponed.add(pss);
138            // Put off the processing of the regionserver shutdown processing.
139            pss.setDelay(SERVER_DURATION);
140            this.metaShutdownReceived = true;
141            // Return false.  This will add this op to the delayed queue.
142            result = false;
143          }
144        }
145      } else {
146        // Have the close run frequently.
147        if (isWantedCloseOperation(op) != null) {
148          op.setDelay(CLOSE_DURATION);
149          // Count how many times it comes through here.
150          this.closeCount++;
151        }
152      }
153      return result;
154    }
155
156    public void processed(final RegionServerOperation op) {
157      if (isWantedCloseOperation(op) != null) return;
158      this.done = true;
159    }
160*/
161    /*
162     * @param op
163     * @return Null if not the wanted ProcessRegionClose, else <code>op</code>
164     * cast as a ProcessRegionClose.
165     */
166  /*
167    private ProcessRegionClose isWantedCloseOperation(final RegionServerOperation op) {
168      // Count every time we get a close operation.
169      if (op instanceof ProcessRegionClose) {
170        ProcessRegionClose c = (ProcessRegionClose)op;
171        if (c.regionInfo.equals(hri)) {
172          return c;
173        }
174      }
175      return null;
176    }
177
178    boolean isDone() {
179      return this.done;
180    }
181
182    boolean isMetaShutdownReceived() {
183      return metaShutdownReceived;
184    }
185
186    int getCloseCount() {
187      return this.closeCount;
188    }
189
190    @Override
191    public boolean process(HServerInfo serverInfo, HMsg incomingMsg) {
192      return true;
193    }
194  }
195*/
196  /**
197   * In 2428, the meta region has just been set offline and then a close comes
198   * in.
199   * @see <a href="https://issues.apache.org/jira/browse/HBASE-2428">HBASE-2428</a>
200   */
201  @Ignore @Test
202  public void testRegionCloseWhenNoMetaHBase2428()
203  throws Exception {
204    /*
205    LOG.info("Running testRegionCloseWhenNoMetaHBase2428");
206    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
207    final HMaster master = cluster.getMaster();
208    int metaIndex = cluster.getServerWithMeta();
209    // Figure the index of the server that is not server the hbase:meta
210    int otherServerIndex = -1;
211    for (int i = 0; i < cluster.getRegionServerThreads().size(); i++) {
212      if (i == metaIndex) continue;
213      otherServerIndex = i;
214      break;
215    }
216    final HRegionServer otherServer = cluster.getRegionServer(otherServerIndex);
217    final HRegionServer metaHRS = cluster.getRegionServer(metaIndex);
218
219    // Get a region out on the otherServer.
220    final RegionInfo hri =
221      otherServer.getOnlineRegions().iterator().next().getRegionInfo();
222
223    // Add our RegionServerOperationsListener
224    HBase2428Listener listener = new HBase2428Listener(cluster,
225      metaHRS.getHServerInfo().getServerAddress(), hri, otherServerIndex);
226    master.getRegionServerOperationQueue().
227      registerRegionServerOperationListener(listener);
228    try {
229      // Now close the server carrying meta.
230      cluster.abortRegionServer(metaIndex);
231
232      // First wait on receipt of meta server shutdown message.
233      while(!listener.metaShutdownReceived) Threads.sleep(100);
234      while(!listener.isDone()) Threads.sleep(10);
235      // We should not have retried the close more times than it took for the
236      // server shutdown message to exit the delay queue and get processed
237      // (Multiple by two to add in some slop in case of GC or something).
238      assertTrue(listener.getCloseCount() > 1);
239      assertTrue(listener.getCloseCount() <
240        ((HBase2428Listener.SERVER_DURATION/HBase2428Listener.CLOSE_DURATION) * 2));
241
242      // Assert the closed region came back online
243      assertRegionIsBackOnline(hri);
244    } finally {
245      master.getRegionServerOperationQueue().
246        unregisterRegionServerOperationListener(listener);
247    }
248    */
249  }
250
251  /**
252   * Test adding in a new server before old one on same host+port is dead.
253   * Make the test more onerous by having the server under test carry the meta.
254   * If confusion between old and new, purportedly meta never comes back.  Test
255   * that meta gets redeployed.
256   */
257  @Ignore @Test
258  public void testAddingServerBeforeOldIsDead2413()
259  throws IOException {
260    /*
261    LOG.info("Running testAddingServerBeforeOldIsDead2413");
262    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
263    int count = count();
264    int metaIndex = cluster.getServerWithMeta();
265    MiniHBaseClusterRegionServer metaHRS =
266      (MiniHBaseClusterRegionServer)cluster.getRegionServer(metaIndex);
267    int port = metaHRS.getServerInfo().getServerAddress().getPort();
268    Configuration c = TEST_UTIL.getConfiguration();
269    String oldPort = c.get(HConstants.REGIONSERVER_PORT, "0");
270    try {
271      LOG.info("KILLED=" + metaHRS);
272      metaHRS.kill();
273      c.set(HConstants.REGIONSERVER_PORT, Integer.toString(port));
274      // Try and start new regionserver.  It might clash with the old
275      // regionserver port so keep trying to get past the BindException.
276      HRegionServer hrs = null;
277      while (true) {
278        try {
279          hrs = cluster.startRegionServer().getRegionServer();
280          break;
281        } catch (IOException e) {
282          if (e.getCause() != null && e.getCause() instanceof InvocationTargetException) {
283            InvocationTargetException ee = (InvocationTargetException)e.getCause();
284            if (ee.getCause() != null && ee.getCause() instanceof BindException) {
285              LOG.info("BindException; retrying: " + e.toString());
286            }
287          }
288        }
289      }
290      LOG.info("STARTED=" + hrs);
291      // Wait until he's been given at least 3 regions before we go on to try
292      // and count rows in table.
293      while (hrs.getOnlineRegions().size() < 3) Threads.sleep(100);
294      LOG.info(hrs.toString() + " has " + hrs.getOnlineRegions().size() +
295        " regions");
296      assertEquals(count, count());
297    } finally {
298      c.set(HConstants.REGIONSERVER_PORT, oldPort);
299    }
300    */
301  }
302
303  /**
304   * HBase2482 is about outstanding region openings.  If any are outstanding
305   * when a regionserver goes down, then they'll never deploy.  They'll be
306   * stuck in the regions-in-transition list for ever.  This listener looks
307   * for a region opening HMsg and if its from the server passed on construction,
308   * then we kill it.  It also looks out for a close message on the victim
309   * server because that signifies start of the fireworks.
310   */
311  /*
312  static class HBase2482Listener implements RegionServerOperationListener {
313    private final HRegionServer victim;
314    private boolean abortSent = false;
315    // We closed regions on new server.
316    private volatile boolean closed = false;
317    // Copy of regions on new server
318    private final Collection<HRegion> copyOfOnlineRegions;
319    // This is the region that was in transition on the server we aborted. Test
320    // passes if this region comes back online successfully.
321    private RegionInfo regionToFind;
322
323    HBase2482Listener(final HRegionServer victim) {
324      this.victim = victim;
325      // Copy regions currently open on this server so I can notice when
326      // there is a close.
327      this.copyOfOnlineRegions =
328        this.victim.getCopyOfOnlineRegionsSortedBySize().values();
329    }
330
331    @Override
332    public boolean process(HServerInfo serverInfo, HMsg incomingMsg) {
333      if (!victim.getServerInfo().equals(serverInfo) ||
334          this.abortSent || !this.closed) {
335        return true;
336      }
337      if (!incomingMsg.isType(HMsg.Type.MSG_REPORT_PROCESS_OPEN)) return true;
338      // Save the region that is in transition so can test later it came back.
339      this.regionToFind = incomingMsg.getRegionInfo();
340      String msg = "ABORTING " + this.victim + " because got a " +
341        HMsg.Type.MSG_REPORT_PROCESS_OPEN + " on this server for " +
342        incomingMsg.getRegionInfo().getRegionNameAsString();
343      this.victim.abort(msg);
344      this.abortSent = true;
345      return true;
346    }
347
348    @Override
349    public boolean process(RegionServerOperation op) throws IOException {
350      return true;
351    }
352
353    @Override
354    public void processed(RegionServerOperation op) {
355      if (this.closed || !(op instanceof ProcessRegionClose)) return;
356      ProcessRegionClose close = (ProcessRegionClose)op;
357      for (HRegion r: this.copyOfOnlineRegions) {
358        if (r.getRegionInfo().equals(close.regionInfo)) {
359          // We've closed one of the regions that was on the victim server.
360          // Now can start testing for when all regions are back online again
361          LOG.info("Found close of " +
362            r.getRegionInfo().getRegionNameAsString() +
363            "; setting close happened flag");
364          this.closed = true;
365          break;
366        }
367      }
368    }
369  }
370*/
371  /**
372   * In 2482, a RS with an opening region on it dies.  The said region is then
373   * stuck in the master's regions-in-transition and never leaves it.  This
374   * test works by bringing up a new regionserver, waiting for the load
375   * balancer to give it some regions.  Then, we close all on the new server.
376   * After sending all the close messages, we send the new regionserver the
377   * special blocking message so it can not process any more messages.
378   * Meantime reopening of the just-closed regions is backed up on the new
379   * server.  Soon as master gets an opening region from the new regionserver,
380   * we kill it.  We then wait on all regions to come back on line.  If bug
381   * is fixed, this should happen soon as the processing of the killed server is
382   * done.
383   * @see <a href="https://issues.apache.org/jira/browse/HBASE-2482">HBASE-2482</a>
384   */
385  @Ignore @Test
386  public void testKillRSWithOpeningRegion2482()
387  throws Exception {
388    /*
389    LOG.info("Running testKillRSWithOpeningRegion2482");
390    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
391    if (cluster.getLiveRegionServerThreads().size() < 2) {
392      // Need at least two servers.
393      cluster.startRegionServer();
394    }
395    // Count how many regions are online.  They need to be all back online for
396    // this test to succeed.
397    int countOfMetaRegions = countOfMetaRegions();
398    // Add a listener on the server.
399    HMaster m = cluster.getMaster();
400    // Start new regionserver.
401    MiniHBaseClusterRegionServer hrs =
402      (MiniHBaseClusterRegionServer)cluster.startRegionServer().getRegionServer();
403    LOG.info("Started new regionserver: " + hrs.toString());
404    // Wait until has some regions before proceeding.  Balancer will give it some.
405    int minimumRegions =
406      countOfMetaRegions/(cluster.getRegionServerThreads().size() * 2);
407    while (hrs.getOnlineRegions().size() < minimumRegions) Threads.sleep(100);
408    // Set the listener only after some regions have been opened on new server.
409    HBase2482Listener listener = new HBase2482Listener(hrs);
410    m.getRegionServerOperationQueue().
411      registerRegionServerOperationListener(listener);
412    try {
413      // Go close all non-catalog regions on this new server
414      closeAllNonCatalogRegions(cluster, hrs);
415      // After all closes, add blocking message before the region opens start to
416      // come in.
417      cluster.addMessageToSendRegionServer(hrs,
418        new HMsg(HMsg.Type.TESTING_BLOCK_REGIONSERVER));
419      // Wait till one of the above close messages has an effect before we start
420      // wait on all regions back online.
421      while (!listener.closed) Threads.sleep(100);
422      LOG.info("Past close");
423      // Make sure the abort server message was sent.
424      while(!listener.abortSent) Threads.sleep(100);
425      LOG.info("Past abort send; waiting on all regions to redeploy");
426      // Now wait for regions to come back online.
427      assertRegionIsBackOnline(listener.regionToFind);
428    } finally {
429      m.getRegionServerOperationQueue().
430        unregisterRegionServerOperationListener(listener);
431    }
432    */
433  }
434
435  /*
436   * @return Count of all non-catalog regions on the designated server
437   */
438/*
439  private int closeAllNonCatalogRegions(final MiniHBaseCluster cluster,
440    final MiniHBaseCluster.MiniHBaseClusterRegionServer hrs)
441  throws IOException {
442    int countOfRegions = 0;
443    for (HRegion r: hrs.getOnlineRegions()) {
444      if (r.getRegionInfo().isMetaRegion()) continue;
445      cluster.addMessageToSendRegionServer(hrs,
446        new HMsg(HMsg.Type.MSG_REGION_CLOSE, r.getRegionInfo()));
447      LOG.info("Sent close of " + r.getRegionInfo().getRegionNameAsString() +
448        " on " + hrs.toString());
449      countOfRegions++;
450    }
451    return countOfRegions;
452  }
453
454  private void assertRegionIsBackOnline(final RegionInfo hri)
455  throws IOException {
456    // Region should have an entry in its startkey because of addRowToEachRegion.
457    byte [] row = getStartKey(hri);
458    HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
459    Get g =  new Get(row);
460    assertTrue((t.get(g)).size() > 0);
461  }
462
463  /*
464   * @return Count of regions in meta table.
465   * @throws IOException
466   */
467  /*
468  private static int countOfMetaRegions()
469  throws IOException {
470    HTable meta = new HTable(TEST_UTIL.getConfiguration(),
471      HConstants.META_TABLE_NAME);
472    int rows = 0;
473    Scan scan = new Scan();
474    scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
475    ResultScanner s = meta.getScanner(scan);
476    for (Result r = null; (r = s.next()) != null;) {
477      byte [] b =
478        r.getValue(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
479      if (b == null || b.length <= 0) break;
480      rows++;
481    }
482    s.close();
483    return rows;
484  }
485*/
486  /*
487   * Add to each of the regions in hbase:meta a value.  Key is the startrow of the
488   * region (except its 'aaa' for first region).  Actual value is the row name.
489   * @param expected
490   * @return
491   * @throws IOException
492   */
493  private static int addToEachStartKey(final int expected) throws IOException {
494    Table t = TEST_UTIL.getConnection().getTable(TABLENAME);
495    Table meta = TEST_UTIL.getConnection().getTable(TableName.META_TABLE_NAME);
496    int rows = 0;
497    Scan scan = new Scan();
498    scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
499    ResultScanner s = meta.getScanner(scan);
500    for (Result r = null; (r = s.next()) != null;) {
501      RegionInfo hri = MetaTableAccessor.getRegionInfo(r);
502      if (hri == null) break;
503      if (!hri.getTable().equals(TABLENAME)) {
504        continue;
505      }
506
507      // If start key, add 'aaa'.
508      if(!hri.getTable().equals(TABLENAME)) {
509        continue;
510      }
511      byte [] row = getStartKey(hri);
512      Put p = new Put(row);
513      p.setDurability(Durability.SKIP_WAL);
514      p.addColumn(getTestFamily(), getTestQualifier(), row);
515      t.put(p);
516      rows++;
517    }
518    s.close();
519    Assert.assertEquals(expected, rows);
520    t.close();
521    meta.close();
522    return rows;
523  }
524
525  /*
526   * @param hri
527   * @return Start key for hri (If start key is '', then return 'aaa'.
528   */
529  private static byte [] getStartKey(final RegionInfo hri) {
530    return Bytes.equals(HConstants.EMPTY_START_ROW, hri.getStartKey())?
531        Bytes.toBytes("aaa"): hri.getStartKey();
532  }
533
534  private static byte [] getTestFamily() {
535    return FAMILIES[0];
536  }
537
538  private static byte [] getTestQualifier() {
539    return getTestFamily();
540  }
541}