001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.regionserver;
019
020import static org.junit.Assert.assertEquals;
021import static org.junit.Assert.assertTrue;
022
023import java.io.IOException;
024import java.util.List;
025import java.util.Map;
026import java.util.concurrent.atomic.AtomicBoolean;
027import java.util.concurrent.atomic.AtomicReference;
028import org.apache.hadoop.conf.Configuration;
029import org.apache.hadoop.hbase.HBaseClassTestRule;
030import org.apache.hadoop.hbase.HBaseConfiguration;
031import org.apache.hadoop.hbase.HBaseTestingUtility;
032import org.apache.hadoop.hbase.LocalHBaseCluster;
033import org.apache.hadoop.hbase.MiniHBaseCluster;
034import org.apache.hadoop.hbase.ServerName;
035import org.apache.hadoop.hbase.client.RegionInfo;
036import org.apache.hadoop.hbase.master.HMaster;
037import org.apache.hadoop.hbase.master.LoadBalancer;
038import org.apache.hadoop.hbase.master.ServerListener;
039import org.apache.hadoop.hbase.master.ServerManager;
040import org.apache.hadoop.hbase.testclassification.MediumTests;
041import org.apache.hadoop.hbase.testclassification.RegionServerTests;
042import org.apache.hadoop.hbase.util.Bytes;
043import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
044import org.apache.hadoop.hbase.util.Threads;
045import org.junit.ClassRule;
046import org.junit.Ignore;
047import org.junit.Rule;
048import org.junit.Test;
049import org.junit.experimental.categories.Category;
050import org.junit.rules.TestName;
051import org.slf4j.Logger;
052import org.slf4j.LoggerFactory;
053
054import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionServerStartupResponse;
055
056/**
057 * Tests that a regionserver that dies after reporting for duty gets removed
058 * from list of online regions. See HBASE-9593.
059 */
060@Category({RegionServerTests.class, MediumTests.class})
061@Ignore("See HBASE-19515")
062public class TestRSKilledWhenInitializing {
063
064  @ClassRule
065  public static final HBaseClassTestRule CLASS_RULE =
066      HBaseClassTestRule.forClass(TestRSKilledWhenInitializing.class);
067
068  private static final Logger LOG = LoggerFactory.getLogger(TestRSKilledWhenInitializing.class);
069
070  @Rule
071  public TestName testName = new TestName();
072
073  // This boolean needs to be globally available. It is used below in our
074  // mocked up regionserver so it knows when to die.
075  private static AtomicBoolean masterActive = new AtomicBoolean(false);
076  // Ditto for this variable. It also is used in the mocked regionserver class.
077  private static final AtomicReference<ServerName> killedRS = new AtomicReference<ServerName>();
078
079  private static final int NUM_MASTERS = 1;
080  private static final int NUM_RS = 2;
081
082  /**
083   * Test verifies whether a region server is removed from online servers list in master if it went
084   * down after registering with master. Test will TIMEOUT if an error!!!!
085   * @throws Exception
086   */
087  @Test
088  public void testRSTerminationAfterRegisteringToMasterBeforeCreatingEphemeralNode()
089  throws Exception {
090    // Create config to use for this cluster
091    Configuration conf = HBaseConfiguration.create();
092    conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 1);
093    // Start the cluster
094    final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf);
095    TEST_UTIL.startMiniDFSCluster(3);
096    TEST_UTIL.startMiniZKCluster();
097    TEST_UTIL.createRootDir();
098    final LocalHBaseCluster cluster =
099        new LocalHBaseCluster(conf, NUM_MASTERS, NUM_RS, HMaster.class,
100            RegisterAndDieRegionServer.class);
101    final MasterThread master = startMaster(cluster.getMasters().get(0));
102    try {
103      // Master is up waiting on RegionServers to check in. Now start RegionServers.
104      for (int i = 0; i < NUM_RS; i++) {
105        cluster.getRegionServers().get(i).start();
106      }
107      // Expected total regionservers depends on whether Master can host regions or not.
108      int expectedTotalRegionServers = NUM_RS + (LoadBalancer.isTablesOnMaster(conf)? 1: 0);
109      List<ServerName> onlineServersList = null;
110      do {
111        onlineServersList = master.getMaster().getServerManager().getOnlineServersList();
112      } while (onlineServersList.size() < expectedTotalRegionServers);
113      // Wait until killedRS is set. Means RegionServer is starting to go down.
114      while (killedRS.get() == null) {
115        Threads.sleep(1);
116      }
117      // Wait on the RegionServer to fully die.
118      while (cluster.getLiveRegionServers().size() >= expectedTotalRegionServers) {
119        Threads.sleep(1);
120      }
121      // Make sure Master is fully up before progressing. Could take a while if regions
122      // being reassigned.
123      while (!master.getMaster().isInitialized()) {
124        Threads.sleep(1);
125      }
126
127      // Now in steady state. How many regions open? Master should have too many regionservers
128      // showing still. The downed RegionServer should still be showing as registered.
129      assertTrue(master.getMaster().getServerManager().isServerOnline(killedRS.get()));
130      // Find non-meta region (namespace?) and assign to the killed server. That'll trigger cleanup.
131      Map<RegionInfo, ServerName> assignments = null;
132      do {
133        assignments = master.getMaster().getAssignmentManager().getRegionStates().getRegionAssignments();
134      } while (assignments == null || assignments.size() < 2);
135      RegionInfo hri = null;
136      for (Map.Entry<RegionInfo, ServerName> e: assignments.entrySet()) {
137        if (e.getKey().isMetaRegion()) continue;
138        hri = e.getKey();
139        break;
140      }
141      // Try moving region to the killed server. It will fail. As by-product, we will
142      // remove the RS from Master online list because no corresponding znode.
143      assertEquals(expectedTotalRegionServers,
144        master.getMaster().getServerManager().getOnlineServersList().size());
145      LOG.info("Move " + hri.getEncodedName() + " to " + killedRS.get());
146      master.getMaster().move(hri.getEncodedNameAsBytes(),
147          Bytes.toBytes(killedRS.get().toString()));
148
149      // TODO: This test could do more to verify fix. It could create a table
150      // and do round-robin assign. It should fail if zombie RS. HBASE-19515.
151
152      // Wait until the RS no longer shows as registered in Master.
153      while (onlineServersList.size() > (NUM_RS + 1)) {
154        Thread.sleep(100);
155        onlineServersList = master.getMaster().getServerManager().getOnlineServersList();
156      }
157    } finally {
158      // Shutdown is messy with complaints about fs being closed. Why? TODO.
159      cluster.shutdown();
160      cluster.join();
161      TEST_UTIL.shutdownMiniDFSCluster();
162      TEST_UTIL.shutdownMiniZKCluster();
163      TEST_UTIL.cleanupTestDir();
164    }
165  }
166
167  /**
168   * Start Master. Get as far as the state where Master is waiting on
169   * RegionServers to check in, then return.
170   */
171  private MasterThread startMaster(MasterThread master) {
172    master.start();
173    // It takes a while until ServerManager creation to happen inside Master startup.
174    while (master.getMaster().getServerManager() == null) {
175      continue;
176    }
177    // Set a listener for the waiting-on-RegionServers state. We want to wait
178    // until this condition before we leave this method and start regionservers.
179    final AtomicBoolean waiting = new AtomicBoolean(false);
180    if (master.getMaster().getServerManager() == null) throw new NullPointerException("SM");
181    master.getMaster().getServerManager().registerListener(new ServerListener() {
182      @Override
183      public void waiting() {
184        waiting.set(true);
185      }
186    });
187    // Wait until the Master gets to place where it is waiting on RegionServers to check in.
188    while (!waiting.get()) {
189      continue;
190    }
191    // Set the global master-is-active; gets picked up by regionservers later.
192    masterActive.set(true);
193    return master;
194  }
195
196  /**
197   * A RegionServer that reports for duty and then immediately dies if it is the first to receive
198   * the response to a reportForDuty. When it dies, it clears its ephemeral znode which the master
199   * notices and so removes the region from its set of online regionservers.
200   */
201  static class RegisterAndDieRegionServer extends MiniHBaseCluster.MiniHBaseClusterRegionServer {
202    public RegisterAndDieRegionServer(Configuration conf)
203    throws IOException, InterruptedException {
204      super(conf);
205    }
206
207    @Override
208    protected void handleReportForDutyResponse(RegionServerStartupResponse c)
209    throws IOException {
210      if (killedRS.compareAndSet(null, getServerName())) {
211        // Make sure Master is up so it will see the removal of the ephemeral znode for this RS.
212        while (!masterActive.get()) {
213          Threads.sleep(100);
214        }
215        super.kill();
216      } else {
217        super.handleReportForDutyResponse(c);
218      }
219    }
220  }
221}