001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.master;
019
020import static org.junit.Assert.assertEquals;
021
022import java.io.IOException;
023import java.util.List;
024import java.util.NavigableSet;
025import java.util.Set;
026import java.util.TreeSet;
027import org.apache.hadoop.conf.Configuration;
028import org.apache.hadoop.hbase.HBaseClassTestRule;
029import org.apache.hadoop.hbase.HBaseConfiguration;
030import org.apache.hadoop.hbase.HBaseTestingUtility;
031import org.apache.hadoop.hbase.MiniHBaseCluster;
032import org.apache.hadoop.hbase.ServerName;
033import org.apache.hadoop.hbase.TableName;
034import org.apache.hadoop.hbase.client.RegionInfo;
035import org.apache.hadoop.hbase.client.RegionLocator;
036import org.apache.hadoop.hbase.client.Table;
037import org.apache.hadoop.hbase.testclassification.LargeTests;
038import org.apache.hadoop.hbase.testclassification.MasterTests;
039import org.apache.hadoop.hbase.util.Bytes;
040import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
041import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
042import org.junit.ClassRule;
043import org.junit.Rule;
044import org.junit.Test;
045import org.junit.experimental.categories.Category;
046import org.junit.rules.TestName;
047import org.slf4j.Logger;
048import org.slf4j.LoggerFactory;
049
050import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
051
052/**
053 * Tests the restarting of everything as done during rolling restarts.
054 */
055@Category({MasterTests.class, LargeTests.class})
056public class TestRollingRestart {
057
058  @ClassRule
059  public static final HBaseClassTestRule CLASS_RULE =
060      HBaseClassTestRule.forClass(TestRollingRestart.class);
061
062  private static final Logger LOG = LoggerFactory.getLogger(TestRollingRestart.class);
063
064  @Rule
065  public TestName name = new TestName();
066
067  @Test
068  public void testBasicRollingRestart() throws Exception {
069
070    // Start a cluster with 2 masters and 4 regionservers
071    final int NUM_MASTERS = 2;
072    final int NUM_RS = 3;
073    final int NUM_REGIONS_TO_CREATE = 20;
074
075    int expectedNumRS = 3;
076
077    // Start the cluster
078    log("Starting cluster");
079    Configuration conf = HBaseConfiguration.create();
080    HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf);
081    TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
082    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
083    log("Waiting for active/ready master");
084    cluster.waitForActiveAndReadyMaster();
085
086    // Create a table with regions
087    final TableName tableName = TableName.valueOf(name.getMethodName());
088    byte [] family = Bytes.toBytes("family");
089    log("Creating table with " + NUM_REGIONS_TO_CREATE + " regions");
090    Table ht = TEST_UTIL.createMultiRegionTable(tableName, family, NUM_REGIONS_TO_CREATE);
091    int numRegions = -1;
092    try (RegionLocator r = TEST_UTIL.getConnection().getRegionLocator(tableName)) {
093      numRegions = r.getStartKeys().length;
094    }
095    numRegions += 1; // catalogs
096    log("Waiting for no more RIT\n");
097    TEST_UTIL.waitUntilNoRegionsInTransition(60000);
098    log("Disabling table\n");
099    TEST_UTIL.getAdmin().disableTable(tableName);
100    log("Waiting for no more RIT\n");
101    TEST_UTIL.waitUntilNoRegionsInTransition(60000);
102    NavigableSet<String> regions = HBaseTestingUtility.getAllOnlineRegions(cluster);
103    log("Verifying only catalog and namespace regions are assigned\n");
104    if (regions.size() != 2) {
105      for (String oregion : regions) log("Region still online: " + oregion);
106    }
107    assertEquals(2, regions.size());
108    log("Enabling table\n");
109    TEST_UTIL.getAdmin().enableTable(tableName);
110    log("Waiting for no more RIT\n");
111    TEST_UTIL.waitUntilNoRegionsInTransition(60000);
112    log("Verifying there are " + numRegions + " assigned on cluster\n");
113    regions = HBaseTestingUtility.getAllOnlineRegions(cluster);
114    assertRegionsAssigned(cluster, regions);
115    assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
116
117    // Add a new regionserver
118    log("Adding a fourth RS");
119    RegionServerThread restarted = cluster.startRegionServer();
120    expectedNumRS++;
121    restarted.waitForServerOnline();
122    log("Additional RS is online");
123    log("Waiting for no more RIT");
124    TEST_UTIL.waitUntilNoRegionsInTransition(60000);
125    log("Verifying there are " + numRegions + " assigned on cluster");
126    assertRegionsAssigned(cluster, regions);
127    assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
128
129    // Master Restarts
130    List<MasterThread> masterThreads = cluster.getMasterThreads();
131    MasterThread activeMaster = null;
132    MasterThread backupMaster = null;
133    assertEquals(2, masterThreads.size());
134    if (masterThreads.get(0).getMaster().isActiveMaster()) {
135      activeMaster = masterThreads.get(0);
136      backupMaster = masterThreads.get(1);
137    } else {
138      activeMaster = masterThreads.get(1);
139      backupMaster = masterThreads.get(0);
140    }
141
142    // Bring down the backup master
143    log("Stopping backup master\n\n");
144    backupMaster.getMaster().stop("Stop of backup during rolling restart");
145    cluster.hbaseCluster.waitOnMaster(backupMaster);
146
147    // Bring down the primary master
148    log("Stopping primary master\n\n");
149    activeMaster.getMaster().stop("Stop of active during rolling restart");
150    cluster.hbaseCluster.waitOnMaster(activeMaster);
151
152    // Start primary master
153    log("Restarting primary master\n\n");
154    activeMaster = cluster.startMaster();
155    cluster.waitForActiveAndReadyMaster();
156
157    // Start backup master
158    log("Restarting backup master\n\n");
159    backupMaster = cluster.startMaster();
160
161    assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
162
163    // RegionServer Restarts
164
165    // Bring them down, one at a time, waiting between each to complete
166    List<RegionServerThread> regionServers =
167      cluster.getLiveRegionServerThreads();
168    int num = 1;
169    int total = regionServers.size();
170    for (RegionServerThread rst : regionServers) {
171      ServerName serverName = rst.getRegionServer().getServerName();
172      log("Stopping region server " + num + " of " + total + " [ " +
173          serverName + "]");
174      rst.getRegionServer().stop("Stopping RS during rolling restart");
175      cluster.hbaseCluster.waitOnRegionServer(rst);
176      log("Waiting for RS shutdown to be handled by master");
177      waitForRSShutdownToStartAndFinish(activeMaster, serverName);
178      log("RS shutdown done, waiting for no more RIT");
179      TEST_UTIL.waitUntilNoRegionsInTransition(60000);
180      log("Verifying there are " + numRegions + " assigned on cluster");
181      assertRegionsAssigned(cluster, regions);
182      expectedNumRS--;
183      assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
184      log("Restarting region server " + num + " of " + total);
185      restarted = cluster.startRegionServer();
186      restarted.waitForServerOnline();
187      expectedNumRS++;
188      log("Region server " + num + " is back online");
189      log("Waiting for no more RIT");
190      TEST_UTIL.waitUntilNoRegionsInTransition(60000);
191      log("Verifying there are " + numRegions + " assigned on cluster");
192      assertRegionsAssigned(cluster, regions);
193      assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
194      num++;
195    }
196    Thread.sleep(1000);
197    assertRegionsAssigned(cluster, regions);
198
199    // TODO: Bring random 3 of 4 RS down at the same time
200
201    ht.close();
202    // Stop the cluster
203    TEST_UTIL.shutdownMiniCluster();
204  }
205
206  private void waitForRSShutdownToStartAndFinish(MasterThread activeMaster,
207      ServerName serverName) throws InterruptedException {
208    ServerManager sm = activeMaster.getMaster().getServerManager();
209    // First wait for it to be in dead list
210    while (!sm.getDeadServers().isDeadServer(serverName)) {
211      log("Waiting for [" + serverName + "] to be listed as dead in master");
212      Thread.sleep(1);
213    }
214    log("Server [" + serverName + "] marked as dead, waiting for it to " +
215        "finish dead processing");
216    while (sm.areDeadServersInProgress()) {
217      log("Server [" + serverName + "] still being processed, waiting");
218      Thread.sleep(100);
219    }
220    log("Server [" + serverName + "] done with server shutdown processing");
221  }
222
223  private void log(String msg) {
224    LOG.debug("\n\nTRR: " + msg + "\n");
225  }
226
227  private int getNumberOfOnlineRegions(MiniHBaseCluster cluster) {
228    int numFound = 0;
229    for (RegionServerThread rst : cluster.getLiveRegionServerThreads()) {
230      numFound += rst.getRegionServer().getNumberOfOnlineRegions();
231    }
232    for (MasterThread mt : cluster.getMasterThreads()) {
233      numFound += mt.getMaster().getNumberOfOnlineRegions();
234    }
235    return numFound;
236  }
237
238  private void assertRegionsAssigned(MiniHBaseCluster cluster,
239      Set<String> expectedRegions) throws IOException {
240    int numFound = getNumberOfOnlineRegions(cluster);
241    if (expectedRegions.size() > numFound) {
242      log("Expected to find " + expectedRegions.size() + " but only found"
243          + " " + numFound);
244      NavigableSet<String> foundRegions =
245        HBaseTestingUtility.getAllOnlineRegions(cluster);
246      for (String region : expectedRegions) {
247        if (!foundRegions.contains(region)) {
248          log("Missing region: " + region);
249        }
250      }
251      assertEquals(expectedRegions.size(), numFound);
252    } else if (expectedRegions.size() < numFound) {
253      int doubled = numFound - expectedRegions.size();
254      log("Expected to find " + expectedRegions.size() + " but found"
255          + " " + numFound + " (" + doubled + " double assignments?)");
256      NavigableSet<String> doubleRegions = getDoubleAssignedRegions(cluster);
257      for (String region : doubleRegions) {
258        log("Region is double assigned: " + region);
259      }
260      assertEquals(expectedRegions.size(), numFound);
261    } else {
262      log("Success!  Found expected number of " + numFound + " regions");
263    }
264  }
265
266  private NavigableSet<String> getDoubleAssignedRegions(
267      MiniHBaseCluster cluster) throws IOException {
268    NavigableSet<String> online = new TreeSet<>();
269    NavigableSet<String> doubled = new TreeSet<>();
270    for (RegionServerThread rst : cluster.getLiveRegionServerThreads()) {
271      for (RegionInfo region : ProtobufUtil.getOnlineRegions(
272          rst.getRegionServer().getRSRpcServices())) {
273        if(!online.add(region.getRegionNameAsString())) {
274          doubled.add(region.getRegionNameAsString());
275        }
276      }
277    }
278    return doubled;
279  }
280
281
282}
283