001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.master;
019
020import static org.junit.Assert.assertEquals;
021import static org.junit.Assert.assertFalse;
022import static org.junit.Assert.assertTrue;
023
024import java.io.IOException;
025import java.time.Duration;
026import java.util.List;
027import org.apache.hadoop.fs.Path;
028import org.apache.hadoop.hbase.Cell;
029import org.apache.hadoop.hbase.HBaseClassTestRule;
030import org.apache.hadoop.hbase.HBaseTestingUtil;
031import org.apache.hadoop.hbase.HConstants;
032import org.apache.hadoop.hbase.ServerName;
033import org.apache.hadoop.hbase.SingleProcessHBaseCluster;
034import org.apache.hadoop.hbase.TableName;
035import org.apache.hadoop.hbase.Waiter;
036import org.apache.hadoop.hbase.client.Get;
037import org.apache.hadoop.hbase.client.Put;
038import org.apache.hadoop.hbase.client.RegionInfo;
039import org.apache.hadoop.hbase.client.Result;
040import org.apache.hadoop.hbase.client.Table;
041import org.apache.hadoop.hbase.master.region.MasterRegionFactory;
042import org.apache.hadoop.hbase.procedure2.store.wal.WALProcedureStore;
043import org.apache.hadoop.hbase.regionserver.HRegionServer;
044import org.apache.hadoop.hbase.testclassification.LargeTests;
045import org.apache.hadoop.hbase.util.Bytes;
046import org.apache.hadoop.hbase.util.CommonFSUtils;
047import org.apache.hadoop.hbase.zookeeper.ZKUtil;
048import org.junit.Before;
049import org.junit.ClassRule;
050import org.junit.Rule;
051import org.junit.Test;
052import org.junit.experimental.categories.Category;
053import org.junit.rules.TestName;
054
055/**
056 * Test reuse storefiles within data directory when cluster failover with a set of new region
057 * servers with different hostnames with or without WALs and Zookeeper ZNodes, the master and
058 * cluster should fail respectively if there is any situation considered as not supported.
059 */
060@Category({ LargeTests.class })
061public class TestRecreateCluster {
062  @ClassRule
063  public static final HBaseClassTestRule CLASS_RULE =
064    HBaseClassTestRule.forClass(TestRecreateCluster.class);
065
066  @Rule
067  public TestName name = new TestName();
068
069  private static final HBaseTestingUtil TEST_UTIL = new HBaseTestingUtil();
070  private static final int NUM_RS = 3;
071  private static final long TIMEOUT_MS = Duration.ofMinutes(1).toMillis();
072  private static final long MASTER_INIT_TIMEOUT_MS = Duration.ofSeconds(45).toMillis();
073
074  @Before
075  public void setup() {
076    TEST_UTIL.getConfiguration().setLong("hbase.master.init.timeout.localHBaseCluster",
077      MASTER_INIT_TIMEOUT_MS);
078  }
079
080  @Test
081  public void testRecreateCluster_UserTableDisabled_ReuseWALsAndZNodes() throws Exception {
082    validateRecreateClusterWithUserDisabled(false, false);
083  }
084
085  @Test
086  public void testRecreateCluster_UserTableEnabled_ReuseWALsAndZNodes() throws Exception {
087    validateRecreateClusterWithUserTableEnabled(false, false);
088  }
089
090  @Test
091  public void testRecreateCluster_UserTableEnabled_CleanupZNodes() throws Exception {
092    // this is no longer failing and is a different behavior compared to branch-2
093    validateRecreateClusterWithUserTableEnabled(false, true);
094  }
095
096  @Test(expected = IOException.class)
097  public void testRecreateCluster_UserTableEnabled_CleanupWALAndZNodes() throws Exception {
098    // master fails with InitMetaProcedure because it cannot delete existing meta table directory,
099    // region server cannot join and time-out the cluster starts.
100    validateRecreateClusterWithUserTableEnabled(true, true);
101  }
102
103  private void validateRecreateClusterWithUserDisabled(boolean cleanupWALs, boolean cleanUpZNodes)
104    throws Exception {
105    TEST_UTIL.startMiniCluster(NUM_RS);
106    try {
107      TableName tableName = TableName.valueOf("t1");
108      prepareDataBeforeRecreate(TEST_UTIL, tableName);
109      TEST_UTIL.getAdmin().disableTable(tableName);
110      TEST_UTIL.waitTableDisabled(tableName.getName());
111      restartHBaseCluster(cleanupWALs, cleanUpZNodes);
112      TEST_UTIL.getAdmin().enableTable(tableName);
113      validateDataAfterRecreate(TEST_UTIL, tableName);
114    } finally {
115      TEST_UTIL.shutdownMiniCluster();
116    }
117  }
118
119  private void validateRecreateClusterWithUserTableEnabled(boolean cleanupWALs,
120    boolean cleanUpZNodes) throws Exception {
121    TEST_UTIL.startMiniCluster(NUM_RS);
122    try {
123      TableName tableName = TableName.valueOf("t1");
124      prepareDataBeforeRecreate(TEST_UTIL, tableName);
125      restartHBaseCluster(cleanupWALs, cleanUpZNodes);
126      validateDataAfterRecreate(TEST_UTIL, tableName);
127    } finally {
128      TEST_UTIL.shutdownMiniCluster();
129    }
130  }
131
132  private void restartHBaseCluster(boolean cleanUpWALs, boolean cleanUpZnodes) throws Exception {
133    // flush cache so that everything is on disk
134    TEST_UTIL.getMiniHBaseCluster().flushcache(TableName.META_TABLE_NAME);
135    TEST_UTIL.getMiniHBaseCluster().flushcache();
136
137    List<ServerName> oldServers =
138      TEST_UTIL.getHBaseCluster().getMaster().getServerManager().getOnlineServersList();
139
140    // make sure there is no procedures pending
141    TEST_UTIL.waitFor(TIMEOUT_MS, () -> TEST_UTIL.getHBaseCluster().getMaster().getProcedures()
142      .stream().filter(p -> p.isFinished()).findAny().isPresent());
143
144    // shutdown and delete data if needed
145    Path walRootDirPath = TEST_UTIL.getMiniHBaseCluster().getMaster().getWALRootDir();
146    Path rootDirPath = CommonFSUtils.getRootDir(TEST_UTIL.getConfiguration());
147    TEST_UTIL.shutdownMiniHBaseCluster();
148
149    if (cleanUpWALs) {
150      TEST_UTIL.getDFSCluster().getFileSystem()
151        .delete(new Path(rootDirPath, MasterRegionFactory.MASTER_STORE_DIR), true);
152      TEST_UTIL.getDFSCluster().getFileSystem()
153        .delete(new Path(walRootDirPath, MasterRegionFactory.MASTER_STORE_DIR), true);
154      TEST_UTIL.getDFSCluster().getFileSystem()
155        .delete(new Path(walRootDirPath, WALProcedureStore.MASTER_PROCEDURE_LOGDIR), true);
156
157      TEST_UTIL.getDFSCluster().getFileSystem()
158        .delete(new Path(walRootDirPath, HConstants.HREGION_LOGDIR_NAME), true);
159      TEST_UTIL.getDFSCluster().getFileSystem()
160        .delete(new Path(walRootDirPath, HConstants.HREGION_OLDLOGDIR_NAME), true);
161    }
162
163    if (cleanUpZnodes) {
164      // delete all zk data
165      // we cannot keep ZK data because it will hold the meta region states as open and
166      // didn't submit a InitMetaProcedure
167      ZKUtil.deleteChildrenRecursively(TEST_UTIL.getZooKeeperWatcher(),
168        TEST_UTIL.getZooKeeperWatcher().getZNodePaths().baseZNode);
169      TEST_UTIL.shutdownMiniZKCluster();
170      TEST_UTIL.startMiniZKCluster();
171    }
172
173    TEST_UTIL.restartHBaseCluster(NUM_RS);
174    TEST_UTIL.waitFor(TIMEOUT_MS, new Waiter.Predicate<Exception>() {
175      @Override
176      public boolean evaluate() throws Exception {
177        return TEST_UTIL.getMiniHBaseCluster().getNumLiveRegionServers() == NUM_RS;
178      }
179    });
180
181    // make sure we have a new set of region servers with different hostnames and ports
182    List<ServerName> newServers =
183      TEST_UTIL.getHBaseCluster().getMaster().getServerManager().getOnlineServersList();
184    assertFalse(newServers.stream().filter((newServer) -> oldServers.contains(newServer)).findAny()
185      .isPresent());
186  }
187
188  private void prepareDataBeforeRecreate(HBaseTestingUtil testUtil, TableName tableName)
189    throws Exception {
190    Table table = testUtil.createTable(tableName, "f");
191    Put put = new Put(Bytes.toBytes("r1"));
192    put.addColumn(Bytes.toBytes("f"), Bytes.toBytes("c"), Bytes.toBytes("v"));
193    table.put(put);
194
195    ensureTableNotColocatedWithSystemTable(tableName, TableName.META_TABLE_NAME);
196  }
197
198  private void ensureTableNotColocatedWithSystemTable(TableName userTable, TableName systemTable)
199    throws IOException, InterruptedException {
200    SingleProcessHBaseCluster hbaseCluster = TEST_UTIL.getHBaseCluster();
201    assertTrue("Please start more than 1 regionserver",
202      hbaseCluster.getRegionServerThreads().size() > 1);
203
204    int userTableServerNum = getServerNumForTableWithOnlyOneRegion(userTable);
205    int systemTableServerNum = getServerNumForTableWithOnlyOneRegion(systemTable);
206
207    if (userTableServerNum != systemTableServerNum) {
208      // no-ops if user table and system are already on a different host
209      return;
210    }
211
212    int destServerNum = (systemTableServerNum + 1) % NUM_RS;
213    assertTrue(systemTableServerNum != destServerNum);
214
215    HRegionServer systemTableServer = hbaseCluster.getRegionServer(systemTableServerNum);
216    HRegionServer destServer = hbaseCluster.getRegionServer(destServerNum);
217    assertTrue(!systemTableServer.equals(destServer));
218    // make sure the dest server is live before moving region
219    hbaseCluster.waitForRegionServerToStart(destServer.getServerName().getHostname(),
220      destServer.getServerName().getPort(), TIMEOUT_MS);
221    // move region of userTable to a different regionserver not co-located with system table
222    TEST_UTIL.moveRegionAndWait(TEST_UTIL.getAdmin().getRegions(userTable).get(0),
223      destServer.getServerName());
224  }
225
226  private int getServerNumForTableWithOnlyOneRegion(TableName tableName) throws IOException {
227    List<RegionInfo> tableRegionInfos = TEST_UTIL.getAdmin().getRegions(tableName);
228    assertEquals(1, tableRegionInfos.size());
229    return TEST_UTIL.getHBaseCluster().getServerWith(tableRegionInfos.get(0).getRegionName());
230  }
231
232  private void validateDataAfterRecreate(HBaseTestingUtil testUtil, TableName tableName)
233    throws Exception {
234    Table t1 = testUtil.getConnection().getTable(tableName);
235    Get get = new Get(Bytes.toBytes("r1"));
236    get.addColumn(Bytes.toBytes("f"), Bytes.toBytes("c"));
237    Result result = t1.get(get);
238    assertTrue(result.advance());
239    Cell cell = result.current();
240    assertEquals("v",
241      Bytes.toString(cell.getValueArray(), cell.getValueOffset(), cell.getValueLength()));
242    assertFalse(result.advance());
243  }
244
245}