001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.master.procedure; 019 020import static org.junit.Assert.assertEquals; 021 022import org.apache.hadoop.conf.Configuration; 023import org.apache.hadoop.fs.Path; 024import org.apache.hadoop.hbase.HBaseClassTestRule; 025import org.apache.hadoop.hbase.HBaseTestingUtility; 026import org.apache.hadoop.hbase.StartMiniClusterOption; 027import org.apache.hadoop.hbase.TableName; 028import org.apache.hadoop.hbase.client.RegionInfo; 029import org.apache.hadoop.hbase.client.TableDescriptor; 030import org.apache.hadoop.hbase.procedure2.Procedure; 031import org.apache.hadoop.hbase.procedure2.ProcedureExecutor; 032import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility; 033import org.apache.hadoop.hbase.procedure2.store.wal.WALProcedureStore; 034import org.apache.hadoop.hbase.testclassification.LargeTests; 035import org.apache.hadoop.hbase.testclassification.MasterTests; 036import org.apache.hadoop.hbase.util.Bytes; 037import org.apache.hadoop.hbase.util.CommonFSUtils; 038import org.apache.hadoop.hbase.util.ModifyRegionUtils; 039import org.junit.After; 040import org.junit.Before; 041import org.junit.ClassRule; 042import org.junit.Test; 043import org.junit.experimental.categories.Category; 044import org.slf4j.Logger; 045import org.slf4j.LoggerFactory; 046 047import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.CreateTableState; 048import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.DeleteTableState; 049import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.DisableTableState; 050import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.EnableTableState; 051import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.TruncateTableState; 052 053@Category({MasterTests.class, LargeTests.class}) 054public class TestMasterFailoverWithProcedures { 055 056 @ClassRule 057 public static final HBaseClassTestRule CLASS_RULE = 058 HBaseClassTestRule.forClass(TestMasterFailoverWithProcedures.class); 059 060 private static final Logger LOG = LoggerFactory.getLogger(TestMasterFailoverWithProcedures.class); 061 062 protected static final HBaseTestingUtility UTIL = new HBaseTestingUtility(); 063 064 private static void setupConf(Configuration conf) { 065 // don't waste time retrying with the roll, the test is already slow enough. 066 conf.setInt(WALProcedureStore.MAX_RETRIES_BEFORE_ROLL_CONF_KEY, 1); 067 conf.setInt(WALProcedureStore.WAIT_BEFORE_ROLL_CONF_KEY, 0); 068 conf.setInt(WALProcedureStore.ROLL_RETRIES_CONF_KEY, 1); 069 conf.setInt(WALProcedureStore.MAX_SYNC_FAILURE_ROLL_CONF_KEY, 1); 070 conf.setInt(MasterProcedureConstants.MASTER_PROCEDURE_THREADS, 1); 071 } 072 073 @Before 074 public void setup() throws Exception { 075 setupConf(UTIL.getConfiguration()); 076 // Set master number and use default values for other options. 077 StartMiniClusterOption option = StartMiniClusterOption.builder().numMasters(2).build(); 078 UTIL.startMiniCluster(option); 079 080 final ProcedureExecutor<MasterProcedureEnv> procExec = getMasterProcedureExecutor(); 081 ProcedureTestingUtility.setToggleKillBeforeStoreUpdate(procExec, false); 082 ProcedureTestingUtility.setKillBeforeStoreUpdate(procExec, false); 083 } 084 085 @After 086 public void tearDown() throws Exception { 087 try { 088 UTIL.shutdownMiniCluster(); 089 } catch (Exception e) { 090 LOG.warn("failure shutting down cluster", e); 091 } 092 } 093 094 // ========================================================================== 095 // Test Create Table 096 // ========================================================================== 097 @Test 098 public void testCreateWithFailover() throws Exception { 099 // TODO: Should we try every step? (master failover takes long time) 100 // It is already covered by TestCreateTableProcedure 101 // but without the master restart, only the executor/store is restarted. 102 // Without Master restart we may not find bug in the procedure code 103 // like missing "wait" for resources to be available (e.g. RS) 104 testCreateWithFailoverAtStep(CreateTableState.CREATE_TABLE_ASSIGN_REGIONS.ordinal()); 105 } 106 107 private void testCreateWithFailoverAtStep(final int step) throws Exception { 108 final TableName tableName = TableName.valueOf("testCreateWithFailoverAtStep" + step); 109 110 // create the table 111 ProcedureExecutor<MasterProcedureEnv> procExec = getMasterProcedureExecutor(); 112 ProcedureTestingUtility.setKillBeforeStoreUpdate(procExec, true); 113 ProcedureTestingUtility.setToggleKillBeforeStoreUpdate(procExec, true); 114 115 // Start the Create procedure && kill the executor 116 byte[][] splitKeys = null; 117 TableDescriptor htd = MasterProcedureTestingUtility.createHTD(tableName, "f1", "f2"); 118 RegionInfo[] regions = ModifyRegionUtils.createRegionInfos(htd, splitKeys); 119 long procId = procExec.submitProcedure( 120 new CreateTableProcedure(procExec.getEnvironment(), htd, regions)); 121 testRecoveryAndDoubleExecution(UTIL, procId, step); 122 123 MasterProcedureTestingUtility.validateTableCreation( 124 UTIL.getHBaseCluster().getMaster(), tableName, regions, "f1", "f2"); 125 } 126 127 // ========================================================================== 128 // Test Delete Table 129 // ========================================================================== 130 @Test 131 public void testDeleteWithFailover() throws Exception { 132 // TODO: Should we try every step? (master failover takes long time) 133 // It is already covered by TestDeleteTableProcedure 134 // but without the master restart, only the executor/store is restarted. 135 // Without Master restart we may not find bug in the procedure code 136 // like missing "wait" for resources to be available (e.g. RS) 137 testDeleteWithFailoverAtStep(DeleteTableState.DELETE_TABLE_UNASSIGN_REGIONS.ordinal()); 138 } 139 140 private void testDeleteWithFailoverAtStep(final int step) throws Exception { 141 final TableName tableName = TableName.valueOf("testDeleteWithFailoverAtStep" + step); 142 143 // create the table 144 byte[][] splitKeys = null; 145 RegionInfo[] regions = MasterProcedureTestingUtility.createTable( 146 getMasterProcedureExecutor(), tableName, splitKeys, "f1", "f2"); 147 Path tableDir = CommonFSUtils.getTableDir(getRootDir(), tableName); 148 MasterProcedureTestingUtility.validateTableCreation( 149 UTIL.getHBaseCluster().getMaster(), tableName, regions, "f1", "f2"); 150 UTIL.getAdmin().disableTable(tableName); 151 152 ProcedureExecutor<MasterProcedureEnv> procExec = getMasterProcedureExecutor(); 153 ProcedureTestingUtility.setKillBeforeStoreUpdate(procExec, true); 154 ProcedureTestingUtility.setToggleKillBeforeStoreUpdate(procExec, true); 155 156 // Start the Delete procedure && kill the executor 157 long procId = procExec.submitProcedure( 158 new DeleteTableProcedure(procExec.getEnvironment(), tableName)); 159 testRecoveryAndDoubleExecution(UTIL, procId, step); 160 161 MasterProcedureTestingUtility.validateTableDeletion( 162 UTIL.getHBaseCluster().getMaster(), tableName); 163 } 164 165 // ========================================================================== 166 // Test Truncate Table 167 // ========================================================================== 168 @Test 169 public void testTruncateWithFailover() throws Exception { 170 // TODO: Should we try every step? (master failover takes long time) 171 // It is already covered by TestTruncateTableProcedure 172 // but without the master restart, only the executor/store is restarted. 173 // Without Master restart we may not find bug in the procedure code 174 // like missing "wait" for resources to be available (e.g. RS) 175 testTruncateWithFailoverAtStep(true, TruncateTableState.TRUNCATE_TABLE_ADD_TO_META.ordinal()); 176 } 177 178 private void testTruncateWithFailoverAtStep(final boolean preserveSplits, final int step) 179 throws Exception { 180 final TableName tableName = TableName.valueOf("testTruncateWithFailoverAtStep" + step); 181 182 // create the table 183 final String[] families = new String[] { "f1", "f2" }; 184 final byte[][] splitKeys = new byte[][] { 185 Bytes.toBytes("a"), Bytes.toBytes("b"), Bytes.toBytes("c") 186 }; 187 RegionInfo[] regions = MasterProcedureTestingUtility.createTable( 188 getMasterProcedureExecutor(), tableName, splitKeys, families); 189 // load and verify that there are rows in the table 190 MasterProcedureTestingUtility.loadData( 191 UTIL.getConnection(), tableName, 100, splitKeys, families); 192 assertEquals(100, UTIL.countRows(tableName)); 193 // disable the table 194 UTIL.getAdmin().disableTable(tableName); 195 196 ProcedureExecutor<MasterProcedureEnv> procExec = getMasterProcedureExecutor(); 197 ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate(procExec, true); 198 199 // Start the Truncate procedure && kill the executor 200 long procId = procExec.submitProcedure( 201 new TruncateTableProcedure(procExec.getEnvironment(), tableName, preserveSplits)); 202 testRecoveryAndDoubleExecution(UTIL, procId, step); 203 204 ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate(procExec, false); 205 UTIL.waitUntilAllRegionsAssigned(tableName); 206 207 // validate the table regions and layout 208 regions = UTIL.getAdmin().getTableRegions(tableName).toArray(new RegionInfo[0]); 209 if (preserveSplits) { 210 assertEquals(1 + splitKeys.length, regions.length); 211 } else { 212 assertEquals(1, regions.length); 213 } 214 MasterProcedureTestingUtility.validateTableCreation( 215 UTIL.getHBaseCluster().getMaster(), tableName, regions, families); 216 217 // verify that there are no rows in the table 218 assertEquals(0, UTIL.countRows(tableName)); 219 220 // verify that the table is read/writable 221 MasterProcedureTestingUtility.loadData( 222 UTIL.getConnection(), tableName, 50, splitKeys, families); 223 assertEquals(50, UTIL.countRows(tableName)); 224 } 225 226 // ========================================================================== 227 // Test Disable Table 228 // ========================================================================== 229 @Test 230 public void testDisableTableWithFailover() throws Exception { 231 // TODO: Should we try every step? (master failover takes long time) 232 // It is already covered by TestDisableTableProcedure 233 // but without the master restart, only the executor/store is restarted. 234 // Without Master restart we may not find bug in the procedure code 235 // like missing "wait" for resources to be available (e.g. RS) 236 testDisableTableWithFailoverAtStep( 237 DisableTableState.DISABLE_TABLE_MARK_REGIONS_OFFLINE.ordinal()); 238 } 239 240 private void testDisableTableWithFailoverAtStep(final int step) throws Exception { 241 final TableName tableName = TableName.valueOf("testDisableTableWithFailoverAtStep" + step); 242 243 // create the table 244 final byte[][] splitKeys = new byte[][] { 245 Bytes.toBytes("a"), Bytes.toBytes("b"), Bytes.toBytes("c") 246 }; 247 MasterProcedureTestingUtility.createTable( 248 getMasterProcedureExecutor(), tableName, splitKeys, "f1", "f2"); 249 250 ProcedureExecutor<MasterProcedureEnv> procExec = getMasterProcedureExecutor(); 251 ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate(procExec, true); 252 253 // Start the Delete procedure && kill the executor 254 long procId = procExec.submitProcedure( 255 new DisableTableProcedure(procExec.getEnvironment(), tableName, false)); 256 testRecoveryAndDoubleExecution(UTIL, procId, step); 257 258 MasterProcedureTestingUtility.validateTableIsDisabled( 259 UTIL.getHBaseCluster().getMaster(), tableName); 260 } 261 262 // ========================================================================== 263 // Test Enable Table 264 // ========================================================================== 265 @Test 266 public void testEnableTableWithFailover() throws Exception { 267 // TODO: Should we try every step? (master failover takes long time) 268 // It is already covered by TestEnableTableProcedure 269 // but without the master restart, only the executor/store is restarted. 270 // Without Master restart we may not find bug in the procedure code 271 // like missing "wait" for resources to be available (e.g. RS) 272 testEnableTableWithFailoverAtStep( 273 EnableTableState.ENABLE_TABLE_MARK_REGIONS_ONLINE.ordinal()); 274 } 275 276 private void testEnableTableWithFailoverAtStep(final int step) throws Exception { 277 final TableName tableName = TableName.valueOf("testEnableTableWithFailoverAtStep" + step); 278 279 // create the table 280 final byte[][] splitKeys = new byte[][] { 281 Bytes.toBytes("a"), Bytes.toBytes("b"), Bytes.toBytes("c") 282 }; 283 MasterProcedureTestingUtility.createTable( 284 getMasterProcedureExecutor(), tableName, splitKeys, "f1", "f2"); 285 UTIL.getAdmin().disableTable(tableName); 286 287 ProcedureExecutor<MasterProcedureEnv> procExec = getMasterProcedureExecutor(); 288 ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate(procExec, true); 289 290 // Start the Delete procedure && kill the executor 291 long procId = procExec.submitProcedure( 292 new EnableTableProcedure(procExec.getEnvironment(), tableName)); 293 testRecoveryAndDoubleExecution(UTIL, procId, step); 294 295 MasterProcedureTestingUtility.validateTableIsEnabled( 296 UTIL.getHBaseCluster().getMaster(), tableName); 297 } 298 299 // ========================================================================== 300 // Test Helpers 301 // ========================================================================== 302 public static void testRecoveryAndDoubleExecution(final HBaseTestingUtility testUtil, 303 final long procId, final int lastStepBeforeFailover) throws Exception { 304 ProcedureExecutor<MasterProcedureEnv> procExec = 305 testUtil.getHBaseCluster().getMaster().getMasterProcedureExecutor(); 306 ProcedureTestingUtility.waitProcedure(procExec, procId); 307 308 final Procedure proc = procExec.getProcedure(procId); 309 for (int i = 0; i < lastStepBeforeFailover; ++i) { 310 LOG.info("Restart "+ i +" exec state: " + proc); 311 ProcedureTestingUtility.assertProcNotYetCompleted(procExec, procId); 312 MasterProcedureTestingUtility.restartMasterProcedureExecutor(procExec); 313 ProcedureTestingUtility.waitProcedure(procExec, procId); 314 } 315 ProcedureTestingUtility.assertProcNotYetCompleted(procExec, procId); 316 317 LOG.info("Trigger master failover"); 318 MasterProcedureTestingUtility.masterFailover(testUtil); 319 320 procExec = testUtil.getHBaseCluster().getMaster().getMasterProcedureExecutor(); 321 ProcedureTestingUtility.waitProcedure(procExec, procId); 322 ProcedureTestingUtility.assertProcNotFailed(procExec, procId); 323 } 324 325 // ========================================================================== 326 // Helpers 327 // ========================================================================== 328 private ProcedureExecutor<MasterProcedureEnv> getMasterProcedureExecutor() { 329 return UTIL.getHBaseCluster().getMaster().getMasterProcedureExecutor(); 330 } 331 332 private Path getRootDir() { 333 return UTIL.getHBaseCluster().getMaster().getMasterFileSystem().getRootDir(); 334 } 335}