/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.backup.master;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.backup.BackupInfo;
import org.apache.hadoop.hbase.backup.BackupType;
import org.apache.hadoop.hbase.backup.TestBackupBase;
import org.apache.hadoop.hbase.backup.impl.BackupSystemTable;
import org.apache.hadoop.hbase.backup.util.BackupBoundaries;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.master.HMaster;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.JVMClusterUtil;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Tests for {@link BackupLogCleaner}: verifies which WAL files the cleaner reports as deletable
 * given various combinations of full/incremental backups across multiple backup roots, and that
 * WALs from region servers that joined after a backup are never deleted.
 */
@Tag(LargeTests.TAG)
public class TestBackupLogCleaner extends TestBackupBase {

  private static final Logger LOG = LoggerFactory.getLogger(TestBackupLogCleaner.class);

  // implements all test cases in 1 test since incremental full backup/
  // incremental backup has dependencies

  @BeforeAll
  public static void before() {
    // Disable the timestamp safety buffer so deletion boundaries are exact in assertions below.
    TEST_UTIL.getConfiguration().setLong(BackupLogCleaner.TS_BUFFER_KEY, 0);
  }

  @Test
  public void testBackupLogCleaner() throws Exception {
    Path backupRoot1 = new Path(BACKUP_ROOT_DIR, "root1");
    Path backupRoot2 = new Path(BACKUP_ROOT_DIR, "root2");

    List<TableName> tableSetFull = List.of(table1, table2, table3, table4);
    List<TableName> tableSet14 = List.of(table1, table4);
    List<TableName> tableSet23 = List.of(table2, table3);

    try (BackupSystemTable systemTable = new BackupSystemTable(TEST_UTIL.getConnection())) {
      // Verify that we have no backup sessions yet
      assertFalse(systemTable.hasBackupSessions());

      BackupLogCleaner cleaner = new BackupLogCleaner();
      cleaner.setConf(TEST_UTIL.getConfiguration());
      cleaner.init(Map.of(HMaster.MASTER, TEST_UTIL.getHBaseCluster().getMaster()));

      // All WAL files can be deleted because we do not have backups
      List<FileStatus> walFilesBeforeBackup = getListOfWALFiles(TEST_UTIL.getConfiguration());
      Iterable<FileStatus> deletable = cleaner.getDeletableFiles(walFilesBeforeBackup);
      assertEquals(walFilesBeforeBackup, deletable);

      // Create a FULL backup B1 in backupRoot R1, containing all tables
      String backupIdB1 = backupTables(BackupType.FULL, tableSetFull, backupRoot1.toString());
      assertTrue(checkSucceeded(backupIdB1));

      // As part of a backup, WALs are rolled, so we expect a new WAL file
      Set<FileStatus> walFilesAfterB1 =
        mergeAsSet(walFilesBeforeBackup, getListOfWALFiles(TEST_UTIL.getConfiguration()));
      assertTrue(walFilesBeforeBackup.size() < walFilesAfterB1.size());

      // Currently, we only have backup B1, so we can delete any WAL preceding B1
      deletable = cleaner.getDeletableFiles(walFilesAfterB1);
      assertEquals(toSet(walFilesBeforeBackup), toSet(deletable));

      // Insert some data
      Connection conn = TEST_UTIL.getConnection();
      try (Table t1 = conn.getTable(table1)) {
        Put p1;
        for (int i = 0; i < NB_ROWS_IN_BATCH; i++) {
          p1 = new Put(Bytes.toBytes("row-t1" + i));
          p1.addColumn(famName, qualName, Bytes.toBytes("val" + i));
          t1.put(p1);
        }
      }

      try (Table t2 = conn.getTable(table2)) {
        Put p2;
        for (int i = 0; i < 5; i++) {
          p2 = new Put(Bytes.toBytes("row-t2" + i));
          p2.addColumn(famName, qualName, Bytes.toBytes("val" + i));
          t2.put(p2);
        }
      }

      // Create an INCREMENTAL backup B2 in backupRoot R1, requesting tables 1 & 4.
      // Note that incremental tables always include all tables already included in the backup root,
      // i.e. the backup will contain all tables (1, 2, 3, 4), ignoring what we specify here.
      LOG.debug("Creating B2");
      String backupIdB2 = backupTables(BackupType.INCREMENTAL, tableSet14, backupRoot1.toString());
      assertTrue(checkSucceeded(backupIdB2));

      // As part of a backup, WALs are rolled, so we expect a new WAL file
      Set<FileStatus> walFilesAfterB2 =
        mergeAsSet(walFilesAfterB1, getListOfWALFiles(TEST_UTIL.getConfiguration()));
      assertTrue(walFilesAfterB1.size() < walFilesAfterB2.size());

      // At this point, we have backups in root R1: B1 and B2.
      // We only consider the most recent backup (B2) to determine which WALs can be deleted:
      // all WALs preceding B2
      deletable = cleaner.getDeletableFiles(walFilesAfterB2);
      assertEquals(toSet(walFilesAfterB1), toSet(deletable));

      // Create a FULL backup B3 in backupRoot R2.
      // NOTE(review): the original comment said B3 contains tables 1 & 4, but tableSetFull is
      // passed here, so B3 actually covers all tables. The assertions below hold either way
      // (the deletion boundary is unchanged); confirm whether tableSet14 was intended.
      LOG.debug("Creating B3");
      String backupIdB3 = backupTables(BackupType.FULL, tableSetFull, backupRoot2.toString());
      assertTrue(checkSucceeded(backupIdB3));

      // As part of a backup, WALs are rolled, so we expect a new WAL file
      Set<FileStatus> walFilesAfterB3 =
        mergeAsSet(walFilesAfterB2, getListOfWALFiles(TEST_UTIL.getConfiguration()));
      assertTrue(walFilesAfterB2.size() < walFilesAfterB3.size());

      // At this point, we have backups in:
      // root R1: B1 (timestamp=0, all tables), B2 (TS=1, all tables)
      // root R2: B3 (TS=2, [T1, T4] — see NOTE above, code passes all tables)
      //
      // To determine the WAL-deletion boundary, we only consider the most recent backup per root,
      // so [B2, B3]. From these, we take the least recent as WAL-deletion boundary: B2, it contains
      // all tables, so acts as the deletion boundary. I.e. only WALs preceding B2 are deletable.
      deletable = cleaner.getDeletableFiles(walFilesAfterB3);
      assertEquals(toSet(walFilesAfterB1), toSet(deletable));

      // Create a FULL backup B4 in backupRoot R1, with a subset of tables
      LOG.debug("Creating B4");
      String backupIdB4 = backupTables(BackupType.FULL, tableSet14, backupRoot1.toString());
      assertTrue(checkSucceeded(backupIdB4));

      // As part of a backup, WALs are rolled, so we expect a new WAL file
      Set<FileStatus> walFilesAfterB4 =
        mergeAsSet(walFilesAfterB3, getListOfWALFiles(TEST_UTIL.getConfiguration()));
      assertTrue(walFilesAfterB3.size() < walFilesAfterB4.size());

      // At this point, we have backups in:
      // root R1: B1 (timestamp=0, all tables), B2 (TS=1, all tables), B4 (TS=3, [T1, T4])
      // root R2: B3 (TS=2, [T1, T4])
      //
      // To determine the WAL-deletion boundary, we only consider the most recent backup per root,
      // so [B4, B3]. They contain the following timestamp boundaries per table:
      // B4: { T1: 3, T2: 1, T3: 1, T4: 3 }
      // B3: { T1: 2, T4: 2 }
      // Taking the minimum timestamp (= 1), this means all WALs preceding B2 can be deleted.
      deletable = cleaner.getDeletableFiles(walFilesAfterB4);
      assertEquals(toSet(walFilesAfterB1), toSet(deletable));

      // Create a FULL backup B5 in backupRoot R1, for tables 2 & 3
      String backupIdB5 = backupTables(BackupType.FULL, tableSet23, backupRoot1.toString());
      assertTrue(checkSucceeded(backupIdB5));

      // As part of a backup, WALs are rolled, so we expect a new WAL file
      Set<FileStatus> walFilesAfterB5 =
        mergeAsSet(walFilesAfterB4, getListOfWALFiles(TEST_UTIL.getConfiguration()));
      assertTrue(walFilesAfterB4.size() < walFilesAfterB5.size());

      // At this point, we have backups in:
      // root R1: ..., B2 (TS=1, all tables), B4 (TS=3, [T1, T4]), B5 (TS=4, [T2, T3])
      // root R2: B3 (TS=2, [T1, T4])
      //
      // To determine the WAL-deletion boundary, we only consider the most recent backup per root,
      // so [B5, B3]. They contain the following timestamp boundaries per table:
      // B5: { T1: 3, T2: 4, T3: 4, T4: 3 }  (was mislabeled "B4" — this is R1's boundary after B5)
      // B3: { T1: 2, T4: 2 }
      // Taking the minimum timestamp (= 2), this means all WALs preceding B3 can be deleted.
      deletable = cleaner.getDeletableFiles(walFilesAfterB5);
      assertEquals(toSet(walFilesAfterB2), toSet(deletable));
    } finally {
      TEST_UTIL.truncateTable(BackupSystemTable.getTableName(TEST_UTIL.getConfiguration())).close();
    }
  }

  /**
   * A region server that joins the cluster after a backup has no timestamp entry in that backup's
   * boundary map; its WALs must be preserved, not treated as deletable.
   */
  @Test
  public void testDoesNotDeleteWALsFromNewServers() throws Exception {
    Path backupRoot1 = new Path(BACKUP_ROOT_DIR, "backup1");
    List<TableName> tableSetFull = List.of(table1, table2, table3, table4);

    JVMClusterUtil.RegionServerThread rsThread = null;
    try (BackupSystemTable systemTable = new BackupSystemTable(TEST_UTIL.getConnection())) {
      LOG.info("Creating initial backup B1");
      String backupIdB1 = backupTables(BackupType.FULL, tableSetFull, backupRoot1.toString());
      assertTrue(checkSucceeded(backupIdB1));

      List<FileStatus> walsAfterB1 = getListOfWALFiles(TEST_UTIL.getConfiguration());
      LOG.info("WALs after B1: {}", walsAfterB1.size());

      // Add a new RegionServer to the cluster
      LOG.info("Adding new RegionServer to cluster");
      rsThread = TEST_UTIL.getMiniHBaseCluster().startRegionServer();
      ServerName newServerName = rsThread.getRegionServer().getServerName();
      LOG.info("New RegionServer started: {}", newServerName);

      // Move a region to the new server to ensure it creates a WAL
      List<RegionInfo> regions = TEST_UTIL.getAdmin().getRegions(table1);
      RegionInfo regionToMove = regions.get(0);

      LOG.info("Moving region {} to new server {}", regionToMove.getEncodedName(), newServerName);
      TEST_UTIL.getAdmin().move(regionToMove.getEncodedNameAsBytes(), newServerName);

      // Wait until the move is visible through the region locator (swallow transient IO errors).
      TEST_UTIL.waitFor(30000, () -> {
        try {
          HRegionLocation location = TEST_UTIL.getConnection().getRegionLocator(table1)
            .getRegionLocation(regionToMove.getStartKey());
          return location.getServerName().equals(newServerName);
        } catch (IOException e) {
          return false;
        }
      });

      // Write some data to trigger WAL creation on the new server
      try (Table t1 = TEST_UTIL.getConnection().getTable(table1)) {
        for (int i = 0; i < 100; i++) {
          Put p = new Put(Bytes.toBytes("newserver-row-" + i));
          p.addColumn(famName, qualName, Bytes.toBytes("val" + i));
          t1.put(p);
        }
      }
      TEST_UTIL.getAdmin().flushRegion(regionToMove.getEncodedNameAsBytes());

      List<FileStatus> walsAfterNewServer = getListOfWALFiles(TEST_UTIL.getConfiguration());
      LOG.info("WALs after adding new server: {}", walsAfterNewServer.size());
      assertTrue(walsAfterNewServer.size() > walsAfterB1.size(),
        "Should have more WALs after new server");

      List<FileStatus> newServerWALs = new ArrayList<>(walsAfterNewServer);
      newServerWALs.removeAll(walsAfterB1);
      assertFalse(newServerWALs.isEmpty(), "Should have WALs from new server");

      BackupLogCleaner cleaner = new BackupLogCleaner();
      cleaner.setConf(TEST_UTIL.getConfiguration());
      cleaner.init(Map.of(HMaster.MASTER, TEST_UTIL.getHBaseCluster().getMaster()));

      // None of the new server's WALs may appear in the deletable set.
      Set<FileStatus> deletable = toSet(cleaner.getDeletableFiles(walsAfterNewServer));
      for (FileStatus newWAL : newServerWALs) {
        assertFalse(deletable.contains(newWAL),
          "WAL from new server should NOT be deletable: " + newWAL.getPath());
      }
    } finally {
      TEST_UTIL.truncateTable(BackupSystemTable.getTableName(TEST_UTIL.getConfiguration())).close();
      // Clean up the RegionServer we added
      if (rsThread != null) {
        LOG.info("Stopping the RegionServer added for test");
        TEST_UTIL.getMiniHBaseCluster()
          .stopRegionServer(rsThread.getRegionServer().getServerName());
        TEST_UTIL.getMiniHBaseCluster()
          .waitForRegionServerToStop(rsThread.getRegionServer().getServerName(), 30000);
      }
    }
  }

  /**
   * Unit-level check of {@link BackupLogCleaner#canDeleteFile} against a single completed backup:
   * WALs at or before the per-server boundary are deletable; newer WALs and WALs from servers
   * unknown to the backup are not.
   */
  @Test
  public void testCanDeleteFileWithNewServerWALs() {
    BackupInfo backup = new BackupInfo();
    backup.setState(BackupInfo.BackupState.COMPLETE);
    backup.setTableSetTimestampMap(
      Map.of(TableName.valueOf("table1"), Map.of("server1:60020", 1000000L)));
    BackupBoundaries boundaries =
      BackupLogCleaner.calculatePreservationBoundary(List.of(backup), 0L);

    // Old WAL from before the backup
    Path oldWAL = new Path("/hbase/oldWALs/server1%2C60020%2C12345.500000");
    assertTrue(BackupLogCleaner.canDeleteFile(boundaries, oldWAL),
      "WAL older than backup should be deletable");

    // WAL from exactly at the backup boundary
    Path boundaryWAL = new Path("/hbase/oldWALs/server1%2C60020%2C12345.1000000");
    assertTrue(BackupLogCleaner.canDeleteFile(boundaries, boundaryWAL),
      "WAL at boundary should be deletable");

    // WAL created after the backup boundary
    Path newWal = new Path("/hbase/oldWALs/server1%2C60020%2C12345.1500000");
    assertFalse(BackupLogCleaner.canDeleteFile(boundaries, newWal),
      "WAL newer than backup should not be deletable");

    // WAL from a new server that joined AFTER the backup
    Path newServerWAL = new Path("/hbase/oldWALs/newserver%2C60020%2C99999.1500000");
    assertFalse(BackupLogCleaner.canDeleteFile(boundaries, newServerWAL),
      "WAL from new server (after backup) should NOT be deletable");
  }

  /**
   * A still-running first backup protects WALs from its start timestamp (minus the buffer) on;
   * once a completed backup exists in the same root, only that backup's per-server boundary
   * matters and the buffer no longer applies.
   */
  @Test
  public void testFirstBackupProtectsFiles() {
    BackupInfo backup = new BackupInfo();
    backup.setBackupId("backup_1");
    backup.setState(BackupInfo.BackupState.RUNNING);
    backup.setStartTs(100L);
    // Running backups have no TableSetTimestampMap

    BackupBoundaries boundaries =
      BackupLogCleaner.calculatePreservationBoundary(List.of(backup), 5L);

    // There's only a single backup, and it is still running, so it's a FULL backup.
    // We expect files preceding the snapshot are deletable, but files after the start are not.
    // Because this is not region-server-specific, the buffer is taken into account.
    Path path = new Path("/hbase/oldWALs/server1%2C60020%2C12345.94");
    assertTrue(BackupLogCleaner.canDeleteFile(boundaries, path));
    path = new Path("/hbase/oldWALs/server1%2C60020%2C12345.95");
    assertTrue(BackupLogCleaner.canDeleteFile(boundaries, path));
    path = new Path("/hbase/oldWALs/server1%2C60020%2C12345.96");
    assertFalse(BackupLogCleaner.canDeleteFile(boundaries, path));

    // If there is an already completed backup in the same root, only that one matters.
    // In this case, a region-server-specific timestamp is available, so the buffer is not used.
    BackupInfo backup2 = new BackupInfo();
    backup2.setBackupId("backup_2");
    backup2.setState(BackupInfo.BackupState.COMPLETE);
    backup2.setStartTs(80L);
    backup2
      .setTableSetTimestampMap(Map.of(TableName.valueOf("table1"), Map.of("server1:60020", 90L)));

    boundaries = BackupLogCleaner.calculatePreservationBoundary(List.of(backup, backup2), 5L);

    path = new Path("/hbase/oldWALs/server1%2C60020%2C12345.89");
    assertTrue(BackupLogCleaner.canDeleteFile(boundaries, path));
    path = new Path("/hbase/oldWALs/server1%2C60020%2C12345.90");
    assertTrue(BackupLogCleaner.canDeleteFile(boundaries, path));
    path = new Path("/hbase/oldWALs/server1%2C60020%2C12345.91");
    assertFalse(BackupLogCleaner.canDeleteFile(boundaries, path));
  }

  /** Master WALs are never needed for backups and are always deletable. */
  @Test
  public void testCleansUpHMasterWal() {
    Path path = new Path("/hbase/MasterData/WALs/hmaster,60000,1718808578163");
    assertTrue(BackupLogCleaner.canDeleteFile(BackupBoundaries.builder(0L).build(), path));
  }

  /** Archived ($masterlocalwal$) master WALs are deletable from either archive location. */
  @Test
  public void testCleansUpArchivedHMasterWal() {
    BackupBoundaries empty = BackupBoundaries.builder(0L).build();
    Path normalPath =
      new Path("/hbase/oldWALs/hmaster%2C60000%2C1716224062663.1716247552189$masterlocalwal$");
    assertTrue(BackupLogCleaner.canDeleteFile(empty, normalPath));

    Path masterPath = new Path(
      "/hbase/MasterData/oldWALs/hmaster%2C60000%2C1716224062663.1716247552189$masterlocalwal$");
    assertTrue(BackupLogCleaner.canDeleteFile(empty, masterPath));
  }

  /** Returns a new insertion-ordered set containing {@code toCopy} followed by {@code toAdd}. */
  private Set<FileStatus> mergeAsSet(Collection<FileStatus> toCopy, Collection<FileStatus> toAdd) {
    Set<FileStatus> result = new LinkedHashSet<>(toCopy);
    result.addAll(toAdd);
    return result;
  }

  /** Materializes an {@link Iterable} into an insertion-ordered set for order-insensitive compares. */
  private <T> Set<T> toSet(Iterable<T> iterable) {
    Set<T> result = new LinkedHashSet<>();
    iterable.forEach(result::add);
    return result;
  }
}