001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.backup.master;
019
020import static org.junit.jupiter.api.Assertions.assertEquals;
021import static org.junit.jupiter.api.Assertions.assertFalse;
022import static org.junit.jupiter.api.Assertions.assertTrue;
023
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.backup.BackupInfo;
import org.apache.hadoop.hbase.backup.BackupType;
import org.apache.hadoop.hbase.backup.TestBackupBase;
import org.apache.hadoop.hbase.backup.impl.BackupSystemTable;
import org.apache.hadoop.hbase.backup.util.BackupBoundaries;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.RegionLocator;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.master.HMaster;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.JVMClusterUtil;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
054
055@Tag(LargeTests.TAG)
056public class TestBackupLogCleaner extends TestBackupBase {
057
058  private static final Logger LOG = LoggerFactory.getLogger(TestBackupLogCleaner.class);
059
  // The multi-step full/incremental backup scenario is implemented as a single test
  // (testBackupLogCleaner) because each backup depends on the ones created before it.
062
063  @BeforeAll
064  public static void before() {
065    TEST_UTIL.getConfiguration().setLong(BackupLogCleaner.TS_BUFFER_KEY, 0);
066  }
067
068  @Test
069  public void testBackupLogCleaner() throws Exception {
070    Path backupRoot1 = new Path(BACKUP_ROOT_DIR, "root1");
071    Path backupRoot2 = new Path(BACKUP_ROOT_DIR, "root2");
072
073    List<TableName> tableSetFull = List.of(table1, table2, table3, table4);
074    List<TableName> tableSet14 = List.of(table1, table4);
075    List<TableName> tableSet23 = List.of(table2, table3);
076
077    try (BackupSystemTable systemTable = new BackupSystemTable(TEST_UTIL.getConnection())) {
078      // Verify that we have no backup sessions yet
079      assertFalse(systemTable.hasBackupSessions());
080
081      BackupLogCleaner cleaner = new BackupLogCleaner();
082      cleaner.setConf(TEST_UTIL.getConfiguration());
083      cleaner.init(Map.of(HMaster.MASTER, TEST_UTIL.getHBaseCluster().getMaster()));
084
085      // All WAL files can be deleted because we do not have backups
086      List<FileStatus> walFilesBeforeBackup = getListOfWALFiles(TEST_UTIL.getConfiguration());
087      Iterable<FileStatus> deletable = cleaner.getDeletableFiles(walFilesBeforeBackup);
088      assertEquals(walFilesBeforeBackup, deletable);
089
090      // Create a FULL backup B1 in backupRoot R1, containing all tables
091      String backupIdB1 = backupTables(BackupType.FULL, tableSetFull, backupRoot1.toString());
092      assertTrue(checkSucceeded(backupIdB1));
093
094      // As part of a backup, WALs are rolled, so we expect a new WAL file
095      Set<FileStatus> walFilesAfterB1 =
096        mergeAsSet(walFilesBeforeBackup, getListOfWALFiles(TEST_UTIL.getConfiguration()));
097      assertTrue(walFilesBeforeBackup.size() < walFilesAfterB1.size());
098
099      // Currently, we only have backup B1, so we can delete any WAL preceding B1
100      deletable = cleaner.getDeletableFiles(walFilesAfterB1);
101      assertEquals(toSet(walFilesBeforeBackup), toSet(deletable));
102
103      // Insert some data
104      Connection conn = TEST_UTIL.getConnection();
105      try (Table t1 = conn.getTable(table1)) {
106        Put p1;
107        for (int i = 0; i < NB_ROWS_IN_BATCH; i++) {
108          p1 = new Put(Bytes.toBytes("row-t1" + i));
109          p1.addColumn(famName, qualName, Bytes.toBytes("val" + i));
110          t1.put(p1);
111        }
112      }
113
114      try (Table t2 = conn.getTable(table2)) {
115        Put p2;
116        for (int i = 0; i < 5; i++) {
117          p2 = new Put(Bytes.toBytes("row-t2" + i));
118          p2.addColumn(famName, qualName, Bytes.toBytes("val" + i));
119          t2.put(p2);
120        }
121      }
122
123      // Create an INCREMENTAL backup B2 in backupRoot R1, requesting tables 1 & 4.
124      // Note that incremental tables always include all tables already included in the backup root,
125      // i.e. the backup will contain all tables (1, 2, 3, 4), ignoring what we specify here.
126      LOG.debug("Creating B2");
127      String backupIdB2 = backupTables(BackupType.INCREMENTAL, tableSet14, backupRoot1.toString());
128      assertTrue(checkSucceeded(backupIdB2));
129
130      // As part of a backup, WALs are rolled, so we expect a new WAL file
131      Set<FileStatus> walFilesAfterB2 =
132        mergeAsSet(walFilesAfterB1, getListOfWALFiles(TEST_UTIL.getConfiguration()));
133      assertTrue(walFilesAfterB1.size() < walFilesAfterB2.size());
134
135      // At this point, we have backups in root R1: B1 and B2.
136      // We only consider the most recent backup (B2) to determine which WALs can be deleted:
137      // all WALs preceding B2
138      deletable = cleaner.getDeletableFiles(walFilesAfterB2);
139      assertEquals(toSet(walFilesAfterB1), toSet(deletable));
140
141      // Create a FULL backup B3 in backupRoot R2, containing tables 1 & 4
142      LOG.debug("Creating B3");
143      String backupIdB3 = backupTables(BackupType.FULL, tableSetFull, backupRoot2.toString());
144      assertTrue(checkSucceeded(backupIdB3));
145
146      // As part of a backup, WALs are rolled, so we expect a new WAL file
147      Set<FileStatus> walFilesAfterB3 =
148        mergeAsSet(walFilesAfterB2, getListOfWALFiles(TEST_UTIL.getConfiguration()));
149      assertTrue(walFilesAfterB2.size() < walFilesAfterB3.size());
150
151      // At this point, we have backups in:
152      // root R1: B1 (timestamp=0, all tables), B2 (TS=1, all tables)
153      // root R2: B3 (TS=2, [T1, T4])
154      //
155      // To determine the WAL-deletion boundary, we only consider the most recent backup per root,
156      // so [B2, B3]. From these, we take the least recent as WAL-deletion boundary: B2, it contains
157      // all tables, so acts as the deletion boundary. I.e. only WALs preceding B2 are deletable.
158      deletable = cleaner.getDeletableFiles(walFilesAfterB3);
159      assertEquals(toSet(walFilesAfterB1), toSet(deletable));
160
161      // Create a FULL backup B4 in backupRoot R1, with a subset of tables
162      LOG.debug("Creating B4");
163      String backupIdB4 = backupTables(BackupType.FULL, tableSet14, backupRoot1.toString());
164      assertTrue(checkSucceeded(backupIdB4));
165
166      // As part of a backup, WALs are rolled, so we expect a new WAL file
167      Set<FileStatus> walFilesAfterB4 =
168        mergeAsSet(walFilesAfterB3, getListOfWALFiles(TEST_UTIL.getConfiguration()));
169      assertTrue(walFilesAfterB3.size() < walFilesAfterB4.size());
170
171      // At this point, we have backups in:
172      // root R1: B1 (timestamp=0, all tables), B2 (TS=1, all tables), B4 (TS=3, [T1, T4])
173      // root R2: B3 (TS=2, [T1, T4])
174      //
175      // To determine the WAL-deletion boundary, we only consider the most recent backup per root,
176      // so [B4, B3]. They contain the following timestamp boundaries per table:
177      // B4: { T1: 3, T2: 1, T3: 1, T4: 3 }
178      // B3: { T1: 2, T4: 2 }
179      // Taking the minimum timestamp (= 1), this means all WALs preceding B2 can be deleted.
180      deletable = cleaner.getDeletableFiles(walFilesAfterB4);
181      assertEquals(toSet(walFilesAfterB1), toSet(deletable));
182
183      // Create a FULL backup B5 in backupRoot R1, for tables 2 & 3
184      String backupIdB5 = backupTables(BackupType.FULL, tableSet23, backupRoot1.toString());
185      assertTrue(checkSucceeded(backupIdB5));
186
187      // As part of a backup, WALs are rolled, so we expect a new WAL file
188      Set<FileStatus> walFilesAfterB5 =
189        mergeAsSet(walFilesAfterB4, getListOfWALFiles(TEST_UTIL.getConfiguration()));
190      assertTrue(walFilesAfterB4.size() < walFilesAfterB5.size());
191
192      // At this point, we have backups in:
193      // root R1: ..., B2 (TS=1, all tables), B4 (TS=3, [T1, T4]), B5 (TS=4, [T2, T3])
194      // root R2: B3 (TS=2, [T1, T4])
195      //
196      // To determine the WAL-deletion boundary, we only consider the most recent backup per root,
197      // so [B5, B3]. They contain the following timestamp boundaries per table:
198      // B4: { T1: 3, T2: 4, T3: 4, T4: 3 }
199      // B3: { T1: 2, T4: 2 }
200      // Taking the minimum timestamp (= 2), this means all WALs preceding B3 can be deleted.
201      deletable = cleaner.getDeletableFiles(walFilesAfterB5);
202      assertEquals(toSet(walFilesAfterB2), toSet(deletable));
203    } finally {
204      TEST_UTIL.truncateTable(BackupSystemTable.getTableName(TEST_UTIL.getConfiguration())).close();
205    }
206  }
207
208  @Test
209  public void testDoesNotDeleteWALsFromNewServers() throws Exception {
210    Path backupRoot1 = new Path(BACKUP_ROOT_DIR, "backup1");
211    List<TableName> tableSetFull = List.of(table1, table2, table3, table4);
212
213    JVMClusterUtil.RegionServerThread rsThread = null;
214    try (BackupSystemTable systemTable = new BackupSystemTable(TEST_UTIL.getConnection())) {
215      LOG.info("Creating initial backup B1");
216      String backupIdB1 = backupTables(BackupType.FULL, tableSetFull, backupRoot1.toString());
217      assertTrue(checkSucceeded(backupIdB1));
218
219      List<FileStatus> walsAfterB1 = getListOfWALFiles(TEST_UTIL.getConfiguration());
220      LOG.info("WALs after B1: {}", walsAfterB1.size());
221
222      // Add a new RegionServer to the cluster
223      LOG.info("Adding new RegionServer to cluster");
224      rsThread = TEST_UTIL.getMiniHBaseCluster().startRegionServer();
225      ServerName newServerName = rsThread.getRegionServer().getServerName();
226      LOG.info("New RegionServer started: {}", newServerName);
227
228      // Move a region to the new server to ensure it creates a WAL
229      List<RegionInfo> regions = TEST_UTIL.getAdmin().getRegions(table1);
230      RegionInfo regionToMove = regions.get(0);
231
232      LOG.info("Moving region {} to new server {}", regionToMove.getEncodedName(), newServerName);
233      TEST_UTIL.getAdmin().move(regionToMove.getEncodedNameAsBytes(), newServerName);
234
235      TEST_UTIL.waitFor(30000, () -> {
236        try {
237          HRegionLocation location = TEST_UTIL.getConnection().getRegionLocator(table1)
238            .getRegionLocation(regionToMove.getStartKey());
239          return location.getServerName().equals(newServerName);
240        } catch (IOException e) {
241          return false;
242        }
243      });
244
245      // Write some data to trigger WAL creation on the new server
246      try (Table t1 = TEST_UTIL.getConnection().getTable(table1)) {
247        for (int i = 0; i < 100; i++) {
248          Put p = new Put(Bytes.toBytes("newserver-row-" + i));
249          p.addColumn(famName, qualName, Bytes.toBytes("val" + i));
250          t1.put(p);
251        }
252      }
253      TEST_UTIL.getAdmin().flushRegion(regionToMove.getEncodedNameAsBytes());
254
255      List<FileStatus> walsAfterNewServer = getListOfWALFiles(TEST_UTIL.getConfiguration());
256      LOG.info("WALs after adding new server: {}", walsAfterNewServer.size());
257      assertTrue(walsAfterNewServer.size() > walsAfterB1.size(),
258        "Should have more WALs after new server");
259
260      List<FileStatus> newServerWALs = new ArrayList<>(walsAfterNewServer);
261      newServerWALs.removeAll(walsAfterB1);
262      assertFalse(newServerWALs.isEmpty(), "Should have WALs from new server");
263
264      BackupLogCleaner cleaner = new BackupLogCleaner();
265      cleaner.setConf(TEST_UTIL.getConfiguration());
266      cleaner.init(Map.of(HMaster.MASTER, TEST_UTIL.getHBaseCluster().getMaster()));
267
268      Set<FileStatus> deletable = toSet(cleaner.getDeletableFiles(walsAfterNewServer));
269      for (FileStatus newWAL : newServerWALs) {
270        assertFalse(deletable.contains(newWAL),
271          "WAL from new server should NOT be deletable: " + newWAL.getPath());
272      }
273    } finally {
274      TEST_UTIL.truncateTable(BackupSystemTable.getTableName(TEST_UTIL.getConfiguration())).close();
275      // Clean up the RegionServer we added
276      if (rsThread != null) {
277        LOG.info("Stopping the RegionServer added for test");
278        TEST_UTIL.getMiniHBaseCluster()
279          .stopRegionServer(rsThread.getRegionServer().getServerName());
280        TEST_UTIL.getMiniHBaseCluster()
281          .waitForRegionServerToStop(rsThread.getRegionServer().getServerName(), 30000);
282      }
283    }
284  }
285
286  @Test
287  public void testCanDeleteFileWithNewServerWALs() {
288    BackupInfo backup = new BackupInfo();
289    backup.setState(BackupInfo.BackupState.COMPLETE);
290    backup.setTableSetTimestampMap(
291      Map.of(TableName.valueOf("table1"), Map.of("server1:60020", 1000000L)));
292    BackupBoundaries boundaries =
293      BackupLogCleaner.calculatePreservationBoundary(List.of(backup), 0L);
294
295    // Old WAL from before the backup
296    Path oldWAL = new Path("/hbase/oldWALs/server1%2C60020%2C12345.500000");
297    assertTrue(BackupLogCleaner.canDeleteFile(boundaries, oldWAL),
298      "WAL older than backup should be deletable");
299
300    // WAL from exactly at the backup boundary
301    Path boundaryWAL = new Path("/hbase/oldWALs/server1%2C60020%2C12345.1000000");
302    assertTrue(BackupLogCleaner.canDeleteFile(boundaries, boundaryWAL),
303      "WAL at boundary should be deletable");
304
305    // WAL created after the backup boundary
306    Path newWal = new Path("/hbase/oldWALs/server1%2C60020%2C12345.1500000");
307    assertFalse(BackupLogCleaner.canDeleteFile(boundaries, newWal),
308      "WAL newer than backup should not be deletable");
309
310    // WAL from a new server that joined AFTER the backup
311    Path newServerWAL = new Path("/hbase/oldWALs/newserver%2C60020%2C99999.1500000");
312    assertFalse(BackupLogCleaner.canDeleteFile(boundaries, newServerWAL),
313      "WAL from new server (after backup) should NOT be deletable");
314  }
315
316  @Test
317  public void testFirstBackupProtectsFiles() {
318    BackupInfo backup = new BackupInfo();
319    backup.setBackupId("backup_1");
320    backup.setState(BackupInfo.BackupState.RUNNING);
321    backup.setStartTs(100L);
322    // Running backups have no TableSetTimestampMap
323
324    BackupBoundaries boundaries =
325      BackupLogCleaner.calculatePreservationBoundary(List.of(backup), 5L);
326
327    // There's only a single backup, and it is still running, so it's a FULL backup.
328    // We expect files preceding the snapshot are deletable, but files after the start are not.
329    // Because this is not region-server-specific, the buffer is taken into account.
330    Path path = new Path("/hbase/oldWALs/server1%2C60020%2C12345.94");
331    assertTrue(BackupLogCleaner.canDeleteFile(boundaries, path));
332    path = new Path("/hbase/oldWALs/server1%2C60020%2C12345.95");
333    assertTrue(BackupLogCleaner.canDeleteFile(boundaries, path));
334    path = new Path("/hbase/oldWALs/server1%2C60020%2C12345.96");
335    assertFalse(BackupLogCleaner.canDeleteFile(boundaries, path));
336
337    // If there is an already completed backup in the same root, only that one matters.
338    // In this case, a region-server-specific timestamp is available, so the buffer is not used.
339    BackupInfo backup2 = new BackupInfo();
340    backup2.setBackupId("backup_2");
341    backup2.setState(BackupInfo.BackupState.COMPLETE);
342    backup2.setStartTs(80L);
343    backup2
344      .setTableSetTimestampMap(Map.of(TableName.valueOf("table1"), Map.of("server1:60020", 90L)));
345
346    boundaries = BackupLogCleaner.calculatePreservationBoundary(List.of(backup, backup2), 5L);
347
348    path = new Path("/hbase/oldWALs/server1%2C60020%2C12345.89");
349    assertTrue(BackupLogCleaner.canDeleteFile(boundaries, path));
350    path = new Path("/hbase/oldWALs/server1%2C60020%2C12345.90");
351    assertTrue(BackupLogCleaner.canDeleteFile(boundaries, path));
352    path = new Path("/hbase/oldWALs/server1%2C60020%2C12345.91");
353    assertFalse(BackupLogCleaner.canDeleteFile(boundaries, path));
354  }
355
356  @Test
357  public void testCleansUpHMasterWal() {
358    Path path = new Path("/hbase/MasterData/WALs/hmaster,60000,1718808578163");
359    assertTrue(BackupLogCleaner.canDeleteFile(BackupBoundaries.builder(0L).build(), path));
360  }
361
362  @Test
363  public void testCleansUpArchivedHMasterWal() {
364    BackupBoundaries empty = BackupBoundaries.builder(0L).build();
365    Path normalPath =
366      new Path("/hbase/oldWALs/hmaster%2C60000%2C1716224062663.1716247552189$masterlocalwal$");
367    assertTrue(BackupLogCleaner.canDeleteFile(empty, normalPath));
368
369    Path masterPath = new Path(
370      "/hbase/MasterData/oldWALs/hmaster%2C60000%2C1716224062663.1716247552189$masterlocalwal$");
371    assertTrue(BackupLogCleaner.canDeleteFile(empty, masterPath));
372  }
373
374  private Set<FileStatus> mergeAsSet(Collection<FileStatus> toCopy, Collection<FileStatus> toAdd) {
375    Set<FileStatus> result = new LinkedHashSet<>(toCopy);
376    result.addAll(toAdd);
377    return result;
378  }
379
380  private <T> Set<T> toSet(Iterable<T> iterable) {
381    Set<T> result = new LinkedHashSet<>();
382    iterable.forEach(result::add);
383    return result;
384  }
385}