
Commit 1f676bb

janvanbesien-ngdata authored and Dieter De Paepe committed
HBASE-29905 BackupLogCleaner retains old WAL files due to stale entries in the backup:system table

BackupLogCleaner.serverToPreservationBoundaryTs() computes the WAL deletion boundary by iterating over the tableSetTimestampMap of persisted BackupInfo sessions. This map can contain entries for tables that were once part of the backup set but have since been deleted (or had all their backups deleted). Their stale, old timestamps drag the minimum WAL boundary back, preventing old WALs from being cleaned up.

Fix: when computing boundaries, load the incrbackupset per backup root from BackupSystemTable and skip tables not in the active set. The incrbackupset is populated on every full backup and pruned when backups are deleted, so it accurately reflects which tables still need WAL retention.
1 parent 598600e commit 1f676bb
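
The heart of the bug is that the preservation boundary is a minimum over per-table WAL timestamps, so a single stale entry pins every WAL behind it. A minimal, self-contained sketch of that arithmetic (plain Java with hypothetical names and values, not the actual HBase types):

```java
import java.util.Map;
import java.util.Set;

public class StaleBoundarySketch {
  public static void main(String[] args) {
    // Per-table WAL timestamps as carried in the newest backup of one root
    // (the shape of tableSetTimestampMap, flattened to one server for brevity).
    Map<String, Long> tableTimestamps = Map.of(
      "table1", 5_000L,  // freshly backed up
      "table2", 1_000L); // stale: all of table2's backups were deleted

    // Tables still present in the incremental backup set for this root.
    Set<String> activeTables = Set.of("table1");

    long before = boundary(tableTimestamps, Set.of("table1", "table2"));
    long after = boundary(tableTimestamps, activeTables);

    System.out.println("boundary without skip: " + before); // 1000 -> old WALs retained
    System.out.println("boundary with skip:    " + after);  // 5000 -> old WALs deletable
  }

  // The boundary is the minimum timestamp over all tables considered; skipping
  // inactive tables keeps stale entries from dragging it backwards.
  static long boundary(Map<String, Long> tableTimestamps, Set<String> activeTables) {
    return tableTimestamps.entrySet().stream()
      .filter(e -> activeTables.contains(e.getKey()))
      .mapToLong(Map.Entry::getValue)
      .min()
      .orElse(Long.MAX_VALUE);
  }
}
```

With the skip in place, deleting the last backup that references a table immediately releases the WALs that table was pinning.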

3 files changed: 197 additions & 13 deletions


hbase-backup/src/main/java/org/apache/hadoop/hbase/backup/master/BackupLogCleaner.java

Lines changed: 28 additions & 5 deletions
```diff
@@ -25,11 +25,13 @@
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 import java.util.stream.Collectors;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hbase.HBaseInterfaceAudience;
+import org.apache.hadoop.hbase.TableName;
 import org.apache.hadoop.hbase.backup.BackupInfo;
 import org.apache.hadoop.hbase.backup.BackupInfo.BackupState;
 import org.apache.hadoop.hbase.backup.BackupRestoreConstants;
@@ -89,11 +91,13 @@ public void init(Map<String, Object> params) {
    * Calculates the timestamp boundary up to which all backup roots have already included the WAL.
    * I.e. WALs with a lower (= older) or equal timestamp are no longer needed for future incremental
    * backups.
-   * @param backups  all completed or running backups to use for the calculation of the boundary
-   * @param tsBuffer a buffer (in ms) to lower the boundary for the default bound
+   * @param backups                   all completed or running backups to use for the calculation of
+   *                                  the boundary
+   * @param incrBackupTableSetPerRoot per backup root, the tables included in incremental backups
+   * @param tsBuffer                  a buffer (in ms) to lower the boundary for the default bound
    */
   protected static BackupBoundaries calculatePreservationBoundary(List<BackupInfo> backups,
-    long tsBuffer) {
+    Map<String, Set<TableName>> incrBackupTableSetPerRoot, long tsBuffer) {
     if (LOG.isDebugEnabled()) {
       LOG.debug(
         "Cleaning WALs if they are older than the WAL cleanup time-boundary. "
@@ -120,7 +124,16 @@ protected static BackupBoundaries calculatePreservationBoundary(List<BackupInfo>
     }
 
     BackupBoundaries.BackupBoundariesBuilder builder = BackupBoundaries.builder(tsBuffer);
-    newestBackupPerRootDir.values().forEach(builder::update);
+    for (Map.Entry<String, BackupInfo> rootEntry : newestBackupPerRootDir.entrySet()) {
+      String backupRoot = rootEntry.getKey();
+      BackupInfo backupInfo = rootEntry.getValue();
+      // Load the active backup table set for each backup root, so we can then skip
+      // tables that are no longer part of the backup set. This set accurately reflects
+      // which tables still have backups and may need future incremental backups.
+      Set<TableName> activeTables = incrBackupTableSetPerRoot.get(backupRoot);
+      builder.update(backupInfo, activeTables);
+    }
+
     BackupBoundaries boundaries = builder.build();
 
     if (LOG.isDebugEnabled()) {
@@ -151,7 +164,16 @@ public Iterable<FileStatus> getDeletableFiles(Iterable<FileStatus> files) {
       long tsBuffer = getConf().getLong(TS_BUFFER_KEY, TS_BUFFER_DEFAULT);
       List<BackupInfo> backupHistory = sysTable.getBackupHistory(
         i -> EnumSet.of(BackupState.COMPLETE, BackupState.RUNNING).contains(i.getState()));
-      boundaries = calculatePreservationBoundary(backupHistory, tsBuffer);
+      Map<String, Set<TableName>> incrBackupTableSetPerRoot = new HashMap<>();
+      for (BackupInfo backupInfo : backupHistory) {
+        String backupRootDir = backupInfo.getBackupRootDir();
+        if (!incrBackupTableSetPerRoot.containsKey(backupRootDir)) {
+          incrBackupTableSetPerRoot.put(backupRootDir,
+            sysTable.getIncrementalBackupTableSet(backupRootDir));
+        }
+      }
+      boundaries =
+        calculatePreservationBoundary(backupHistory, incrBackupTableSetPerRoot, tsBuffer);
     } catch (IOException ex) {
       LOG.error("Failed to analyse backup history with exception: {}. Retaining all logs",
         ex.getMessage(), ex);
@@ -206,4 +228,5 @@ private static boolean isHMasterWAL(Path path) {
       || fn.endsWith(MasterRegionFactory.ARCHIVED_WAL_SUFFIX)
       || path.toString().contains("/%s/".formatted(MasterRegionFactory.MASTER_STORE_DIR));
   }
+
 }
```
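
As an aside on the caller-side loading in getDeletableFiles() above: each root's incremental table set is fetched only once per cleaner run. The same dedup could be written with computeIfAbsent, at the cost of tunnelling the checked IOException out of the lambda — likely why the patch keeps the plain loop. A sketch (the IncrTableSetLoader wrapper is ours; the BackupSystemTable and BackupInfo calls are the ones the patch uses):

```java
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.backup.BackupInfo;
import org.apache.hadoop.hbase.backup.impl.BackupSystemTable;

final class IncrTableSetLoader {
  // Same per-root dedup as the patch's loop, written with computeIfAbsent.
  // The checked IOException from the system-table lookup must be wrapped
  // inside the lambda and unwrapped afterwards.
  static Map<String, Set<TableName>> load(BackupSystemTable sysTable,
    List<BackupInfo> backupHistory) throws IOException {
    Map<String, Set<TableName>> perRoot = new HashMap<>();
    try {
      for (BackupInfo info : backupHistory) {
        perRoot.computeIfAbsent(info.getBackupRootDir(), root -> {
          try {
            return sysTable.getIncrementalBackupTableSet(root);
          } catch (IOException e) {
            throw new UncheckedIOException(e);
          }
        });
      }
    } catch (UncheckedIOException e) {
      throw e.getCause();
    }
    return perRoot;
  }
}
```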

hbase-backup/src/main/java/org/apache/hadoop/hbase/backup/util/BackupBoundaries.java

Lines changed: 7 additions & 1 deletion
```diff
@@ -20,6 +20,7 @@
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.Map;
+import java.util.Set;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hbase.TableName;
 import org.apache.hadoop.hbase.backup.BackupInfo;
@@ -126,12 +127,17 @@ private BackupBoundariesBuilder(long tsCleanupBuffer) {
      * @param backupInfo the most recent completed backup info for a backup root, or if there is no
      *                   such completed backup, the currently running backup.
      */
-    public void update(BackupInfo backupInfo) {
+    public void update(BackupInfo backupInfo, Set<TableName> activeTables) {
       switch (backupInfo.getState()) {
         case COMPLETE:
           // If a completed backup exists in the backup root, we want to protect all logs that
           // have been created since the log-roll that happened for that backup.
           for (TableName table : backupInfo.getTableSetTimestampMap().keySet()) {
+            if (!activeTables.contains(table)) {
+              LOG.debug("Skipping stale table {} in backup root {}: not in incremental backup set",
+                table, backupInfo.getBackupRootDir());
+              continue;
+            }
             for (Map.Entry<String, Long> entry : backupInfo.getTableSetTimestampMap().get(table)
               .entrySet()) {
               Address regionServerAddress = Address.fromString(entry.getKey());
```
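
Note that update() dereferences activeTables only in the COMPLETE branch, which is why testFirstBackupProtectsFiles below can pass null /* ignored */ for a root whose only backup is still RUNNING. Should that invariant ever loosen, a null-tolerant guard would be the obvious hardening; a hypothetical sketch, not part of the patch:

```java
import java.util.Set;
import org.apache.hadoop.hbase.TableName;

final class ActiveTableFilter {
  // Hypothetical null-tolerant variant of the patch's guard in
  // BackupBoundariesBuilder.update(): a null set means "no filtering",
  // so callers that legitimately pass null (RUNNING-only roots) stay safe.
  static boolean isActive(Set<TableName> activeTables, TableName table) {
    return activeTables == null || activeTables.contains(table);
  }
}
```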

hbase-backup/src/test/java/org/apache/hadoop/hbase/backup/master/TestBackupLogCleaner.java

Lines changed: 162 additions & 7 deletions
```diff
@@ -23,7 +23,10 @@
 
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
 import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Map;
@@ -205,6 +208,154 @@ public void testBackupLogCleaner() throws Exception {
     }
   }
 
+  /**
+   * Verify that when a table is no longer in the backup set, it doesn't block WAL cleanup.
+   */
+  @Test
+  public void testRemovedBackupDoesNotPinWals() throws Exception {
+    Path backupRoot = new Path(BACKUP_ROOT_DIR, "staleRoot");
+
+    try {
+      BackupLogCleaner cleaner = new BackupLogCleaner();
+      cleaner.setConf(TEST_UTIL.getConfiguration());
+      Map<String, Object> params = new HashMap<>(1);
+      params.put(HMaster.MASTER, TEST_UTIL.getHBaseCluster().getMaster());
+      cleaner.init(params);
+
+      // Create FULL backup B1 with table1 and table2
+      String backupIdB1 =
+        backupTables(BackupType.FULL, Arrays.asList(table1, table2), backupRoot.toString());
+      assertTrue(checkSucceeded(backupIdB1));
+
+      Set<FileStatus> walFilesAfterB1 =
+        new LinkedHashSet<>(getListOfWALFiles(TEST_UTIL.getConfiguration()));
+
+      // Insert data so the next backup advances WAL positions for table1
+      Connection conn = TEST_UTIL.getConnection();
+      try (Table t1 = conn.getTable(table1)) {
+        for (int i = 0; i < NB_ROWS_IN_BATCH; i++) {
+          Put p = new Put(Bytes.toBytes("stale-row-t1" + i));
+          p.addColumn(famName, qualName, Bytes.toBytes("val" + i));
+          t1.put(p);
+        }
+      }
+
+      // Create FULL backup B2 with only table1.
+      // B2's tableSetTimestampMap carries forward the old timestamp from B1 for table2,
+      // while table1 gets a fresh timestamp: { table1: ts(B2), table2: ts(B1) }
+      String backupIdB2 =
+        backupTables(BackupType.FULL, Collections.singletonList(table1), backupRoot.toString());
+      assertTrue(checkSucceeded(backupIdB2));
+
+      Set<FileStatus> walFilesAfterB2 =
+        mergeAsSet(walFilesAfterB1, getListOfWALFiles(TEST_UTIL.getConfiguration()));
+
+      // Delete B1: since it is the only backup referencing table2, finalizeDelete will
+      // remove table2 from the incremental backup set for this root.
+      getBackupAdmin().deleteBackups(new String[] { backupIdB1 });
+
+      // table2 is no longer in the backup set, so the boundary = ts(B2) instead of
+      // min(ts(B2), ts(B1)) = ts(B1). WALs between B1 and B2 are now deletable.
+      Iterable<FileStatus> deletable = cleaner.getDeletableFiles(walFilesAfterB2);
+      assertTrue(toSet(deletable).containsAll(walFilesAfterB1),
+        "WALs after B1 should be deletable once stale tables are removed from incr set");
+    } finally {
+      TEST_UTIL.truncateTable(BackupSystemTable.getTableName(TEST_UTIL.getConfiguration())).close();
+    }
+  }
+
+  /**
+   * Similar to {@link #testRemovedBackupDoesNotPinWals()} but for the case where a table is
+   * removed, and hence no longer in any backup either.
+   */
+  @Test
+  public void testRemovedTableDoesNotPinWals() throws Exception {
+    Path backupRoot = new Path(BACKUP_ROOT_DIR, "removedTableRoot");
+
+    try {
+      BackupLogCleaner cleaner = new BackupLogCleaner();
+      cleaner.setConf(TEST_UTIL.getConfiguration());
+      cleaner.init(Map.of(HMaster.MASTER, TEST_UTIL.getHBaseCluster().getMaster()));
+
+      // F1: Full backup of table1 and table2
+      String backupIdF1 =
+        backupTables(BackupType.FULL, Arrays.asList(table1, table2), backupRoot.toString());
+      assertTrue(checkSucceeded(backupIdF1));
+
+      Set<FileStatus> walFilesAfterF1 =
+        new LinkedHashSet<>(getListOfWALFiles(TEST_UTIL.getConfiguration()));
+
+      // Insert data into both tables so the incremental has something to back up
+      Connection conn = TEST_UTIL.getConnection();
+      try (Table t1 = conn.getTable(table1)) {
+        for (int i = 0; i < NB_ROWS_IN_BATCH; i++) {
+          Put p = new Put(Bytes.toBytes("rem-t1-" + i));
+          p.addColumn(famName, qualName, Bytes.toBytes("val" + i));
+          t1.put(p);
+        }
+      }
+      try (Table t2 = conn.getTable(table2)) {
+        for (int i = 0; i < NB_ROWS_IN_BATCH; i++) {
+          Put p = new Put(Bytes.toBytes("rem-t2-" + i));
+          p.addColumn(famName, qualName, Bytes.toBytes("val" + i));
+          t2.put(p);
+        }
+      }
+
+      // I1: Incremental backup (for both table1 and table2)
+      String backupIdI1 =
+        backupTables(BackupType.INCREMENTAL, Arrays.asList(table1, table2), backupRoot.toString());
+      assertTrue(checkSucceeded(backupIdI1));
+
+      Set<FileStatus> walFilesAfterI1 =
+        mergeAsSet(walFilesAfterF1, getListOfWALFiles(TEST_UTIL.getConfiguration()));
+
+      // Delete table2 from HBase. After this point, no new backup can reference it.
+      TEST_UTIL.deleteTable(table2);
+
+      // F2: Full backup of table1 only
+      String backupIdF2 =
+        backupTables(BackupType.FULL, Collections.singletonList(table1), backupRoot.toString());
+      assertTrue(checkSucceeded(backupIdF2));
+
+      Set<FileStatus> walFilesAfterF2 =
+        mergeAsSet(walFilesAfterI1, getListOfWALFiles(TEST_UTIL.getConfiguration()));
+
+      // Insert more data into table1 so the incremental has something to back up
+      try (Table t1 = conn.getTable(table1)) {
+        for (int i = 0; i < NB_ROWS_IN_BATCH; i++) {
+          Put p = new Put(Bytes.toBytes("rem-t1b-" + i));
+          p.addColumn(famName, qualName, Bytes.toBytes("val" + i));
+          t1.put(p);
+        }
+      }
+
+      // I2: Incremental backup of table1 only
+      String backupIdI2 = backupTables(BackupType.INCREMENTAL, Collections.singletonList(table1),
+        backupRoot.toString());
+      assertTrue(checkSucceeded(backupIdI2));
+
+      Set<FileStatus> walFilesAfterI2 =
+        mergeAsSet(walFilesAfterF2, getListOfWALFiles(TEST_UTIL.getConfiguration()));
+
+      // Delete F1 and I1 (i.e. the backups that still referenced table2).
+      getBackupAdmin().deleteBackups(new String[] { backupIdF1, backupIdI1 });
+
+      // table2 no longer exists and F1/I1 (the only backups referencing it) are deleted.
+      // The boundary should now be based solely on the remaining backups (F2, I2) for table1.
+      // All WALs after I1 should be deletable.
+      Iterable<FileStatus> deletable = cleaner.getDeletableFiles(walFilesAfterI2);
+      assertTrue(toSet(deletable).containsAll(walFilesAfterI1),
+        "WALs after I1 should be deletable once removed table no longer pins them");
+    } finally {
+      // Recreate table2 if it was deleted during the test
+      if (!TEST_UTIL.getAdmin().tableExists(table2)) {
+        TEST_UTIL.createTable(table2, famName);
+      }
+      TEST_UTIL.truncateTable(BackupSystemTable.getTableName(TEST_UTIL.getConfiguration())).close();
+    }
+  }
+
   @Test
   public void testDoesNotDeleteWALsFromNewServers() throws Exception {
     Path backupRoot1 = new Path(BACKUP_ROOT_DIR, "backup1");
@@ -287,10 +438,11 @@ public void testDoesNotDeleteWALsFromNewServers() throws Exception {
   public void testCanDeleteFileWithNewServerWALs() {
     BackupInfo backup = new BackupInfo();
     backup.setState(BackupInfo.BackupState.COMPLETE);
-    backup.setTableSetTimestampMap(
-      Map.of(TableName.valueOf("table1"), Map.of("server1:60020", 1000000L)));
+    TableName table = TableName.valueOf("table1");
+    Map<String, Set<TableName>> incrTableSetPerRoot = Map.of("root1", Set.of(table));
+    backup.setTableSetTimestampMap(Map.of(table, Map.of("server1:60020", 1000000L)));
     BackupBoundaries boundaries =
-      BackupLogCleaner.calculatePreservationBoundary(List.of(backup), 0L);
+      BackupLogCleaner.calculatePreservationBoundary(List.of(backup), incrTableSetPerRoot, 0L);
 
     // Old WAL from before the backup
     Path oldWAL = new Path("/hbase/oldWALs/server1%2C60020%2C12345.500000");
@@ -315,14 +467,16 @@
 
   @Test
   public void testFirstBackupProtectsFiles() {
+    TableName table = TableName.valueOf("table1");
+
     BackupInfo backup = new BackupInfo();
     backup.setBackupId("backup_1");
     backup.setState(BackupInfo.BackupState.RUNNING);
     backup.setStartTs(100L);
     // Running backups have no TableSetTimestampMap
 
     BackupBoundaries boundaries =
-      BackupLogCleaner.calculatePreservationBoundary(List.of(backup), 5L);
+      BackupLogCleaner.calculatePreservationBoundary(List.of(backup), null /* ignored */, 5L);
 
     // There's only a single backup, and it is still running, so it's a FULL backup.
     // We expect files preceding the snapshot are deletable, but files after the start are not.
@@ -340,10 +494,11 @@ public void testFirstBackupProtectsFiles() {
     backup2.setBackupId("backup_2");
     backup2.setState(BackupInfo.BackupState.COMPLETE);
     backup2.setStartTs(80L);
-    backup2
-      .setTableSetTimestampMap(Map.of(TableName.valueOf("table1"), Map.of("server1:60020", 90L)));
+    backup2.setTableSetTimestampMap(Map.of(table, Map.of("server1:60020", 90L)));
 
-    boundaries = BackupLogCleaner.calculatePreservationBoundary(List.of(backup, backup2), 5L);
+    Map<String, Set<TableName>> incrTableSetPerRoot = Map.of("root1", Set.of(table));
+    boundaries = BackupLogCleaner.calculatePreservationBoundary(List.of(backup, backup2),
+      incrTableSetPerRoot, 5L);
 
     path = new Path("/hbase/oldWALs/server1%2C60020%2C12345.89");
     assertTrue(BackupLogCleaner.canDeleteFile(boundaries, path));
```