From 1fad9ba33cf2a96a4ad6488c5f01de38d1c42a1f Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Wed, 24 Jun 2026 17:45:14 +0800 Subject: [PATCH] fix(update): clean up delete files from expired manifests Reachable cleanup now reads live paths from data and delete manifests, matching Java's ManifestFiles.readPaths behavior. --- src/iceberg/test/expire_snapshots_test.cc | 13 +++++--- src/iceberg/update/expire_snapshots.cc | 37 +++++++++++------------ 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/src/iceberg/test/expire_snapshots_test.cc b/src/iceberg/test/expire_snapshots_test.cc index 1216952e7..b96754ec0 100644 --- a/src/iceberg/test/expire_snapshots_test.cc +++ b/src/iceberg/test/expire_snapshots_test.cc @@ -376,6 +376,8 @@ TEST_F(ExpireSnapshotsCleanupTest, IgnoresExpiredDeleteManifestReadFailures) { const auto expired_data_manifest_path = table_location_ + "/metadata/expired-data.avro"; const auto expired_delete_manifest_path = table_location_ + "/metadata/expired-delete.avro"; + const auto missing_delete_manifest_path = + table_location_ + "/metadata/missing-delete.avro"; const auto expired_manifest_list_path = table_location_ + "/metadata/expired-manifest-list.avro"; const auto current_manifest_list_path = @@ -389,6 +391,7 @@ TEST_F(ExpireSnapshotsCleanupTest, IgnoresExpiredDeleteManifestReadFailures) { expired_delete_manifest_path, kExpiredSnapshotId, {MakeEntry(ManifestStatus::kAdded, kExpiredSnapshotId, kExpiredSequenceNumber, MakePositionDeleteFile(expired_delete_file_path))}); + expired_delete_manifest.manifest_path = missing_delete_manifest_path; WriteManifestList(expired_manifest_list_path, kExpiredSnapshotId, /*parent_snapshot_id=*/0, kExpiredSequenceNumber, {expired_data_manifest, expired_delete_manifest}); @@ -406,7 +409,7 @@ TEST_F(ExpireSnapshotsCleanupTest, IgnoresExpiredDeleteManifestReadFailures) { EXPECT_THAT(update->Commit(), IsOk()); EXPECT_THAT(deleted_files, testing::UnorderedElementsAre(expired_data_file_path, expired_data_manifest_path, - expired_delete_manifest_path, + missing_delete_manifest_path, expired_manifest_list_path)); } @@ -443,10 +446,10 @@ TEST_F(ExpireSnapshotsCleanupTest, DeletesExpiredFiles) { [&deleted_files](const std::string& path) { deleted_files.push_back(path); }); EXPECT_THAT(update->Commit(), IsOk()); - EXPECT_THAT(deleted_files, testing::UnorderedElementsAre(expired_data_file_path, - expired_data_manifest_path, - expired_delete_manifest_path, - expired_manifest_list_path)); + EXPECT_THAT(deleted_files, testing::UnorderedElementsAre( + expired_data_file_path, expired_delete_file_path, + expired_data_manifest_path, expired_delete_manifest_path, + expired_manifest_list_path)); } TEST_F(ExpireSnapshotsCleanupTest, ExecutorDispatchesDeletesConcurrently) { diff --git a/src/iceberg/update/expire_snapshots.cc b/src/iceberg/update/expire_snapshots.cc index d14a7331f..5573efa77 100644 --- a/src/iceberg/update/expire_snapshots.cc +++ b/src/iceberg/update/expire_snapshots.cc @@ -236,10 +236,10 @@ class ReachableFileCleanup : public FileCleanupStrategy { if (!manifests_to_delete.empty()) { if (level == CleanupLevel::kAll) { - // Deleting data files - auto data_files_to_delete = FindDataFilesToDelete( + // Deleting data and delete files. + auto files_to_delete = FindFilesToDelete( metadata_before_expiration, manifests_to_delete, current_manifests); - DeleteFiles(data_files_to_delete); + DeleteFiles(files_to_delete); } // Deleting manifest files @@ -300,25 +300,22 @@ class ReachableFileCleanup : public FileCleanupStrategy { return manifests_to_delete; } - Result> ReadLiveDataFilePaths( + Result> ReadLiveFilePaths( const TableMetadata& metadata, const ManifestFile& manifest) { - ICEBERG_PRECHECK(manifest.content == ManifestContent::kData, - "Cannot read data file paths from a delete manifest: {}", - manifest.manifest_path); - // TODO(shangxinli): optimize by only reading file paths ICEBERG_ASSIGN_OR_RAISE(auto reader, MakeManifestReader(manifest, file_io_, metadata)); + reader->Select({"file_path"}); ICEBERG_ASSIGN_OR_RAISE(auto entries, reader->LiveEntries()); - std::unordered_set data_file_paths; + std::unordered_set file_paths; for (const auto& entry : entries) { if (entry.data_file) { - data_file_paths.insert(entry.data_file->file_path); + file_paths.insert(entry.data_file->file_path); } } - return data_file_paths; + return file_paths; } /// \brief Project manifests to manifest paths for deletion. @@ -332,8 +329,8 @@ class ReachableFileCleanup : public FileCleanupStrategy { return manifest_paths; } - /// \brief Find data files to delete from manifests being removed. - std::unordered_set FindDataFilesToDelete( + /// \brief Find files to delete from manifests being removed. + std::unordered_set FindFilesToDelete( const TableMetadata& metadata, const std::unordered_set& manifests_to_delete, const std::unordered_set& current_manifests) { @@ -342,7 +339,7 @@ class ReachableFileCleanup : public FileCleanupStrategy { plan_executor_, manifests_to_delete, [this, &metadata]( const ManifestFile& manifest) -> Result> { - auto result = ReadLiveDataFilePaths(metadata, manifest); + auto result = ReadLiveFilePaths(metadata, manifest); // Ignore expired-manifest read failures and keep scanning candidates. if (!result.has_value()) { return std::unordered_set{}; @@ -352,26 +349,26 @@ class ReachableFileCleanup : public FileCleanupStrategy { if (!live_file_results.has_value()) { return {}; } - auto data_files_to_delete = std::move(live_file_results).value(); - if (data_files_to_delete.empty()) { - return data_files_to_delete; + auto files_to_delete = std::move(live_file_results).value(); + if (files_to_delete.empty()) { + return files_to_delete; } // Remove files still referenced by current manifests. auto current_live_results = ParallelCollect(plan_executor_, current_manifests, [this, &metadata](const ManifestFile& manifest) { - return ReadLiveDataFilePaths(metadata, manifest); + return ReadLiveFilePaths(metadata, manifest); }); // Fail closed if any retained manifest cannot be read safely. if (!current_live_results.has_value()) { return std::unordered_set{}; } for (const auto& file_path : current_live_results.value()) { - data_files_to_delete.erase(file_path); + files_to_delete.erase(file_path); } - return data_files_to_delete; + return files_to_delete; } };