From 0fbe4d4a28efc0f6d644e912aa6c717e039fea18 Mon Sep 17 00:00:00 2001 From: "shuxu.li" Date: Sun, 14 Jun 2026 22:20:25 +0800 Subject: [PATCH 1/5] feat(update): add OverwriteFiles for overwrite snapshot commits Summary: Add a production OverwriteFiles builder that brings iceberg-cpp to semantic parity with Java's BaseOverwriteFiles. It supports explicit file replacement (DeleteFile + AddFile) and range-based replacement (OverwriteByRowFilter + AddFile) with the same family of pre-commit concurrency validations. The builder is a thin subclass of MergingSnapshotUpdate and reuses the existing commit kernel (Apply/summary/retry/cleanup) unchanged. Changes: - New OverwriteFiles class (src/iceberg/update/overwrite_files.{h,cc}) and Table::NewOverwrite() / Transaction::NewOverwrite() entry points. - Builder surface: AddFile, DeleteFile, bulk DeleteFiles, OverwriteByRowFilter, ValidateFromSnapshot, ConflictDetectionFilter, ValidateNoConflictingData, ValidateNoConflictingDeletes, ValidateAddedFilesMatchOverwriteFilter, WithCaseSensitivity. - Validate(): conflict-filter resolution, concurrent add/delete conflict checks, and strict added-file range validation (projection + StrictMetricsEvaluator). - Tests (overwrite_files_test.cc, 45 cases) and CMake/meson wiring. Behavior alignment with Java: - operation() returns append/delete/overwrite from builder content. - Conflict-filter resolution mirrors BaseOverwriteFiles (explicit -> row filter -> AlwaysTrue); replaced-file delete checks honor ConflictDetectionFilter. - Strict added-file validation uses a single DataSpec(), rejecting multi-spec and empty added-file sets. - Deviations: public WithCaseSensitivity (vs caseSensitive) to avoid a protected-name clash; ValidateFromSnapshot rejects negative ids early. --- src/iceberg/CMakeLists.txt | 1 + src/iceberg/meson.build | 1 + src/iceberg/table.cc | 7 + src/iceberg/table.h | 3 + src/iceberg/test/CMakeLists.txt | 3 +- src/iceberg/test/meson.build | 4 + src/iceberg/test/overwrite_files_test.cc | 1052 ++++++++++++++++++++++ src/iceberg/transaction.cc | 8 + src/iceberg/transaction.h | 3 + src/iceberg/type_fwd.h | 1 + src/iceberg/update/meson.build | 1 + src/iceberg/update/overwrite_files.cc | 352 ++++++++ src/iceberg/update/overwrite_files.h | 152 ++++ 13 files changed, 1587 insertions(+), 1 deletion(-) create mode 100644 src/iceberg/test/overwrite_files_test.cc create mode 100644 src/iceberg/update/overwrite_files.cc create mode 100644 src/iceberg/update/overwrite_files.h diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt index 9a0dc68b7..aa051cd14 100644 --- a/src/iceberg/CMakeLists.txt +++ b/src/iceberg/CMakeLists.txt @@ -105,6 +105,7 @@ set(ICEBERG_SOURCES update/fast_append.cc update/merge_append.cc update/merging_snapshot_update.cc + update/overwrite_files.cc update/pending_update.cc update/row_delta.cc update/set_snapshot.cc diff --git a/src/iceberg/meson.build b/src/iceberg/meson.build index ab514be87..58d79a402 100644 --- a/src/iceberg/meson.build +++ b/src/iceberg/meson.build @@ -130,6 +130,7 @@ iceberg_sources = files( 'update/fast_append.cc', 'update/merge_append.cc', 'update/merging_snapshot_update.cc', + 'update/overwrite_files.cc', 'update/pending_update.cc', 'update/row_delta.cc', 'update/set_snapshot.cc', diff --git a/src/iceberg/table.cc b/src/iceberg/table.cc index afa626964..1365c0565 100644 --- a/src/iceberg/table.cc +++ b/src/iceberg/table.cc @@ -36,6 +36,7 @@ #include "iceberg/update/fast_append.h" #include "iceberg/update/merge_append.h" #include "iceberg/update/row_delta.h" +#include "iceberg/update/overwrite_files.h" #include "iceberg/update/set_snapshot.h" #include "iceberg/update/snapshot_manager.h" #include "iceberg/update/update_location.h" @@ -238,6 +239,12 @@ Result> Table::NewRowDelta() { return RowDelta::Make(name().name, std::move(ctx)); } +Result> Table::NewOverwrite() { + ICEBERG_ASSIGN_OR_RAISE( + auto ctx, TransactionContext::Make(shared_from_this(), TransactionKind::kUpdate)); + return OverwriteFiles::Make(name().name, std::move(ctx)); +} + Result> Table::NewUpdateStatistics() { ICEBERG_ASSIGN_OR_RAISE( auto ctx, TransactionContext::Make(shared_from_this(), TransactionKind::kUpdate)); diff --git a/src/iceberg/table.h b/src/iceberg/table.h index c8f6ded08..536c97ad7 100644 --- a/src/iceberg/table.h +++ b/src/iceberg/table.h @@ -185,6 +185,9 @@ class ICEBERG_EXPORT Table : public std::enable_shared_from_this { /// \brief Create a new RowDelta to add rows and row-level deletes. virtual Result> NewRowDelta(); + /// \brief Create a new OverwriteFiles to overwrite data files and commit the changes. + virtual Result> NewOverwrite(); + /// \brief Create a new SnapshotManager to manage snapshots and snapshot references. virtual Result> NewSnapshotManager(); diff --git a/src/iceberg/test/CMakeLists.txt b/src/iceberg/test/CMakeLists.txt index e528b1333..0756c1eef 100644 --- a/src/iceberg/test/CMakeLists.txt +++ b/src/iceberg/test/CMakeLists.txt @@ -239,7 +239,8 @@ if(ICEBERG_BUILD_BUNDLE) update_properties_test.cc update_schema_test.cc update_sort_order_test.cc - update_statistics_test.cc) + update_statistics_test.cc + overwrite_files_test.cc) add_iceberg_test(data_test USE_BUNDLE diff --git a/src/iceberg/test/meson.build b/src/iceberg/test/meson.build index b01a61904..053c73cb7 100644 --- a/src/iceberg/test/meson.build +++ b/src/iceberg/test/meson.build @@ -123,6 +123,10 @@ iceberg_tests = { ), 'use_data': true, }, + 'overwrite_files_test': { + 'sources': files('overwrite_files_test.cc'), + 'use_data': true, + }, } if get_option('rest').enabled() diff --git a/src/iceberg/test/overwrite_files_test.cc b/src/iceberg/test/overwrite_files_test.cc new file mode 100644 index 000000000..d003c6ee5 --- /dev/null +++ b/src/iceberg/test/overwrite_files_test.cc @@ -0,0 +1,1052 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/update/overwrite_files.h" + +#include +#include +#include +#include + +#include +#include + +#include "iceberg/avro/avro_register.h" +#include "iceberg/constants.h" +#include "iceberg/expression/expressions.h" +#include "iceberg/manifest/manifest_entry.h" +#include "iceberg/manifest/manifest_list.h" +#include "iceberg/manifest/manifest_writer.h" +#include "iceberg/partition_field.h" +#include "iceberg/partition_spec.h" +#include "iceberg/row/partition_values.h" +#include "iceberg/schema.h" +#include "iceberg/snapshot.h" +#include "iceberg/table.h" +#include "iceberg/table_metadata.h" +#include "iceberg/test/matchers.h" +#include "iceberg/test/update_test_base.h" +#include "iceberg/transaction.h" +#include "iceberg/transform.h" +#include "iceberg/update/fast_append.h" +#include "iceberg/util/data_file_set.h" +#include "iceberg/util/macros.h" + +namespace iceberg { + +// ===================================================================================== +// Test harness for OverwriteFiles. +// +// Modeled on src/iceberg/test/merging_snapshot_update_test.cc (same fixture style, +// same in-memory mock FileIO / catalog setup, same DataFile / commit helpers). Unlike +// that file, OverwriteFiles is the production class with a private constructor, so the +// tests drive it exclusively through its public builder surface (AddFile / DeleteFile / +// OverwriteByRowFilter / ... / operation() / Validate() / Commit()) and observe its +// behavior through the public API: operation() classification, the committed snapshot +// summary, and the public Validate(...) entry point that the commit kernel invokes. +// +// The base table (TableMetadataV2ValidMinimal.json) has schema {x: long (id 1), +// y: long (id 2), z: long (id 3)} and a single partition spec (spec id 0) that +// partitions by identity(x). +// ===================================================================================== +class OverwriteFilesTest : public UpdateTestBase { + protected: + static void SetUpTestSuite() { avro::RegisterAll(); } + + std::string MetadataResource() const override { + return "TableMetadataV2ValidMinimal.json"; + } + + void SetUp() override { + UpdateTestBase::SetUp(); + + ICEBERG_UNWRAP_OR_FAIL(spec_, table_->spec()); + ICEBERG_UNWRAP_OR_FAIL(schema_, table_->schema()); + + file_a_ = MakeDataFile("/data/file_a.parquet", /*partition_x=*/1L); + file_b_ = MakeDataFile("/data/file_b.parquet", /*partition_x=*/2L); + } + + // A plain data file in spec 0 (identity(x)) with the given partition value for x. + std::shared_ptr MakeDataFile(const std::string& path, int64_t partition_x, + int64_t record_count = 100) { + auto f = std::make_shared(); + f->content = DataFile::Content::kData; + f->file_path = table_location_ + path; + f->file_format = FileFormatType::kParquet; + f->partition = PartitionValues(std::vector{Literal::Long(partition_x)}); + f->file_size_in_bytes = 1024; + f->record_count = record_count; + f->partition_spec_id = spec_->spec_id(); + return f; + } + + // A data file carrying column metrics for column y (field id 2): lower/upper bounds + // plus value/null counts, so the StrictMetricsEvaluator can reason about it. + std::shared_ptr MakeDataFileWithYBounds(const std::string& path, + int64_t partition_x, int64_t y_lower, + int64_t y_upper) { + auto f = MakeDataFile(path, partition_x); + f->lower_bounds = {{2, Literal::Long(y_lower).Serialize().value()}}; + f->upper_bounds = {{2, Literal::Long(y_upper).Serialize().value()}}; + f->value_counts = {{2, f->record_count}}; + f->null_value_counts = {{2, 0}}; + return f; + } + + std::shared_ptr MakeDeleteFile(const std::string& path, int64_t partition_x) { + auto f = MakeDataFile(path, partition_x); + f->content = DataFile::Content::kPositionDeletes; + return f; + } + + // An equality delete file in spec 0 (partition x = partition_x). + std::shared_ptr MakeEqualityDeleteFile(const std::string& path, + int64_t partition_x) { + auto f = MakeDeleteFile(path, partition_x); + f->content = DataFile::Content::kEqualityDeletes; + f->equality_ids = {1}; + return f; + } + + // Write a delete manifest containing the given delete files, with the snapshot id and + // sequence number assigned on each entry (so the manifest list writer does not need to + // inherit them). + Result WriteDeleteManifest( + const std::string& path, const std::vector>& files, + int64_t snapshot_id, int64_t sequence_number) { + ICEBERG_ASSIGN_OR_RAISE( + auto writer, + ManifestWriter::MakeWriter(/*format_version=*/2, snapshot_id, path, file_io_, + spec_, schema_, ManifestContent::kDeletes)); + for (const auto& f : files) { + ManifestEntry entry; + entry.status = ManifestStatus::kAdded; + entry.snapshot_id = snapshot_id; + entry.sequence_number = sequence_number; + entry.data_file = f; + ICEBERG_RETURN_UNEXPECTED(writer->WriteAddedEntry(entry)); + } + ICEBERG_RETURN_UNEXPECTED(writer->Close()); + return writer->ToManifestFile(); + } + + // Build a synthetic snapshot from the given manifests (mirrors the merging-update test + // harness). Used to inject a concurrent commit between read and validate. + Result> MakeSyntheticSnapshot( + std::string operation, int64_t snapshot_id, + std::optional parent_snapshot_id, int64_t sequence_number, + const std::vector& manifests) { + auto manifest_list_path = table_location_ + "/metadata/manifest-list-" + + std::to_string(snapshot_id) + ".avro"; + ICEBERG_ASSIGN_OR_RAISE( + auto writer, + ManifestListWriter::MakeWriter(table_->metadata()->format_version, snapshot_id, + parent_snapshot_id, manifest_list_path, file_io_, + sequence_number)); + ICEBERG_RETURN_UNEXPECTED(writer->AddAll(manifests)); + ICEBERG_RETURN_UNEXPECTED(writer->Close()); + + ICEBERG_ASSIGN_OR_RAISE( + auto snapshot, + Snapshot::Make(sequence_number, snapshot_id, parent_snapshot_id, TimePointMs{}, + std::move(operation), {}, table_->metadata()->current_schema_id, + manifest_list_path)); + return std::shared_ptr(std::move(snapshot)); + } + + // Inject a concurrent snapshot that adds an equality delete file covering partition + // `partition_x`, layered on top of `parent`. Returns the metadata containing the new + // snapshot (as current) plus the new snapshot itself, so a subsequent Validate(...) can + // scan the range (parent, new] for new deletes. + struct ConcurrentDelete { + std::shared_ptr metadata; + std::shared_ptr snapshot; + }; + ConcurrentDelete InjectConcurrentEqualityDelete(const std::shared_ptr& parent, + const std::string& delete_path, + int64_t partition_x) { + ConcurrentDelete out; + auto del_file = MakeEqualityDeleteFile(delete_path, partition_x); + const int64_t new_snapshot_id = parent->snapshot_id + 1000; + const int64_t new_sequence_number = parent->sequence_number + 1; + auto manifest = + WriteDeleteManifest(table_location_ + "/metadata/concurrent-del-manifest.avro", + {del_file}, new_snapshot_id, new_sequence_number); + EXPECT_TRUE(manifest.has_value()); + auto snap = MakeSyntheticSnapshot(DataOperation::kOverwrite, new_snapshot_id, + parent->snapshot_id, new_sequence_number, + {manifest.value()}); + if (!snap.has_value()) { + ADD_FAILURE() << "MakeSyntheticSnapshot failed: " << snap.error().message; + } + EXPECT_TRUE(snap.has_value()); + out.snapshot = snap.value(); + out.metadata = std::make_shared(*table_->metadata()); + out.metadata->snapshots.push_back(out.snapshot); + out.metadata->current_snapshot_id = out.snapshot->snapshot_id; + out.metadata->last_sequence_number = out.snapshot->sequence_number; + return out; + } + + Result> NewOverwrite() { + return table_->NewOverwrite(); + } + + // Commit file_a_ with FastAppend and refresh the table; returns its snapshot id. + int64_t CommitFileA() { + auto fa = table_->NewFastAppend(); + EXPECT_TRUE(fa.has_value()); + fa.value()->AppendFile(file_a_); + EXPECT_THAT(fa.value()->Commit(), IsOk()); + EXPECT_THAT(table_->Refresh(), IsOk()); + auto snap = table_->current_snapshot(); + EXPECT_TRUE(snap.has_value()); + return snap.value()->snapshot_id; + } + + // Append a single file via FastAppend and refresh; returns the new snapshot. + std::shared_ptr CommitFastAppend(const std::shared_ptr& file) { + auto fa = table_->NewFastAppend(); + EXPECT_TRUE(fa.has_value()); + fa.value()->AppendFile(file); + EXPECT_THAT(fa.value()->Commit(), IsOk()); + EXPECT_THAT(table_->Refresh(), IsOk()); + auto snap = table_->current_snapshot(); + EXPECT_TRUE(snap.has_value()); + return snap.value(); + } + + std::shared_ptr spec_; + std::shared_ptr schema_; + std::shared_ptr file_a_; + std::shared_ptr file_b_; +}; + +// ===================================================================================== +// 9.2 Entry-point and builder-method tests +// ===================================================================================== + +// Req 1.1: Table::NewOverwrite() returns a valid builder for a standalone operation. +TEST_F(OverwriteFilesTest, TableNewOverwriteReturnsBuilder) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + ASSERT_NE(op, nullptr); +} + +// Req 1.2, 1.3: Transaction::NewOverwrite() returns a valid builder registered with the +// transaction (commit deferred until the transaction commits). +TEST_F(OverwriteFilesTest, TransactionNewOverwriteReturnsBuilder) { + ICEBERG_UNWRAP_OR_FAIL(auto txn, Transaction::Make(table_, TransactionKind::kUpdate)); + ICEBERG_UNWRAP_OR_FAIL(auto op, txn->NewOverwrite()); + ASSERT_NE(op, nullptr); + + (*op).OverwriteByRowFilter(Expressions::Equal("x", Literal::Long(1L))).AddFile(file_a_); + + // Within a transaction, the builder is staged by its own Commit() before the + // transaction is committed. + EXPECT_THAT(op->Commit(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto committed, txn->Commit()); + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kOperation), + DataOperation::kOverwrite); +} + +// Builder methods all return *this and chain (Req 2.1-2.3, 3.1, 4.1-4.2, 6.1, 6.3, 6.4). +TEST_F(OverwriteFilesTest, BuilderMethodsReturnSelfAndChain) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + auto* self = op.get(); + + EXPECT_EQ(&op->AddFile(file_a_), self); + EXPECT_EQ(&op->DeleteFile(file_b_), self); + EXPECT_EQ(&op->OverwriteByRowFilter(Expressions::Equal("x", Literal::Long(1L))), self); + EXPECT_EQ(&op->ValidateFromSnapshot(42), self); + EXPECT_EQ(&op->ConflictDetectionFilter(Expressions::Equal("x", Literal::Long(1L))), + self); + EXPECT_EQ(&op->ValidateNoConflictingData(), self); + EXPECT_EQ(&op->ValidateNoConflictingDeletes(), self); + EXPECT_EQ(&op->ValidateAddedFilesMatchOverwriteFilter(), self); + EXPECT_EQ(&op->WithCaseSensitivity(false), self); + + // A single fluent chain compiles and returns the same instance. + OverwriteFiles& chained = + (*op) + .AddFile(MakeDataFile("/data/chain.parquet", 1L)) + .OverwriteByRowFilter(Expressions::Equal("x", Literal::Long(1L))) + .ValidateNoConflictingData(); + EXPECT_EQ(&chained, self); +} + +// ===================================================================================== +// 9.3 operation() truth-table tests (Req 5.1-5.4) +// ===================================================================================== + +TEST_F(OverwriteFilesTest, OperationAddOnlyIsAppend) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->AddFile(file_a_); + EXPECT_EQ(op->operation(), DataOperation::kAppend); +} + +TEST_F(OverwriteFilesTest, OperationDeleteOnlyIsDelete) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->DeleteFile(file_a_); + EXPECT_EQ(op->operation(), DataOperation::kDelete); +} + +TEST_F(OverwriteFilesTest, OperationAddAndDeleteIsOverwrite) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->DeleteFile(file_a_); + op->AddFile(file_b_); + EXPECT_EQ(op->operation(), DataOperation::kOverwrite); +} + +TEST_F(OverwriteFilesTest, OperationPureRowFilterAndAddIsOverwrite) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->OverwriteByRowFilter(Expressions::Equal("x", Literal::Long(1L))); + op->AddFile(file_b_); + EXPECT_EQ(op->operation(), DataOperation::kOverwrite); +} + +TEST_F(OverwriteFilesTest, OperationNeitherIsOverwrite) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + EXPECT_EQ(op->operation(), DataOperation::kOverwrite); +} + +// ===================================================================================== +// 9.4 Commit-path and snapshot-control tests (Req 11.2-11.5) +// ===================================================================================== + +// DeleteFile + AddFile commits as overwrite and the recorded operation matches +// operation() (Req 11.4). +TEST_F(OverwriteFilesTest, CommitDeleteAndAddIsOverwrite) { + CommitFileA(); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->DeleteFile(file_a_); + op->AddFile(file_b_); + const std::string expected_operation = op->operation(); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kOperation), expected_operation); + EXPECT_EQ(expected_operation, DataOperation::kOverwrite); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kAddedDataFiles), "1"); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kDeletedDataFiles), "1"); +} + +// OverwriteByRowFilter + AddFile commits as overwrite. +TEST_F(OverwriteFilesTest, CommitRowFilterAndAddIsOverwrite) { + CommitFileA(); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->OverwriteByRowFilter(Expressions::Equal("x", Literal::Long(1L))); + op->AddFile(MakeDataFile("/data/new_x1.parquet", 1L)); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kOperation), + DataOperation::kOverwrite); +} + +// An empty overwrite (no adds, no deletes) commits and records the overwrite operation. +TEST_F(OverwriteFilesTest, CommitEmptyOverwrite) { + CommitFileA(); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + EXPECT_EQ(op->operation(), DataOperation::kOverwrite); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kOperation), + DataOperation::kOverwrite); +} + +// Duplicate AddFile / DeleteFile are deduplicated by the underlying set types (Req 3.7). +TEST_F(OverwriteFilesTest, CommitDeduplicatesDuplicateAddAndDelete) { + CommitFileA(); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + auto add = MakeDataFile("/data/dup_add.parquet", 1L); + op->DeleteFile(file_a_); + op->DeleteFile(file_a_); // duplicate delete + op->AddFile(add); + op->AddFile(add); // duplicate add + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kAddedDataFiles), "1"); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kDeletedDataFiles), "1"); +} + +// StageOnly commits the snapshot without advancing the target branch (Req 11.2). +TEST_F(OverwriteFilesTest, CommitStageOnlyDoesNotAdvanceCurrentSnapshot) { + const int64_t base_snapshot_id = CommitFileA(); + const size_t base_snapshot_count = table_->metadata()->snapshots.size(); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->StageOnly(); + op->AddFile(file_b_); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + // The staged snapshot is recorded but the main branch still points at file_a's + // snapshot. + ICEBERG_UNWRAP_OR_FAIL(auto current, table_->current_snapshot()); + EXPECT_EQ(current->snapshot_id, base_snapshot_id); + EXPECT_GT(table_->metadata()->snapshots.size(), base_snapshot_count); +} + +// SetTargetBranch commits to a named branch (Req 11.2). +TEST_F(OverwriteFilesTest, CommitToTargetBranch) { + CommitFileA(); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->SetTargetBranch("audit"); + op->AddFile(file_b_); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + EXPECT_TRUE(table_->metadata()->refs.contains("audit")); +} + +// A custom Set(property, value) is carried into the committed snapshot summary +// (Req 11.2). +TEST_F(OverwriteFilesTest, CommitCustomSummaryProperty) { + CommitFileA(); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->Set("custom-prop", "custom-value"); + op->AddFile(file_b_); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + EXPECT_EQ(snapshot->summary.at("custom-prop"), "custom-value"); +} + +// ===================================================================================== +// 9.5 Bulk DeleteFiles tests (Req 3.3-3.7) +// ===================================================================================== + +// A DataFileSet + DeleteFileSet forwards data files to DeleteDataFile and delete files +// to DeleteDeleteFile; the committed snapshot reflects the data-file removal. (The +// delete file is forwarded to DeleteDeleteFile; with no matching committed delete file +// present its removal is a harmless no-op, mirroring the inherited missing-delete +// behavior.) +TEST_F(OverwriteFilesTest, BulkDeleteFilesRemovesDataAndDeleteFiles) { + // Seed the table with a data file. + { + ICEBERG_UNWRAP_OR_FAIL(auto seed, NewOverwrite()); + seed->AddFile(file_a_); + EXPECT_THAT(seed->Commit(), IsOk()); + EXPECT_THAT(table_->Refresh(), IsOk()); + } + + auto del_file = MakeDeleteFile("/delete/del_a.parquet", 1L); + + // Build the bulk delete: remove file_a (data) plus del_file (delete file). + DataFileSet data_files; + data_files.insert(file_a_); + DeleteFileSet delete_files; + delete_files.insert(del_file); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->DeleteFiles(data_files, delete_files); + op->AddFile(file_b_); + // Both a data file deletion and an add => overwrite. + EXPECT_EQ(op->operation(), DataOperation::kOverwrite); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kDeletedDataFiles), "1"); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kAddedDataFiles), "1"); +} + +// Empty sets are a no-op: with no adds or deletes the builder is a bare overwrite. +TEST_F(OverwriteFilesTest, BulkDeleteFilesEmptySetsAreNoOp) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->DeleteFiles(DataFileSet{}, DeleteFileSet{}); + EXPECT_EQ(op->operation(), DataOperation::kOverwrite); // neither adds nor deletes +} + +// A DataFileSet portion of DeleteFiles records data files such that DeletesDataFiles +// becomes true (observed via operation() == delete), equivalent to repeated DeleteFile. +TEST_F(OverwriteFilesTest, BulkDeleteFilesDataPortionMarksDelete) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + DataFileSet data_files; + data_files.insert(file_a_); + data_files.insert(file_b_); + op->DeleteFiles(data_files, DeleteFileSet{}); + EXPECT_EQ(op->operation(), DataOperation::kDelete); +} + +// DeleteFiles is equivalent to repeated DeleteFile for the data-file portion: both +// classify as delete and (after committing against a seeded table) remove the files. +TEST_F(OverwriteFilesTest, BulkDeleteFilesEquivalentToRepeatedDeleteFile) { + // Seed file_a and file_b. + { + ICEBERG_UNWRAP_OR_FAIL(auto seed, NewOverwrite()); + seed->AddFile(file_a_); + seed->AddFile(file_b_); + EXPECT_THAT(seed->Commit(), IsOk()); + EXPECT_THAT(table_->Refresh(), IsOk()); + } + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + DataFileSet data_files; + data_files.insert(file_a_); + data_files.insert(file_b_); + op->DeleteFiles(data_files, DeleteFileSet{}); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kDeletedDataFiles), "2"); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kTotalDataFiles), "0"); +} + +// ===================================================================================== +// 9.6 Concurrency-validation tests (Req 8.2, 8.3, 9.2-9.5; Properties 6, 7) +// ===================================================================================== + +// ValidateNoConflictingData: a competing FastAppend that added a data file matching the +// resolved conflict-detection filter makes the overwrite commit fail (Req 8.3). +TEST_F(OverwriteFilesTest, ValidateNoConflictingDataDetectsConflictingAdd) { + const int64_t first_id = CommitFileA(); + // Competing append of file_b (partition x=2) between read and commit. + CommitFastAppend(file_b_); + + // The overwrite targets the x=2 range, so the concurrent add of file_b conflicts. + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->OverwriteByRowFilter(Expressions::Equal("x", Literal::Long(2L))); + op->AddFile(MakeDataFile("/data/replacement_x2.parquet", 2L)); + op->ValidateFromSnapshot(first_id); + op->ValidateNoConflictingData(); + EXPECT_THAT(op->Commit(), IsError(ErrorKind::kValidationFailed)); +} + +// A non-conflicting concurrent change still commits, and the recorded operation matches +// operation() (Req 11.4, 11.5; Property 6). +TEST_F(OverwriteFilesTest, ValidateNoConflictingDataAllowsNonConflictingChange) { + const int64_t first_id = CommitFileA(); + // Competing append of file_b in partition x=2. + CommitFastAppend(file_b_); + + // The overwrite targets the x=1 range; the concurrent x=2 add does not conflict. + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->OverwriteByRowFilter(Expressions::Equal("x", Literal::Long(1L))); + op->AddFile(MakeDataFile("/data/replacement_x1.parquet", 1L)); + op->ValidateFromSnapshot(first_id); + op->ValidateNoConflictingData(); + op->ValidateNoConflictingDeletes(); + const std::string expected_operation = op->operation(); + EXPECT_THAT(op->Commit(), IsOk()); + + EXPECT_THAT(table_->Refresh(), IsOk()); + ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); + EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kOperation), expected_operation); +} + +// ValidateNoConflictingDeletes: a competing snapshot that deleted a data file in the +// overwrite range makes the commit fail (Req 9.2-9.4). +TEST_F(OverwriteFilesTest, ValidateNoConflictingDeletesDetectsConflictingDelete) { + const int64_t first_id = CommitFileA(); + + // Competing overwrite removes file_a (partition x=1) between read and commit. + { + ICEBERG_UNWRAP_OR_FAIL(auto competing, NewOverwrite()); + competing->DeleteFile(file_a_); + EXPECT_THAT(competing->Commit(), IsOk()); + EXPECT_THAT(table_->Refresh(), IsOk()); + } + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->OverwriteByRowFilter(Expressions::Equal("x", Literal::Long(1L))); + op->AddFile(MakeDataFile("/data/replacement_after_delete.parquet", 1L)); + op->ValidateFromSnapshot(first_id); + op->ValidateNoConflictingDeletes(); + EXPECT_THAT(op->Commit(), IsError(ErrorKind::kValidationFailed)); +} + +// ValidateNoConflictingDeletes allows a non-conflicting concurrent append to commit. +TEST_F(OverwriteFilesTest, ValidateNoConflictingDeletesAllowsNonConflictingChange) { + const int64_t first_id = CommitFileA(); + // Competing append in partition x=2 (no deletes in the x=1 range). + CommitFastAppend(file_b_); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->OverwriteByRowFilter(Expressions::Equal("x", Literal::Long(1L))); + op->AddFile(MakeDataFile("/data/replacement_no_conflict.parquet", 1L)); + op->ValidateFromSnapshot(first_id); + op->ValidateNoConflictingDeletes(); + EXPECT_THAT(op->Commit(), IsOk()); +} + +// ===================================================================================== +// Fix #1: the explicit replaced-files delete branch (Path B) honors the +// ConflictDetectionFilter. +// +// Path B fires when explicit data files were registered for replacement +// (deleted_data_files_ non-empty) under ValidateNoConflictingDeletes(). It now calls the +// STATIC data_filter overload of ValidateNoNewDeletesForDataFiles, passing the raw +// conflict_detection_filter_ (which may be nullptr = "no filter, consider all delete +// files"). These tests inject a concurrent equality-delete file that +// covers the replaced data file (file_a, partition x=1) and assert that: +// * with NO conflict filter, the concurrent delete is a conflict => FAIL; +// * with a conflict filter that does NOT cover x=1 (here x=2), the delete is filtered +// out of the conflict scope => SUCCEED. +// +// To exercise Path B in isolation, the builder uses explicit DeleteFile (no +// OverwriteByRowFilter), so RowFilter() stays AlwaysFalse and Path A is skipped. +// ===================================================================================== + +// No conflict filter => the concurrent delete on the replaced file is detected (Path B +// passes nullptr through, so all delete files are considered). +TEST_F(OverwriteFilesTest, PathBExplicitDeletesDetectsConcurrentDeleteWithoutFilter) { + CommitFileA(); + ICEBERG_UNWRAP_OR_FAIL(auto first_snapshot, table_->current_snapshot()); + + // Concurrent commit adds an equality delete covering file_a (partition x=1). + auto concurrent = InjectConcurrentEqualityDelete( + first_snapshot, "/delete/concurrent_x1.parquet", /*partition_x=*/1L); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->DeleteFile(file_a_); // explicit replacement => deleted_data_files_ non-empty + op->AddFile(MakeDataFile("/data/rewrite_x1.parquet", 1L)); + op->ValidateFromSnapshot(first_snapshot->snapshot_id); + op->ValidateNoConflictingDeletes(); + // No ConflictDetectionFilter => Path B considers all concurrent deletes => conflict. + EXPECT_THAT(op->Validate(*concurrent.metadata, concurrent.snapshot), + IsError(ErrorKind::kValidationFailed)); +} + +// A conflict filter that does not cover the replaced file's partition narrows the scope, +// so the concurrent delete is filtered out and validation succeeds. +TEST_F(OverwriteFilesTest, PathBExplicitDeletesConflictFilterNarrowsScope) { + CommitFileA(); + ICEBERG_UNWRAP_OR_FAIL(auto first_snapshot, table_->current_snapshot()); + + // Same concurrent equality delete covering file_a (partition x=1). + auto concurrent = InjectConcurrentEqualityDelete( + first_snapshot, "/delete/concurrent_x1.parquet", /*partition_x=*/1L); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->DeleteFile(file_a_); + op->AddFile(MakeDataFile("/data/rewrite_x1.parquet", 1L)); + op->ValidateFromSnapshot(first_snapshot->snapshot_id); + // Conflict filter targets x=2, which does NOT cover the x=1 delete => filtered out. + op->ConflictDetectionFilter(Expressions::Equal("x", Literal::Long(2L))); + op->ValidateNoConflictingDeletes(); + EXPECT_THAT(op->Validate(*concurrent.metadata, concurrent.snapshot), IsOk()); +} + +// ===================================================================================== +// 9.7 Strict added-file range validation tests (Req 10.1-10.6; Properties 9, 10) +// +// These exercise OverwriteFiles::Validate(...) directly (the same entry point the commit +// kernel invokes), which is sufficient and deterministic: the strict-range branch does +// not depend on concurrent snapshots. +// ===================================================================================== + +// Strict partition projection proves containment directly (Req 10.3). +TEST_F(OverwriteFilesTest, StrictRangeAcceptedByStrictProjection) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->OverwriteByRowFilter(Expressions::Equal("x", Literal::Long(1L))); + op->AddFile(MakeDataFile("/data/in_partition.parquet", 1L)); + op->ValidateAddedFilesMatchOverwriteFilter(); + EXPECT_THAT(op->Validate(*table_->metadata(), /*snapshot=*/nullptr), IsOk()); +} + +// Strict partition projection is insufficient (filter on a non-partition column) but the +// StrictMetricsEvaluator proves containment from the file's bounds (Req 10.3). +TEST_F(OverwriteFilesTest, StrictRangeAcceptedByStrictMetrics) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->OverwriteByRowFilter(Expressions::Equal("y", Literal::Long(5L))); + // y bounds [5, 5] => every row has y == 5, fully contained in the filter. + op->AddFile(MakeDataFileWithYBounds("/data/y_eq_5.parquet", 1L, 5L, 5L)); + op->ValidateAddedFilesMatchOverwriteFilter(); + EXPECT_THAT(op->Validate(*table_->metadata(), /*snapshot=*/nullptr), IsOk()); +} + +// Neither the strict projection nor the metrics can prove containment => fail (Req 10.5). +TEST_F(OverwriteFilesTest, StrictRangeRejectedWhenNotProvable) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->OverwriteByRowFilter(Expressions::Equal("y", Literal::Long(5L))); + // y bounds [1, 10] => not all rows are y == 5. + op->AddFile(MakeDataFileWithYBounds("/data/y_range.parquet", 1L, 1L, 10L)); + op->ValidateAddedFilesMatchOverwriteFilter(); + EXPECT_THAT(op->Validate(*table_->metadata(), /*snapshot=*/nullptr), + IsError(ErrorKind::kValidationFailed)); +} + +// A file whose partition falls outside the inclusive projection is rejected (Req 10.4). +TEST_F(OverwriteFilesTest, StrictRangeRejectsFileOutsidePartitionRange) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->OverwriteByRowFilter(Expressions::Equal("x", Literal::Long(1L))); + op->AddFile(MakeDataFile("/data/wrong_partition.parquet", /*partition_x=*/2L)); + op->ValidateAddedFilesMatchOverwriteFilter(); + EXPECT_THAT(op->Validate(*table_->metadata(), /*snapshot=*/nullptr), + IsError(ErrorKind::kValidationFailed)); +} + +// ValidateAddedFilesMatchOverwriteFilter without a row filter fails (Req 10.1, 10.2; +// Property 10). +TEST_F(OverwriteFilesTest, StrictRangeRequiresRowFilter) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->AddFile(MakeDataFile("/data/no_filter.parquet", 1L)); + op->ValidateAddedFilesMatchOverwriteFilter(); + EXPECT_THAT(op->Validate(*table_->metadata(), /*snapshot=*/nullptr), + IsError(ErrorKind::kValidationFailed)); +} + +// Fix #2: added files belonging to MORE THAN ONE partition spec are rejected, since the +// validation resolves a single spec via DataSpec() (which requires exactly one spec among +// added files). DataSpec() fails fast with a multi-spec error. +TEST_F(OverwriteFilesTest, StrictRangeRejectsMultiplePartitionSpecs) { + // Add a second partition spec (id 1) to the table metadata BEFORE creating the builder, + // so the producer's base metadata can resolve both specs when files are staged. + ICEBERG_UNWRAP_OR_FAIL( + auto spec1, PartitionSpec::Make(*schema_, /*spec_id=*/1, + {PartitionField(/*source_id=*/1, /*field_id=*/1001, + "x_v1", Transform::Identity())}, + /*allow_missing_fields=*/false)); + table_->metadata()->partition_specs.push_back( + std::shared_ptr(std::move(spec1))); + // Confirm both specs resolve. + ASSERT_THAT(table_->metadata()->PartitionSpecById(0), IsOk()); + ASSERT_THAT(table_->metadata()->PartitionSpecById(1), IsOk()); + + auto file_spec0 = MakeDataFile("/data/spec0_x1.parquet", 1L); // partition_spec_id 0 + auto file_spec1 = MakeDataFile("/data/spec1_x1.parquet", 1L); + file_spec1->partition_spec_id = 1; + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->OverwriteByRowFilter(Expressions::Equal("x", Literal::Long(1L))); + op->AddFile(file_spec0); + op->AddFile(file_spec1); + op->ValidateAddedFilesMatchOverwriteFilter(); + // DataSpec() rejects the two distinct specs with an InvalidArgument error. + EXPECT_THAT(op->Validate(*table_->metadata(), /*snapshot=*/nullptr), + IsError(ErrorKind::kInvalidArgument)); +} + +// Fix #3: enabling the strict added-file range validation with a row filter set but NO +// added data files (e.g. a pure overwrite-by-filter with no AddFile) fails, because +// DataSpec() rejects an empty added-files set. +TEST_F(OverwriteFilesTest, StrictRangeRejectsEmptyAddedFiles) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + // Row filter is set (precondition satisfied) but no AddFile was called. + op->OverwriteByRowFilter(Expressions::Equal("x", Literal::Long(1L))); + op->ValidateAddedFilesMatchOverwriteFilter(); + // DataSpec() raises InvalidArgument because no data file was added. + EXPECT_THAT(op->Validate(*table_->metadata(), /*snapshot=*/nullptr), + IsError(ErrorKind::kInvalidArgument)); +} + +// ===================================================================================== +// 9.8 Case-sensitivity and null-rejection tests (Req 6.4, 12.1-12.4; Property 4) +// ===================================================================================== + +// Case-insensitive binding accepts a differently-cased column name where the +// case-sensitive (default) binding rejects it. Observed through the strict-range +// validation, which binds the row filter using the configured case sensitivity. +TEST_F(OverwriteFilesTest, CaseSensitivityAffectsFilterBinding) { + // Case-sensitive (default): the filter references "X" which does not match column "x". + { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->OverwriteByRowFilter(Expressions::Equal("X", Literal::Long(1L))); + op->AddFile(MakeDataFile("/data/cs.parquet", 1L)); + op->ValidateAddedFilesMatchOverwriteFilter(); + // Binding "X" against schema {x, y, z} fails case-sensitively. + auto result = op->Validate(*table_->metadata(), /*snapshot=*/nullptr); + EXPECT_FALSE(result.has_value()); + } + // Case-insensitive: "X" binds to column "x" and the in-range file validates. + { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->WithCaseSensitivity(false); + op->OverwriteByRowFilter(Expressions::Equal("X", Literal::Long(1L))); + op->AddFile(MakeDataFile("/data/ci.parquet", 1L)); + op->ValidateAddedFilesMatchOverwriteFilter(); + EXPECT_THAT(op->Validate(*table_->metadata(), /*snapshot=*/nullptr), IsOk()); + } +} + +// Null arguments to the pointer-taking builder methods surface at Commit() without +// crashing (Req 12.1, 12.2; Property 4). The ErrorCollector aggregates deferred builder +// errors and surfaces them as a kValidationFailed error at Commit() that preserves the +// underlying invalid-argument message. +TEST_F(OverwriteFilesTest, NullAddFileRejectedAtCommit) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->AddFile(nullptr); + auto result = op->Commit(); + EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(result, HasErrorMessage("Invalid data file: null")); +} + +TEST_F(OverwriteFilesTest, NullDeleteFileRejectedAtCommit) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->DeleteFile(nullptr); + auto result = op->Commit(); + EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(result, HasErrorMessage("Invalid data file: null")); +} + +TEST_F(OverwriteFilesTest, NullOverwriteByRowFilterRejectedAtCommit) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->OverwriteByRowFilter(nullptr); + auto result = op->Commit(); + EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(result, HasErrorMessage("Invalid row filter expression: null")); +} + +TEST_F(OverwriteFilesTest, NullConflictDetectionFilterRejectedAtCommit) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->ConflictDetectionFilter(nullptr); + auto result = op->Commit(); + EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(result, HasErrorMessage("Invalid conflict detection filter: null")); +} + +// The builder chain continues without crashing after a null argument is recorded. +TEST_F(OverwriteFilesTest, NullArgumentDoesNotCrashBuilderChain) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + (*op).AddFile(nullptr).AddFile(file_a_).OverwriteByRowFilter( + Expressions::Equal("x", Literal::Long(1L))); + // The recorded error still surfaces at commit. + auto result = op->Commit(); + EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(result, HasErrorMessage("Invalid data file: null")); +} + +// A null element cannot enter a DataFileSet / DeleteFileSet (insert() rejects nullptr), +// so DeleteFiles({null...}, {null...}) is a no-op rather than an error. The deferred +// null-element rejection inside DeleteFiles is therefore a defensive guard that is +// unreachable through the public set API; this test documents that observable behavior. +TEST_F(OverwriteFilesTest, DeleteFilesNullElementsCannotEnterSets) { + DataFileSet data_files; + data_files.insert(std::shared_ptr{nullptr}); + DeleteFileSet delete_files; + delete_files.insert(std::shared_ptr{nullptr}); + EXPECT_TRUE(data_files.empty()); + EXPECT_TRUE(delete_files.empty()); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->DeleteFiles(data_files, delete_files); + // No data files were deleted => still a bare overwrite, and commit succeeds. + EXPECT_EQ(op->operation(), DataOperation::kOverwrite); + EXPECT_THAT(op->Commit(), IsOk()); +} + +// ValidateFromSnapshot accepts a non-negative id (0 is in the generated id range +// [0, INT64_MAX]) and rejects a negative id (including kInvalidSnapshotId == -1) as a +// deferred error surfaced at Commit(). Req 6.1. +TEST_F(OverwriteFilesTest, ValidateFromSnapshotRejectsNegativeSnapshotId) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->AddFile(file_a_).ValidateFromSnapshot(-1); + auto result = op->Commit(); + EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(result, HasErrorMessage("Invalid snapshot id")); +} + +TEST_F(OverwriteFilesTest, ValidateFromSnapshotAcceptsZeroSnapshotId) { + // 0 is a legal generated snapshot id (the generator masks with int64_t::max()), so it + // must not be rejected at the builder stage. With no concurrency validation enabled, + // the starting id is merely recorded and the commit succeeds. + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->AddFile(file_a_).ValidateFromSnapshot(0); + EXPECT_THAT(op->Commit(), IsOk()); +} + +// ===================================================================================== +// Property-style tests (parameterized via loops; no PBT library is available). +// ===================================================================================== + +// Property 2 (builder forwarding) + Property 3 (delete dual-tracking), Task 2.4. +// +// AddedDataFiles() / deleted_data_files_ are not publicly observable, so the properties +// are checked indirectly through operation(): for any non-null file, AddFile-only yields +// `append` (the file was added and the row filter is untouched), while DeleteFile-only +// and DeleteFiles-only (data portion) yield `delete` (the file was registered for +// deletion and DeletesDataFiles() became true). Validates Req 2.1, 2.2, 3.1-3.5. +TEST_F(OverwriteFilesTest, PropertyBuilderForwardingAndDualTracking) { + const std::vector> files = { + MakeDataFile("/data/p0.parquet", 1L), + MakeDataFile("/data/p1.parquet", 2L), + MakeDataFile("/data/p2.parquet", 3L), + MakeDataFile("/data/p3.parquet", 7L), + }; + + for (const auto& file : files) { + { + // AddFile preserves "no deletes" => append, proving the row filter is unchanged. + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->AddFile(file); + EXPECT_EQ(op->operation(), DataOperation::kAppend) << file->file_path; + } + { + // DeleteFile dual-tracks => DeletesDataFiles() true, no adds => delete. + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->DeleteFile(file); + EXPECT_EQ(op->operation(), DataOperation::kDelete) << file->file_path; + } + { + // Bulk DeleteFiles data portion behaves like repeated DeleteFile. + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + DataFileSet data_files; + data_files.insert(file); + op->DeleteFiles(data_files, DeleteFileSet{}); + EXPECT_EQ(op->operation(), DataOperation::kDelete) << file->file_path; + } + } +} + +// Property 4 (null rejection), Task 2.5: every pointer-taking builder mutator records a +// deferred error that surfaces as an InvalidArgument-class error at Commit() without +// crashing. Validates Req 12.1, 12.2, 12.3, 12.4. +TEST_F(OverwriteFilesTest, PropertyNullArgumentRejection) { + using Mutator = std::function; + const std::vector mutators = { + [](OverwriteFiles& op) { op.AddFile(nullptr); }, + [](OverwriteFiles& op) { op.DeleteFile(nullptr); }, + [](OverwriteFiles& op) { op.OverwriteByRowFilter(nullptr); }, + [](OverwriteFiles& op) { op.ConflictDetectionFilter(nullptr); }, + }; + + for (const auto& mutate : mutators) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + mutate(*op); + // Deferred builder errors are aggregated and surfaced as kValidationFailed at commit. + EXPECT_THAT(op->Commit(), IsError(ErrorKind::kValidationFailed)); + } +} + +// Property 1 (operation() reflects content), Task 3.2: exhaustive truth table over +// {adds, deletes (explicit or via row filter)}. Validates Req 5.1-5.4. +TEST_F(OverwriteFilesTest, PropertyOperationTruthTable) { + struct Case { + bool add; + bool delete_file; + bool row_filter; + std::string expected; + }; + const std::vector cases = { + {/*add=*/true, /*delete_file=*/false, /*row_filter=*/false, DataOperation::kAppend}, + {false, true, false, DataOperation::kDelete}, + {false, false, true, DataOperation::kDelete}, // row filter counts as a delete + {true, true, false, DataOperation::kOverwrite}, + {true, false, true, DataOperation::kOverwrite}, + {false, true, true, DataOperation::kDelete}, // deletes only (no adds) => delete + {false, false, false, DataOperation::kOverwrite}, // neither + }; + + int index = 0; + for (const auto& c : cases) { + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + if (c.add) { + op->AddFile(MakeDataFile("/data/tt_add" + std::to_string(index) + ".parquet", 1L)); + } + if (c.delete_file) { + op->DeleteFile( + MakeDataFile("/data/tt_del" + std::to_string(index) + ".parquet", 1L)); + } + if (c.row_filter) { + op->OverwriteByRowFilter(Expressions::Equal("x", Literal::Long(1L))); + } + EXPECT_EQ(op->operation(), c.expected) << "case index " << index; + ++index; + } +} + +// Property 5 (conflict filter resolution), Task 4.4. +// +// DataConflictDetectionFilter() is private, so its three resolution outcomes are observed +// indirectly through ValidateNoConflictingData against a competing concurrent add of +// file_b (partition x=2): +// * explicit filter set -> the explicit filter is used (overrides the row filter); +// * row filter only -> the row filter is used; +// * file-replacement present -> AlwaysTrue (any concurrent add conflicts). +// Validates Req 7.1, 7.2, 7.3. +TEST_F(OverwriteFilesTest, PropertyConflictFilterResolution) { + // Resolution case 2 (row filter only): row filter x=1 does NOT match the concurrent + // x=2 add -> no conflict. + { + const int64_t first_id = CommitFileA(); + CommitFastAppend(file_b_); + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->OverwriteByRowFilter(Expressions::Equal("x", Literal::Long(1L))); + op->AddFile(MakeDataFile("/data/r2_ok.parquet", 1L)); + op->ValidateFromSnapshot(first_id); + op->ValidateNoConflictingData(); + EXPECT_THAT(op->Validate(*table_->metadata(), table_->current_snapshot().value()), + IsOk()); + } + // Resolution case 2 (row filter only): row filter x=2 DOES match -> conflict, proving + // the row filter (not AlwaysFalse / AlwaysTrue blindly) is what was used. + { + SetUp(); // fresh table + const int64_t first_id = CommitFileA(); + CommitFastAppend(file_b_); + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->OverwriteByRowFilter(Expressions::Equal("x", Literal::Long(2L))); + op->AddFile(MakeDataFile("/data/r2_conflict.parquet", 2L)); + op->ValidateFromSnapshot(first_id); + op->ValidateNoConflictingData(); + EXPECT_THAT(op->Validate(*table_->metadata(), table_->current_snapshot().value()), + IsError(ErrorKind::kValidationFailed)); + } + // Resolution case 1 (explicit filter overrides row filter): row filter x=1 would NOT + // conflict, but the explicit conflict filter x=2 does -> conflict. + { + SetUp(); + const int64_t first_id = CommitFileA(); + CommitFastAppend(file_b_); + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->OverwriteByRowFilter(Expressions::Equal("x", Literal::Long(1L))); + op->ConflictDetectionFilter(Expressions::Equal("x", Literal::Long(2L))); + op->AddFile(MakeDataFile("/data/r1.parquet", 1L)); + op->ValidateFromSnapshot(first_id); + op->ValidateNoConflictingData(); + EXPECT_THAT(op->Validate(*table_->metadata(), table_->current_snapshot().value()), + IsError(ErrorKind::kValidationFailed)); + } + // Resolution case 3 (file replacement -> AlwaysTrue): with an explicit DeleteFile and + // no explicit conflict filter, ANY concurrent add conflicts (here file_b in x=2, even + // though the row filter is x=1). + { + SetUp(); + const int64_t first_id = CommitFileA(); + CommitFastAppend(file_b_); + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->OverwriteByRowFilter(Expressions::Equal("x", Literal::Long(1L))); + op->DeleteFile(file_a_); // makes deleted_data_files_ non-empty => AlwaysTrue + op->AddFile(MakeDataFile("/data/r3.parquet", 1L)); + op->ValidateFromSnapshot(first_id); + op->ValidateNoConflictingData(); + EXPECT_THAT(op->Validate(*table_->metadata(), table_->current_snapshot().value()), + IsError(ErrorKind::kValidationFailed)); + } +} + +} // namespace iceberg diff --git a/src/iceberg/transaction.cc b/src/iceberg/transaction.cc index e911a61dc..169e7ec90 100644 --- a/src/iceberg/transaction.cc +++ b/src/iceberg/transaction.cc @@ -36,6 +36,7 @@ #include "iceberg/update/expire_snapshots.h" #include "iceberg/update/fast_append.h" #include "iceberg/update/merge_append.h" +#include "iceberg/update/overwrite_files.h" #include "iceberg/update/pending_update.h" #include "iceberg/update/row_delta.h" #include "iceberg/update/set_snapshot.h" @@ -513,6 +514,13 @@ Result> Transaction::NewRowDelta() { return row_delta; } +Result> Transaction::NewOverwrite() { + ICEBERG_ASSIGN_OR_RAISE(std::shared_ptr overwrite, + OverwriteFiles::Make(ctx_->table->name().name, ctx_)); + ICEBERG_RETURN_UNEXPECTED(AddUpdate(overwrite)); + return overwrite; +} + Result> Transaction::NewUpdateStatistics() { ICEBERG_ASSIGN_OR_RAISE(std::shared_ptr update_statistics, UpdateStatistics::Make(ctx_)); diff --git a/src/iceberg/transaction.h b/src/iceberg/transaction.h index 34ca78bd7..49b607d60 100644 --- a/src/iceberg/transaction.h +++ b/src/iceberg/transaction.h @@ -115,6 +115,9 @@ class ICEBERG_EXPORT Transaction : public std::enable_shared_from_this> NewRowDelta(); + /// \brief Create a new OverwriteFiles to overwrite data files and commit the changes. + Result> NewOverwrite(); + /// \brief Create a new SnapshotManager to manage snapshots. Result> NewSnapshotManager(); diff --git a/src/iceberg/type_fwd.h b/src/iceberg/type_fwd.h index f29bc4a1a..784b3e03b 100644 --- a/src/iceberg/type_fwd.h +++ b/src/iceberg/type_fwd.h @@ -242,6 +242,7 @@ class DeleteFiles; class ExpireSnapshots; class FastAppend; class MergeAppend; +class OverwriteFiles; class PendingUpdate; class RowDelta; class SetSnapshot; diff --git a/src/iceberg/update/meson.build b/src/iceberg/update/meson.build index 4f594a06e..4ba4168d4 100644 --- a/src/iceberg/update/meson.build +++ b/src/iceberg/update/meson.build @@ -22,6 +22,7 @@ install_headers( 'fast_append.h', 'merge_append.h', 'merging_snapshot_update.h', + 'overwrite_files.h', 'pending_update.h', 'row_delta.h', 'set_snapshot.h', diff --git a/src/iceberg/update/overwrite_files.cc b/src/iceberg/update/overwrite_files.cc new file mode 100644 index 000000000..165d43005 --- /dev/null +++ b/src/iceberg/update/overwrite_files.cc @@ -0,0 +1,352 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/update/overwrite_files.h" + +#include + +#include "iceberg/expression/binder.h" +#include "iceberg/expression/evaluator.h" +#include "iceberg/expression/expressions.h" +#include "iceberg/expression/projections.h" +#include "iceberg/expression/strict_metrics_evaluator.h" +#include "iceberg/manifest/manifest_entry.h" +#include "iceberg/partition_spec.h" +#include "iceberg/schema.h" +#include "iceberg/schema_field.h" +#include "iceberg/snapshot.h" +#include "iceberg/table.h" +#include "iceberg/table_metadata.h" +#include "iceberg/transaction.h" +#include "iceberg/type.h" +#include "iceberg/util/error_collector.h" +#include "iceberg/util/macros.h" + +namespace iceberg { + +Result> OverwriteFiles::Make( + std::string table_name, std::shared_ptr ctx) { + ICEBERG_PRECHECK(!table_name.empty(), "Table name cannot be empty"); + ICEBERG_PRECHECK(ctx != nullptr, "Cannot create OverwriteFiles without a context"); + return std::shared_ptr( + new OverwriteFiles(std::move(table_name), std::move(ctx))); +} + +OverwriteFiles::OverwriteFiles(std::string table_name, + std::shared_ptr ctx) + : MergingSnapshotUpdate(std::move(table_name), std::move(ctx)) {} + +OverwriteFiles::~OverwriteFiles() = default; + +OverwriteFiles& OverwriteFiles::AddFile(const std::shared_ptr& file) { + ICEBERG_BUILDER_CHECK(file != nullptr, "Invalid data file: null"); + // Forward to the inherited staging path. Any error (e.g. missing partition spec + // ID) is captured by the ErrorCollector and surfaced at Commit() rather than + // dropped. RowFilter() and deleted_data_files_ are intentionally left unchanged. + ICEBERG_BUILDER_RETURN_IF_ERROR(AddDataFile(file)); + return *this; +} + +OverwriteFiles& OverwriteFiles::DeleteFile(const std::shared_ptr& file) { + ICEBERG_BUILDER_CHECK(file != nullptr, "Invalid data file: null"); + // Dual-track: record the file for delete-conflict validation AND register it for + // removal via the inherited pipeline. + deleted_data_files_.insert(file); + ICEBERG_BUILDER_RETURN_IF_ERROR(DeleteDataFile(file)); + return *this; +} + +OverwriteFiles& OverwriteFiles::DeleteFiles(const DataFileSet& data_files_to_delete, + const DeleteFileSet& delete_files_to_delete) { + // Bulk equivalent of repeated DeleteFile(...) plus explicit delete-file removal. Empty + // sets are no-ops; the set types handle deduplication of repeated entries. + for (const auto& file : data_files_to_delete) { + ICEBERG_BUILDER_CHECK(file != nullptr, "Invalid data file: null"); + // Dual-track: record for delete-conflict validation AND register for removal. + deleted_data_files_.insert(file); + ICEBERG_BUILDER_RETURN_IF_ERROR(DeleteDataFile(file)); + } + for (const auto& file : delete_files_to_delete) { + ICEBERG_BUILDER_CHECK(file != nullptr, "Invalid delete file: null"); + ICEBERG_BUILDER_RETURN_IF_ERROR(DeleteDeleteFile(file)); + } + return *this; +} + +OverwriteFiles& OverwriteFiles::OverwriteByRowFilter(std::shared_ptr expr) { + ICEBERG_BUILDER_CHECK(expr != nullptr, "Invalid row filter expression: null"); + // Forward to the inherited filter path: this sets RowFilter() and forwards the + // expression to both the data and delete filter managers. Any error is captured by the + // ErrorCollector and surfaced at Commit(). + ICEBERG_BUILDER_RETURN_IF_ERROR(DeleteByRowFilter(std::move(expr))); + return *this; +} + +OverwriteFiles& OverwriteFiles::ValidateFromSnapshot(int64_t snapshot_id) { + // Snapshot ids are non-negative: SnapshotUtil::GenerateSnapshotId() masks with + // int64_t::max() and so yields a value in [0, INT64_MAX], while kInvalidSnapshotId is + // -1. A negative id therefore can never identify a real snapshot and is rejected via + // the deferred ErrorCollector (surfaced at Commit()); 0 is accepted because it is in + // the generator's range. A negative id is always a misuse, so we keep an early, + // clearer guard against the obviously-invalid sentinel/negative values. + ICEBERG_BUILDER_CHECK(snapshot_id >= 0, "Invalid snapshot id: {}", snapshot_id); + starting_snapshot_id_ = snapshot_id; + return *this; +} + +OverwriteFiles& OverwriteFiles::ConflictDetectionFilter( + std::shared_ptr expr) { + ICEBERG_BUILDER_CHECK(expr != nullptr, "Invalid conflict detection filter: null"); + conflict_detection_filter_ = std::move(expr); + return *this; +} + +OverwriteFiles& OverwriteFiles::WithCaseSensitivity(bool case_sensitive) { + // Forward to the protected MergingSnapshotUpdate::CaseSensitive(bool) setter (which + // returns void); the public name differs to avoid the public/protected name clash and + // keep a fluent return type. + CaseSensitive(case_sensitive); + return *this; +} + +OverwriteFiles& OverwriteFiles::ValidateNoConflictingData() { + // Enable concurrent-data validation and require every explicitly-deleted old file to + // be hit during manifest filtering. + validate_no_conflicting_data_ = true; + FailMissingDeletePaths(); + return *this; +} + +OverwriteFiles& OverwriteFiles::ValidateNoConflictingDeletes() { + // Enable concurrent-delete validation and require every explicitly-deleted old file to + // be hit during manifest filtering. + validate_no_conflicting_deletes_ = true; + FailMissingDeletePaths(); + return *this; +} + +OverwriteFiles& OverwriteFiles::ValidateAddedFilesMatchOverwriteFilter() { + // Enable strict validation that every added data file is fully contained in the + // overwrite range. The precondition (a row filter must be set) is enforced in + // Validate(...). + validate_added_files_match_overwrite_filter_ = true; + return *this; +} + +// Classify the snapshot operation from the current builder content. DeletesDataFiles() +// is true for both an explicit DeleteFile(...) and a pure OverwriteByRowFilter(...) (the +// latter via ManifestFilterManager::ContainsDeletes()), so OverwriteByRowFilter + AddFile +// correctly classifies as overwrite without any RowFilter()-based fallback. +std::string OverwriteFiles::operation() { + if (DeletesDataFiles() && !AddsDataFiles()) { + return DataOperation::kDelete; + } + if (AddsDataFiles() && !DeletesDataFiles()) { + return DataOperation::kAppend; + } + return DataOperation::kOverwrite; +} + +// Select the conflict-detection filter. Precedence: (1) the explicitly-set +// conflict_detection_filter_; otherwise +// (2) the row filter when it is set and not AlwaysFalse and no explicit data files were +// registered for deletion (the pure overwriteByRowFilter case, where the row filter +// already describes the conflicting range); otherwise (3) the conservative AlwaysTrue, +// under which any newly-added data file counts as a conflict (file-replacement or mixed +// mode). RowFilter() defaults to the AlwaysFalse singleton when unset, so comparing +// against the Expressions::AlwaysFalse() singleton by pointer identity (consistent with +// how the filter managers track delete_expr_) correctly treats the unset case as "no +// row filter". +std::shared_ptr OverwriteFiles::DataConflictDetectionFilter() const { + if (conflict_detection_filter_ != nullptr) { + return conflict_detection_filter_; + } + if (RowFilter() != nullptr && RowFilter() != Expressions::AlwaysFalse() && + deleted_data_files_.empty()) { + return RowFilter(); + } + return Expressions::AlwaysTrue(); +} + +// Run the enabled overwrite-specific concurrency checks before commit. The branches are +// sequential and independent. The first failing check aborts the commit; on success we +// return OK and let the inherited Apply(...) build the snapshot. +Status OverwriteFiles::Validate(const TableMetadata& current_metadata, + const std::shared_ptr& snapshot) { + // 0. Strict added-file range precondition: when + // ValidateAddedFilesMatchOverwriteFilter() + // is enabled, a row filter is required to define the overwrite range. RowFilter() + // defaults to the AlwaysFalse singleton when unset, so an unset or AlwaysFalse + // filter means there is no range to validate against and the commit must fail. + if (validate_added_files_match_overwrite_filter_) { + if (RowFilter() == nullptr || RowFilter() == Expressions::AlwaysFalse()) { + return ValidationFailed( + "Cannot validate added files match overwrite filter: row filter is not set"); + } + ICEBERG_RETURN_UNEXPECTED( + ValidateAddedFilesMatchOverwriteFilterImpl(current_metadata)); + } + + // 1. Concurrent newly-added data files: fail if any snapshot after the starting point + // added a data file matching the resolved conflict-detection filter. + if (validate_no_conflicting_data_) { + ICEBERG_RETURN_UNEXPECTED(ValidateAddedDataFiles( + current_metadata, starting_snapshot_id_, DataConflictDetectionFilter(), snapshot, + ctx_->table->io(), IsCaseSensitive())); + } + + // 2. Concurrent deletes: the two sub-checks are independent and both may fire. + if (validate_no_conflicting_deletes_) { + // Path A: a row filter is in play (set and not the AlwaysFalse singleton). Fail if a + // new delete file matches the range, or if a data file in the range was concurrently + // removed. Prefer the explicit conflict_detection_filter_ when set, else the row + // filter that describes the overwrite range. + if (RowFilter() != nullptr && RowFilter() != Expressions::AlwaysFalse()) { + auto delete_filter = conflict_detection_filter_ != nullptr + ? conflict_detection_filter_ + : RowFilter(); + ICEBERG_RETURN_UNEXPECTED( + ValidateNoNewDeleteFiles(current_metadata, starting_snapshot_id_, delete_filter, + snapshot, ctx_->table->io(), IsCaseSensitive())); + ICEBERG_RETURN_UNEXPECTED( + ValidateDeletedDataFiles(current_metadata, starting_snapshot_id_, delete_filter, + snapshot, ctx_->table->io(), IsCaseSensitive())); + } + + // Path B: explicit old data files were registered for replacement. Fail if a new + // delete file matching the conflict-detection filter covers any of them. Use the + // STATIC data_filter overload, passing the raw conflict_detection_filter_ (which may + // be nullptr — a nullptr filter means no filter, so all delete files are considered). + // The 3rd argument is an Expression, so this overload is unambiguous and needs no + // member-function-pointer cast. + if (!deleted_data_files_.empty()) { + ICEBERG_RETURN_UNEXPECTED(ValidateNoNewDeletesForDataFiles( + current_metadata, starting_snapshot_id_, conflict_detection_filter_, + deleted_data_files_, snapshot, ctx_->table->io(), IsCaseSensitive())); + } + } + + return {}; +} + +// Every added data file must be fully contained in the overwrite range defined by +// RowFilter(). The validation resolves a single partition spec via DataSpec() and then, +// for each added file, an inclusive partition projection provides a fast negative (reject +// when the file's partition cannot possibly fall in range), a strict partition projection +// provides a fast positive (accept when the whole partition is guaranteed in range), and +// a file-level StrictMetricsEvaluator provides the fallback proof. Metrics that are +// missing or insufficient yield a non-matching result and therefore fail validation +// (conservative: never silently accept an unprovable file). +// +// DataSpec() returns InvalidArgument when zero data files were added (empty-added-files +// is rejected) and when more than one partition spec is represented among the added files +// (multi-spec is rejected). DataSpec() reads the spec from the producer's base metadata; +// the projections use the table schema from metadata.Schema(). +// +// Implementation notes: +// * `Evaluator::Make` takes a `Schema`, and the partition type is obtained via +// `PartitionSpec::PartitionType(schema)` (which needs the table schema and returns a +// `StructType`). We therefore wrap the partition `StructType`'s fields in a `Schema` +// and bind the projected expression against it. +// * `StrictMetricsEvaluator::Make` binds its expression internally, and `Binder::Bind` +// rejects an already-bound predicate. So the bound filter is used only for the +// projections (`ProjectionEvaluator::Project` requires a bound expression), while the +// UNBOUND `RowFilter()` is handed to `StrictMetricsEvaluator::Make`. +Status OverwriteFiles::ValidateAddedFilesMatchOverwriteFilterImpl( + const TableMetadata& metadata) { + ICEBERG_ASSIGN_OR_RAISE(auto schema, metadata.Schema()); + + // Single spec for all added files. This rejects both the empty-added-files case and + // the multi-spec case via InvalidArgument. + ICEBERG_ASSIGN_OR_RAISE(auto spec, DataSpec()); + + // Bind the row filter once for projection (Project requires a bound expression); build + // the strict metrics evaluator once over the UNBOUND row filter (it binds internally). + ICEBERG_ASSIGN_OR_RAISE(auto bound_filter, + Binder::Bind(*schema, RowFilter(), IsCaseSensitive())); + ICEBERG_ASSIGN_OR_RAISE( + auto strict_metrics_evaluator, + StrictMetricsEvaluator::Make(RowFilter(), schema, IsCaseSensitive())); + + // Build ONE inclusive and ONE strict partition-value evaluator from the single spec. + // The Evaluators bind on construction and do not retain the partition schema, so the + // local schema below may safely go out of scope. + ICEBERG_ASSIGN_OR_RAISE(auto partition_type, spec->PartitionType(*schema)); + auto partition_fields = partition_type->fields(); + Schema partition_schema( + std::vector(partition_fields.begin(), partition_fields.end())); + + // Inclusive projection (fast negative): project the bound filter into the partition + // space and build an evaluator over the spec's partition values. + auto inclusive_projection = Projections::Inclusive(*spec, *schema, IsCaseSensitive()); + ICEBERG_ASSIGN_OR_RAISE(auto inclusive_expr, + inclusive_projection->Project(bound_filter)); + ICEBERG_ASSIGN_OR_RAISE( + auto inclusive_evaluator, + Evaluator::Make(partition_schema, inclusive_expr, IsCaseSensitive())); + + // Strict projection (fast positive). + auto strict_projection = Projections::Strict(*spec, *schema, IsCaseSensitive()); + ICEBERG_ASSIGN_OR_RAISE(auto strict_expr, strict_projection->Project(bound_filter)); + ICEBERG_ASSIGN_OR_RAISE( + auto strict_evaluator, + Evaluator::Make(partition_schema, strict_expr, IsCaseSensitive())); + + for (const auto& file : AddedDataFiles()) { + if (file == nullptr) { + return ValidationFailed( + "Cannot validate added files match overwrite filter: null data file"); + } + + // Step 1: inclusive projection — fast negative. If the file's partition cannot match + // the (inclusively projected) filter, the file is definitively outside the range. + ICEBERG_ASSIGN_OR_RAISE(bool inclusive_match, + inclusive_evaluator->Evaluate(file->partition)); + if (!inclusive_match) { + return ValidationFailed( + "Cannot commit file {}: added file does not match overwrite filter (outside " + "the overwrite range)", + file->file_path); + } + + // Step 2: strict projection — fast positive. If the whole partition is guaranteed in + // range, the file is accepted without inspecting metrics. + ICEBERG_ASSIGN_OR_RAISE(bool strict_match, + strict_evaluator->Evaluate(file->partition)); + if (strict_match) { + continue; + } + + // Step 3: file-level proof via strict metrics. A false result — including the case + // where metrics are missing or insufficient — fails validation conservatively rather + // than silently accepting a file that cannot be proved fully in range. + ICEBERG_ASSIGN_OR_RAISE(bool metrics_match, + strict_metrics_evaluator->Evaluate(*file)); + if (!metrics_match) { + return ValidationFailed( + "Cannot commit file {}: added file is not fully contained in the overwrite " + "filter", + file->file_path); + } + } + + return {}; +} + +} // namespace iceberg diff --git a/src/iceberg/update/overwrite_files.h b/src/iceberg/update/overwrite_files.h new file mode 100644 index 000000000..59f25914d --- /dev/null +++ b/src/iceberg/update/overwrite_files.h @@ -0,0 +1,152 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +/// \file iceberg/update/overwrite_files.h + +#include +#include +#include +#include + +#include "iceberg/iceberg_export.h" +#include "iceberg/result.h" +#include "iceberg/type_fwd.h" +#include "iceberg/update/merging_snapshot_update.h" +#include "iceberg/util/data_file_set.h" + +namespace iceberg { + +/// \brief Overwrite data files in a table. +/// +/// OverwriteFiles expresses a logical overwrite transaction: either explicit file +/// replacement (`DeleteFile(old)` + `AddFile(new)`) or range-based replacement +/// (`OverwriteByRowFilter(expr)` + `AddFile(new...)`). It is a concrete subclass of +/// MergingSnapshotUpdate that adds only the overwrite-specific builder surface and the +/// overwrite-specific `Validate(...)` logic, reusing the inherited filter → write → +/// merge pipeline, summary building, commit retry, and cleanup unchanged. +class ICEBERG_EXPORT OverwriteFiles : public MergingSnapshotUpdate { + public: + /// \brief Create a new OverwriteFiles instance. + /// + /// Mirrors FastAppend::Make: validates inputs and returns a heap instance. + /// + /// \param table_name The name of the table + /// \param ctx The transaction context to use for this update + /// \return A Result containing the OverwriteFiles instance or an error + static Result> Make( + std::string table_name, std::shared_ptr ctx); + + ~OverwriteFiles() override; + + // --- Overwrite builder surface (chained, ErrorCollector-deferred) --- + + /// \brief Add a new data file to the overwrite. + /// + /// \param file The data file to add + /// \return Reference to this for method chaining + OverwriteFiles& AddFile(const std::shared_ptr& file); + + /// \brief Delete an existing data file as part of the overwrite. + /// + /// \param file The data file to delete + /// \return Reference to this for method chaining + OverwriteFiles& DeleteFile(const std::shared_ptr& file); + + /// \brief Bulk equivalent of repeated DeleteFile(...) plus explicit delete-file + /// removal. + /// + /// \param data_files_to_delete The data files to delete + /// \param delete_files_to_delete The delete files to remove + /// \return Reference to this for method chaining + OverwriteFiles& DeleteFiles(const DataFileSet& data_files_to_delete, + const DeleteFileSet& delete_files_to_delete); + + /// \brief Overwrite all rows matching the given expression. + /// + /// \param expr The row filter expression defining the overwrite range + /// \return Reference to this for method chaining + OverwriteFiles& OverwriteByRowFilter(std::shared_ptr expr); + + // --- Concurrency / correctness controls --- + + /// \brief Set the lower bound snapshot id for concurrency scans. + /// + /// \param snapshot_id The starting snapshot id + /// \return Reference to this for method chaining + OverwriteFiles& ValidateFromSnapshot(int64_t snapshot_id); + + /// \brief Set an explicit conflict-detection filter. + /// + /// \param expr The conflict-detection filter expression + /// \return Reference to this for method chaining + OverwriteFiles& ConflictDetectionFilter(std::shared_ptr expr); + + /// \brief Enable validation that no concurrent commit added conflicting data files. + /// + /// \return Reference to this for method chaining + OverwriteFiles& ValidateNoConflictingData(); + + /// \brief Enable validation that no concurrent commit added conflicting deletes. + /// + /// \return Reference to this for method chaining + OverwriteFiles& ValidateNoConflictingDeletes(); + + /// \brief Enable strict validation that every added file is fully within the + /// overwrite range. + /// + /// \return Reference to this for method chaining + OverwriteFiles& ValidateAddedFilesMatchOverwriteFilter(); + + /// \brief Set case sensitivity for binding, projection, and metrics evaluation. + /// + /// Forwards to the protected MergingSnapshotUpdate::CaseSensitive(bool); the public + /// name differs to avoid the public/protected name clash and keep a fluent return. + /// + /// \param case_sensitive Whether matching should be case-sensitive + /// \return Reference to this for method chaining + OverwriteFiles& WithCaseSensitivity(bool case_sensitive); + + // --- SnapshotUpdate / MergingSnapshotUpdate overrides --- + + std::string operation() override; + + Status Validate(const TableMetadata& current_metadata, + const std::shared_ptr& snapshot) override; + + private: + explicit OverwriteFiles(std::string table_name, + std::shared_ptr ctx); + + /// \brief Select the conflict-detection filter from the configured state. + std::shared_ptr DataConflictDetectionFilter() const; + + /// \brief Verify every added data file is fully contained in the row filter. + Status ValidateAddedFilesMatchOverwriteFilterImpl(const TableMetadata& metadata); + + std::optional starting_snapshot_id_; + std::shared_ptr conflict_detection_filter_; + DataFileSet deleted_data_files_; + bool validate_no_conflicting_data_ = false; + bool validate_no_conflicting_deletes_ = false; + bool validate_added_files_match_overwrite_filter_ = false; +}; + +} // namespace iceberg From 41c932ba9d314c63729c7175a5b079c71fcd8980 Mon Sep 17 00:00:00 2001 From: "shuxu.li" Date: Mon, 15 Jun 2026 11:29:25 +0800 Subject: [PATCH 2/5] fix(update): validate DeleteFiles content types and fix Meson build --- src/iceberg/test/meson.build | 4 --- src/iceberg/test/overwrite_files_test.cc | 42 ++++++++++++++++++++++++ src/iceberg/update/overwrite_files.cc | 12 +++++++ 3 files changed, 54 insertions(+), 4 deletions(-) diff --git a/src/iceberg/test/meson.build b/src/iceberg/test/meson.build index 053c73cb7..b01a61904 100644 --- a/src/iceberg/test/meson.build +++ b/src/iceberg/test/meson.build @@ -123,10 +123,6 @@ iceberg_tests = { ), 'use_data': true, }, - 'overwrite_files_test': { - 'sources': files('overwrite_files_test.cc'), - 'use_data': true, - }, } if get_option('rest').enabled() diff --git a/src/iceberg/test/overwrite_files_test.cc b/src/iceberg/test/overwrite_files_test.cc index d003c6ee5..7c74cc4cd 100644 --- a/src/iceberg/test/overwrite_files_test.cc +++ b/src/iceberg/test/overwrite_files_test.cc @@ -526,6 +526,48 @@ TEST_F(OverwriteFilesTest, BulkDeleteFilesEquivalentToRepeatedDeleteFile) { EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kTotalDataFiles), "0"); } +// Content validation: because both sets hold std::shared_ptr, DeleteFiles +// guards content so a data file cannot be passed as a delete file (or vice versa). A +// delete file (position/equality) in the data-file set is rejected; the error surfaces +// at Commit(). +TEST_F(OverwriteFilesTest, BulkDeleteFilesRejectsDeleteFileInDataSet) { + auto del_file = + MakeDeleteFile("/delete/del_a.parquet", 1L); // content = positionDeletes + DataFileSet data_files; + data_files.insert(del_file); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->DeleteFiles(data_files, DeleteFileSet{}); + auto result = op->Commit(); + EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(result, HasErrorMessage("has delete-file content")); +} + +// A data file (content kData) in the delete-file set is rejected. +TEST_F(OverwriteFilesTest, BulkDeleteFilesRejectsDataFileInDeleteSet) { + DeleteFileSet delete_files; + delete_files.insert(file_a_); // content = kData + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->DeleteFiles(DataFileSet{}, delete_files); + auto result = op->Commit(); + EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(result, HasErrorMessage("has data-file content")); +} + +// An equality delete file is a valid delete file (content != kData) and is accepted in +// the delete-file set. +TEST_F(OverwriteFilesTest, BulkDeleteFilesAcceptsEqualityDeleteInDeleteSet) { + auto eq_delete = MakeEqualityDeleteFile("/delete/eq_a.parquet", 1L); + DeleteFileSet delete_files; + delete_files.insert(eq_delete); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->DeleteFiles(DataFileSet{}, delete_files); + op->AddFile(file_b_); + EXPECT_THAT(op->Commit(), IsOk()); +} + // ===================================================================================== // 9.6 Concurrency-validation tests (Req 8.2, 8.3, 9.2-9.5; Properties 6, 7) // ===================================================================================== diff --git a/src/iceberg/update/overwrite_files.cc b/src/iceberg/update/overwrite_files.cc index 165d43005..8283c4529 100644 --- a/src/iceberg/update/overwrite_files.cc +++ b/src/iceberg/update/overwrite_files.cc @@ -76,14 +76,26 @@ OverwriteFiles& OverwriteFiles::DeleteFiles(const DataFileSet& data_files_to_del const DeleteFileSet& delete_files_to_delete) { // Bulk equivalent of repeated DeleteFile(...) plus explicit delete-file removal. Empty // sets are no-ops; the set types handle deduplication of repeated entries. + // + // Because both sets hold std::shared_ptr, content is validated here so a data + // file cannot be registered as a delete file (or vice versa): a data file must have + // content kData, and a delete file must be a position/equality delete (content != + // kData). This restores the type safety the C++ shared DataFile representation cannot + // enforce at compile time. for (const auto& file : data_files_to_delete) { ICEBERG_BUILDER_CHECK(file != nullptr, "Invalid data file: null"); + ICEBERG_BUILDER_CHECK(file->content == DataFile::Content::kData, + "Invalid data file to delete: {} has delete-file content", + file->file_path); // Dual-track: record for delete-conflict validation AND register for removal. deleted_data_files_.insert(file); ICEBERG_BUILDER_RETURN_IF_ERROR(DeleteDataFile(file)); } for (const auto& file : delete_files_to_delete) { ICEBERG_BUILDER_CHECK(file != nullptr, "Invalid delete file: null"); + ICEBERG_BUILDER_CHECK(file->content != DataFile::Content::kData, + "Invalid delete file to delete: {} has data-file content", + file->file_path); ICEBERG_BUILDER_RETURN_IF_ERROR(DeleteDeleteFile(file)); } return *this; From 9d3ba451d0ec3bbc182a810bb15860b52b5e6656 Mon Sep 17 00:00:00 2001 From: "shuxu.li" Date: Sat, 20 Jun 2026 08:08:59 +0800 Subject: [PATCH 3/5] fix(update): address overwrite files lint and comment style --- src/iceberg/test/overwrite_files_test.cc | 224 +++++------------------ src/iceberg/update/overwrite_files.cc | 114 +----------- src/iceberg/update/overwrite_files.h | 21 +-- 3 files changed, 55 insertions(+), 304 deletions(-) diff --git a/src/iceberg/test/overwrite_files_test.cc b/src/iceberg/test/overwrite_files_test.cc index 7c74cc4cd..9b4c6c0c4 100644 --- a/src/iceberg/test/overwrite_files_test.cc +++ b/src/iceberg/test/overwrite_files_test.cc @@ -50,21 +50,11 @@ namespace iceberg { -// ===================================================================================== // Test harness for OverwriteFiles. // -// Modeled on src/iceberg/test/merging_snapshot_update_test.cc (same fixture style, -// same in-memory mock FileIO / catalog setup, same DataFile / commit helpers). Unlike -// that file, OverwriteFiles is the production class with a private constructor, so the -// tests drive it exclusively through its public builder surface (AddFile / DeleteFile / -// OverwriteByRowFilter / ... / operation() / Validate() / Commit()) and observe its -// behavior through the public API: operation() classification, the committed snapshot -// summary, and the public Validate(...) entry point that the commit kernel invokes. -// // The base table (TableMetadataV2ValidMinimal.json) has schema {x: long (id 1), // y: long (id 2), z: long (id 3)} and a single partition spec (spec id 0) that // partitions by identity(x). -// ===================================================================================== class OverwriteFilesTest : public UpdateTestBase { protected: static void SetUpTestSuite() { avro::RegisterAll(); } @@ -239,18 +229,11 @@ class OverwriteFilesTest : public UpdateTestBase { std::shared_ptr file_b_; }; -// ===================================================================================== -// 9.2 Entry-point and builder-method tests -// ===================================================================================== - -// Req 1.1: Table::NewOverwrite() returns a valid builder for a standalone operation. TEST_F(OverwriteFilesTest, TableNewOverwriteReturnsBuilder) { ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); ASSERT_NE(op, nullptr); } -// Req 1.2, 1.3: Transaction::NewOverwrite() returns a valid builder registered with the -// transaction (commit deferred until the transaction commits). TEST_F(OverwriteFilesTest, TransactionNewOverwriteReturnsBuilder) { ICEBERG_UNWRAP_OR_FAIL(auto txn, Transaction::Make(table_, TransactionKind::kUpdate)); ICEBERG_UNWRAP_OR_FAIL(auto op, txn->NewOverwrite()); @@ -258,8 +241,6 @@ TEST_F(OverwriteFilesTest, TransactionNewOverwriteReturnsBuilder) { (*op).OverwriteByRowFilter(Expressions::Equal("x", Literal::Long(1L))).AddFile(file_a_); - // Within a transaction, the builder is staged by its own Commit() before the - // transaction is committed. EXPECT_THAT(op->Commit(), IsOk()); ICEBERG_UNWRAP_OR_FAIL(auto committed, txn->Commit()); EXPECT_THAT(table_->Refresh(), IsOk()); @@ -268,7 +249,6 @@ TEST_F(OverwriteFilesTest, TransactionNewOverwriteReturnsBuilder) { DataOperation::kOverwrite); } -// Builder methods all return *this and chain (Req 2.1-2.3, 3.1, 4.1-4.2, 6.1, 6.3, 6.4). TEST_F(OverwriteFilesTest, BuilderMethodsReturnSelfAndChain) { ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); auto* self = op.get(); @@ -284,7 +264,6 @@ TEST_F(OverwriteFilesTest, BuilderMethodsReturnSelfAndChain) { EXPECT_EQ(&op->ValidateAddedFilesMatchOverwriteFilter(), self); EXPECT_EQ(&op->WithCaseSensitivity(false), self); - // A single fluent chain compiles and returns the same instance. OverwriteFiles& chained = (*op) .AddFile(MakeDataFile("/data/chain.parquet", 1L)) @@ -293,10 +272,6 @@ TEST_F(OverwriteFilesTest, BuilderMethodsReturnSelfAndChain) { EXPECT_EQ(&chained, self); } -// ===================================================================================== -// 9.3 operation() truth-table tests (Req 5.1-5.4) -// ===================================================================================== - TEST_F(OverwriteFilesTest, OperationAddOnlyIsAppend) { ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); op->AddFile(file_a_); @@ -328,12 +303,6 @@ TEST_F(OverwriteFilesTest, OperationNeitherIsOverwrite) { EXPECT_EQ(op->operation(), DataOperation::kOverwrite); } -// ===================================================================================== -// 9.4 Commit-path and snapshot-control tests (Req 11.2-11.5) -// ===================================================================================== - -// DeleteFile + AddFile commits as overwrite and the recorded operation matches -// operation() (Req 11.4). TEST_F(OverwriteFilesTest, CommitDeleteAndAddIsOverwrite) { CommitFileA(); @@ -351,7 +320,6 @@ TEST_F(OverwriteFilesTest, CommitDeleteAndAddIsOverwrite) { EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kDeletedDataFiles), "1"); } -// OverwriteByRowFilter + AddFile commits as overwrite. TEST_F(OverwriteFilesTest, CommitRowFilterAndAddIsOverwrite) { CommitFileA(); @@ -366,7 +334,6 @@ TEST_F(OverwriteFilesTest, CommitRowFilterAndAddIsOverwrite) { DataOperation::kOverwrite); } -// An empty overwrite (no adds, no deletes) commits and records the overwrite operation. TEST_F(OverwriteFilesTest, CommitEmptyOverwrite) { CommitFileA(); @@ -380,7 +347,6 @@ TEST_F(OverwriteFilesTest, CommitEmptyOverwrite) { DataOperation::kOverwrite); } -// Duplicate AddFile / DeleteFile are deduplicated by the underlying set types (Req 3.7). TEST_F(OverwriteFilesTest, CommitDeduplicatesDuplicateAddAndDelete) { CommitFileA(); @@ -398,7 +364,6 @@ TEST_F(OverwriteFilesTest, CommitDeduplicatesDuplicateAddAndDelete) { EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kDeletedDataFiles), "1"); } -// StageOnly commits the snapshot without advancing the target branch (Req 11.2). TEST_F(OverwriteFilesTest, CommitStageOnlyDoesNotAdvanceCurrentSnapshot) { const int64_t base_snapshot_id = CommitFileA(); const size_t base_snapshot_count = table_->metadata()->snapshots.size(); @@ -416,7 +381,6 @@ TEST_F(OverwriteFilesTest, CommitStageOnlyDoesNotAdvanceCurrentSnapshot) { EXPECT_GT(table_->metadata()->snapshots.size(), base_snapshot_count); } -// SetTargetBranch commits to a named branch (Req 11.2). TEST_F(OverwriteFilesTest, CommitToTargetBranch) { CommitFileA(); @@ -429,8 +393,6 @@ TEST_F(OverwriteFilesTest, CommitToTargetBranch) { EXPECT_TRUE(table_->metadata()->refs.contains("audit")); } -// A custom Set(property, value) is carried into the committed snapshot summary -// (Req 11.2). TEST_F(OverwriteFilesTest, CommitCustomSummaryProperty) { CommitFileA(); @@ -444,17 +406,12 @@ TEST_F(OverwriteFilesTest, CommitCustomSummaryProperty) { EXPECT_EQ(snapshot->summary.at("custom-prop"), "custom-value"); } -// ===================================================================================== -// 9.5 Bulk DeleteFiles tests (Req 3.3-3.7) -// ===================================================================================== - // A DataFileSet + DeleteFileSet forwards data files to DeleteDataFile and delete files // to DeleteDeleteFile; the committed snapshot reflects the data-file removal. (The // delete file is forwarded to DeleteDeleteFile; with no matching committed delete file // present its removal is a harmless no-op, mirroring the inherited missing-delete // behavior.) TEST_F(OverwriteFilesTest, BulkDeleteFilesRemovesDataAndDeleteFiles) { - // Seed the table with a data file. { ICEBERG_UNWRAP_OR_FAIL(auto seed, NewOverwrite()); seed->AddFile(file_a_); @@ -464,7 +421,6 @@ TEST_F(OverwriteFilesTest, BulkDeleteFilesRemovesDataAndDeleteFiles) { auto del_file = MakeDeleteFile("/delete/del_a.parquet", 1L); - // Build the bulk delete: remove file_a (data) plus del_file (delete file). DataFileSet data_files; data_files.insert(file_a_); DeleteFileSet delete_files; @@ -473,7 +429,6 @@ TEST_F(OverwriteFilesTest, BulkDeleteFilesRemovesDataAndDeleteFiles) { ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); op->DeleteFiles(data_files, delete_files); op->AddFile(file_b_); - // Both a data file deletion and an add => overwrite. EXPECT_EQ(op->operation(), DataOperation::kOverwrite); EXPECT_THAT(op->Commit(), IsOk()); @@ -483,15 +438,12 @@ TEST_F(OverwriteFilesTest, BulkDeleteFilesRemovesDataAndDeleteFiles) { EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kAddedDataFiles), "1"); } -// Empty sets are a no-op: with no adds or deletes the builder is a bare overwrite. TEST_F(OverwriteFilesTest, BulkDeleteFilesEmptySetsAreNoOp) { ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); op->DeleteFiles(DataFileSet{}, DeleteFileSet{}); EXPECT_EQ(op->operation(), DataOperation::kOverwrite); // neither adds nor deletes } -// A DataFileSet portion of DeleteFiles records data files such that DeletesDataFiles -// becomes true (observed via operation() == delete), equivalent to repeated DeleteFile. TEST_F(OverwriteFilesTest, BulkDeleteFilesDataPortionMarksDelete) { ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); DataFileSet data_files; @@ -501,10 +453,7 @@ TEST_F(OverwriteFilesTest, BulkDeleteFilesDataPortionMarksDelete) { EXPECT_EQ(op->operation(), DataOperation::kDelete); } -// DeleteFiles is equivalent to repeated DeleteFile for the data-file portion: both -// classify as delete and (after committing against a seeded table) remove the files. TEST_F(OverwriteFilesTest, BulkDeleteFilesEquivalentToRepeatedDeleteFile) { - // Seed file_a and file_b. { ICEBERG_UNWRAP_OR_FAIL(auto seed, NewOverwrite()); seed->AddFile(file_a_); @@ -526,10 +475,7 @@ TEST_F(OverwriteFilesTest, BulkDeleteFilesEquivalentToRepeatedDeleteFile) { EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kTotalDataFiles), "0"); } -// Content validation: because both sets hold std::shared_ptr, DeleteFiles -// guards content so a data file cannot be passed as a delete file (or vice versa). A -// delete file (position/equality) in the data-file set is rejected; the error surfaces -// at Commit(). +// DeleteFiles validates content because both input sets store DataFile pointers. TEST_F(OverwriteFilesTest, BulkDeleteFilesRejectsDeleteFileInDataSet) { auto del_file = MakeDeleteFile("/delete/del_a.parquet", 1L); // content = positionDeletes @@ -543,7 +489,6 @@ TEST_F(OverwriteFilesTest, BulkDeleteFilesRejectsDeleteFileInDataSet) { EXPECT_THAT(result, HasErrorMessage("has delete-file content")); } -// A data file (content kData) in the delete-file set is rejected. TEST_F(OverwriteFilesTest, BulkDeleteFilesRejectsDataFileInDeleteSet) { DeleteFileSet delete_files; delete_files.insert(file_a_); // content = kData @@ -555,8 +500,6 @@ TEST_F(OverwriteFilesTest, BulkDeleteFilesRejectsDataFileInDeleteSet) { EXPECT_THAT(result, HasErrorMessage("has data-file content")); } -// An equality delete file is a valid delete file (content != kData) and is accepted in -// the delete-file set. TEST_F(OverwriteFilesTest, BulkDeleteFilesAcceptsEqualityDeleteInDeleteSet) { auto eq_delete = MakeEqualityDeleteFile("/delete/eq_a.parquet", 1L); DeleteFileSet delete_files; @@ -568,18 +511,12 @@ TEST_F(OverwriteFilesTest, BulkDeleteFilesAcceptsEqualityDeleteInDeleteSet) { EXPECT_THAT(op->Commit(), IsOk()); } -// ===================================================================================== -// 9.6 Concurrency-validation tests (Req 8.2, 8.3, 9.2-9.5; Properties 6, 7) -// ===================================================================================== - // ValidateNoConflictingData: a competing FastAppend that added a data file matching the -// resolved conflict-detection filter makes the overwrite commit fail (Req 8.3). +// resolved conflict-detection filter makes the overwrite commit fail. TEST_F(OverwriteFilesTest, ValidateNoConflictingDataDetectsConflictingAdd) { const int64_t first_id = CommitFileA(); - // Competing append of file_b (partition x=2) between read and commit. CommitFastAppend(file_b_); - // The overwrite targets the x=2 range, so the concurrent add of file_b conflicts. ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); op->OverwriteByRowFilter(Expressions::Equal("x", Literal::Long(2L))); op->AddFile(MakeDataFile("/data/replacement_x2.parquet", 2L)); @@ -588,14 +525,10 @@ TEST_F(OverwriteFilesTest, ValidateNoConflictingDataDetectsConflictingAdd) { EXPECT_THAT(op->Commit(), IsError(ErrorKind::kValidationFailed)); } -// A non-conflicting concurrent change still commits, and the recorded operation matches -// operation() (Req 11.4, 11.5; Property 6). TEST_F(OverwriteFilesTest, ValidateNoConflictingDataAllowsNonConflictingChange) { const int64_t first_id = CommitFileA(); - // Competing append of file_b in partition x=2. CommitFastAppend(file_b_); - // The overwrite targets the x=1 range; the concurrent x=2 add does not conflict. ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); op->OverwriteByRowFilter(Expressions::Equal("x", Literal::Long(1L))); op->AddFile(MakeDataFile("/data/replacement_x1.parquet", 1L)); @@ -611,11 +544,10 @@ TEST_F(OverwriteFilesTest, ValidateNoConflictingDataAllowsNonConflictingChange) } // ValidateNoConflictingDeletes: a competing snapshot that deleted a data file in the -// overwrite range makes the commit fail (Req 9.2-9.4). +// overwrite range makes the commit fail. TEST_F(OverwriteFilesTest, ValidateNoConflictingDeletesDetectsConflictingDelete) { const int64_t first_id = CommitFileA(); - // Competing overwrite removes file_a (partition x=1) between read and commit. { ICEBERG_UNWRAP_OR_FAIL(auto competing, NewOverwrite()); competing->DeleteFile(file_a_); @@ -645,40 +577,20 @@ TEST_F(OverwriteFilesTest, ValidateNoConflictingDeletesAllowsNonConflictingChang EXPECT_THAT(op->Commit(), IsOk()); } -// ===================================================================================== -// Fix #1: the explicit replaced-files delete branch (Path B) honors the -// ConflictDetectionFilter. -// -// Path B fires when explicit data files were registered for replacement -// (deleted_data_files_ non-empty) under ValidateNoConflictingDeletes(). It now calls the -// STATIC data_filter overload of ValidateNoNewDeletesForDataFiles, passing the raw -// conflict_detection_filter_ (which may be nullptr = "no filter, consider all delete -// files"). These tests inject a concurrent equality-delete file that -// covers the replaced data file (file_a, partition x=1) and assert that: -// * with NO conflict filter, the concurrent delete is a conflict => FAIL; -// * with a conflict filter that does NOT cover x=1 (here x=2), the delete is filtered -// out of the conflict scope => SUCCEED. -// -// To exercise Path B in isolation, the builder uses explicit DeleteFile (no -// OverwriteByRowFilter), so RowFilter() stays AlwaysFalse and Path A is skipped. -// ===================================================================================== - -// No conflict filter => the concurrent delete on the replaced file is detected (Path B -// passes nullptr through, so all delete files are considered). +// Explicit replaced-file validation applies ConflictDetectionFilter to concurrent delete +// files that cover replaced data files. TEST_F(OverwriteFilesTest, PathBExplicitDeletesDetectsConcurrentDeleteWithoutFilter) { CommitFileA(); ICEBERG_UNWRAP_OR_FAIL(auto first_snapshot, table_->current_snapshot()); - // Concurrent commit adds an equality delete covering file_a (partition x=1). auto concurrent = InjectConcurrentEqualityDelete( first_snapshot, "/delete/concurrent_x1.parquet", /*partition_x=*/1L); ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); - op->DeleteFile(file_a_); // explicit replacement => deleted_data_files_ non-empty + op->DeleteFile(file_a_); op->AddFile(MakeDataFile("/data/rewrite_x1.parquet", 1L)); op->ValidateFromSnapshot(first_snapshot->snapshot_id); op->ValidateNoConflictingDeletes(); - // No ConflictDetectionFilter => Path B considers all concurrent deletes => conflict. EXPECT_THAT(op->Validate(*concurrent.metadata, concurrent.snapshot), IsError(ErrorKind::kValidationFailed)); } @@ -689,7 +601,6 @@ TEST_F(OverwriteFilesTest, PathBExplicitDeletesConflictFilterNarrowsScope) { CommitFileA(); ICEBERG_UNWRAP_OR_FAIL(auto first_snapshot, table_->current_snapshot()); - // Same concurrent equality delete covering file_a (partition x=1). auto concurrent = InjectConcurrentEqualityDelete( first_snapshot, "/delete/concurrent_x1.parquet", /*partition_x=*/1L); @@ -697,21 +608,15 @@ TEST_F(OverwriteFilesTest, PathBExplicitDeletesConflictFilterNarrowsScope) { op->DeleteFile(file_a_); op->AddFile(MakeDataFile("/data/rewrite_x1.parquet", 1L)); op->ValidateFromSnapshot(first_snapshot->snapshot_id); - // Conflict filter targets x=2, which does NOT cover the x=1 delete => filtered out. op->ConflictDetectionFilter(Expressions::Equal("x", Literal::Long(2L))); op->ValidateNoConflictingDeletes(); EXPECT_THAT(op->Validate(*concurrent.metadata, concurrent.snapshot), IsOk()); } -// ===================================================================================== -// 9.7 Strict added-file range validation tests (Req 10.1-10.6; Properties 9, 10) -// // These exercise OverwriteFiles::Validate(...) directly (the same entry point the commit // kernel invokes), which is sufficient and deterministic: the strict-range branch does // not depend on concurrent snapshots. -// ===================================================================================== -// Strict partition projection proves containment directly (Req 10.3). TEST_F(OverwriteFilesTest, StrictRangeAcceptedByStrictProjection) { ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); op->OverwriteByRowFilter(Expressions::Equal("x", Literal::Long(1L))); @@ -721,28 +626,27 @@ TEST_F(OverwriteFilesTest, StrictRangeAcceptedByStrictProjection) { } // Strict partition projection is insufficient (filter on a non-partition column) but the -// StrictMetricsEvaluator proves containment from the file's bounds (Req 10.3). +// StrictMetricsEvaluator proves containment from the file's bounds. TEST_F(OverwriteFilesTest, StrictRangeAcceptedByStrictMetrics) { ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); op->OverwriteByRowFilter(Expressions::Equal("y", Literal::Long(5L))); - // y bounds [5, 5] => every row has y == 5, fully contained in the filter. + // y bounds [5, 5] prove every row has y == 5. op->AddFile(MakeDataFileWithYBounds("/data/y_eq_5.parquet", 1L, 5L, 5L)); op->ValidateAddedFilesMatchOverwriteFilter(); EXPECT_THAT(op->Validate(*table_->metadata(), /*snapshot=*/nullptr), IsOk()); } -// Neither the strict projection nor the metrics can prove containment => fail (Req 10.5). +// Neither the strict projection nor the metrics can prove containment. TEST_F(OverwriteFilesTest, StrictRangeRejectedWhenNotProvable) { ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); op->OverwriteByRowFilter(Expressions::Equal("y", Literal::Long(5L))); - // y bounds [1, 10] => not all rows are y == 5. + // y bounds [1, 10] do not prove every row has y == 5. op->AddFile(MakeDataFileWithYBounds("/data/y_range.parquet", 1L, 1L, 10L)); op->ValidateAddedFilesMatchOverwriteFilter(); EXPECT_THAT(op->Validate(*table_->metadata(), /*snapshot=*/nullptr), IsError(ErrorKind::kValidationFailed)); } -// A file whose partition falls outside the inclusive projection is rejected (Req 10.4). TEST_F(OverwriteFilesTest, StrictRangeRejectsFileOutsidePartitionRange) { ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); op->OverwriteByRowFilter(Expressions::Equal("x", Literal::Long(1L))); @@ -752,8 +656,6 @@ TEST_F(OverwriteFilesTest, StrictRangeRejectsFileOutsidePartitionRange) { IsError(ErrorKind::kValidationFailed)); } -// ValidateAddedFilesMatchOverwriteFilter without a row filter fails (Req 10.1, 10.2; -// Property 10). TEST_F(OverwriteFilesTest, StrictRangeRequiresRowFilter) { ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); op->AddFile(MakeDataFile("/data/no_filter.parquet", 1L)); @@ -762,11 +664,9 @@ TEST_F(OverwriteFilesTest, StrictRangeRequiresRowFilter) { IsError(ErrorKind::kValidationFailed)); } -// Fix #2: added files belonging to MORE THAN ONE partition spec are rejected, since the -// validation resolves a single spec via DataSpec() (which requires exactly one spec among -// added files). DataSpec() fails fast with a multi-spec error. +// ValidateAddedFilesMatchOverwriteFilter resolves one data spec for all added files. TEST_F(OverwriteFilesTest, StrictRangeRejectsMultiplePartitionSpecs) { - // Add a second partition spec (id 1) to the table metadata BEFORE creating the builder, + // Add a second partition spec before creating the builder, // so the producer's base metadata can resolve both specs when files are staged. ICEBERG_UNWRAP_OR_FAIL( auto spec1, PartitionSpec::Make(*schema_, /*spec_id=*/1, @@ -775,7 +675,6 @@ TEST_F(OverwriteFilesTest, StrictRangeRejectsMultiplePartitionSpecs) { /*allow_missing_fields=*/false)); table_->metadata()->partition_specs.push_back( std::shared_ptr(std::move(spec1))); - // Confirm both specs resolve. ASSERT_THAT(table_->metadata()->PartitionSpecById(0), IsOk()); ASSERT_THAT(table_->metadata()->PartitionSpecById(1), IsOk()); @@ -788,43 +687,30 @@ TEST_F(OverwriteFilesTest, StrictRangeRejectsMultiplePartitionSpecs) { op->AddFile(file_spec0); op->AddFile(file_spec1); op->ValidateAddedFilesMatchOverwriteFilter(); - // DataSpec() rejects the two distinct specs with an InvalidArgument error. EXPECT_THAT(op->Validate(*table_->metadata(), /*snapshot=*/nullptr), IsError(ErrorKind::kInvalidArgument)); } -// Fix #3: enabling the strict added-file range validation with a row filter set but NO -// added data files (e.g. a pure overwrite-by-filter with no AddFile) fails, because -// DataSpec() rejects an empty added-files set. TEST_F(OverwriteFilesTest, StrictRangeRejectsEmptyAddedFiles) { ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); - // Row filter is set (precondition satisfied) but no AddFile was called. op->OverwriteByRowFilter(Expressions::Equal("x", Literal::Long(1L))); op->ValidateAddedFilesMatchOverwriteFilter(); - // DataSpec() raises InvalidArgument because no data file was added. EXPECT_THAT(op->Validate(*table_->metadata(), /*snapshot=*/nullptr), IsError(ErrorKind::kInvalidArgument)); } -// ===================================================================================== -// 9.8 Case-sensitivity and null-rejection tests (Req 6.4, 12.1-12.4; Property 4) -// ===================================================================================== - // Case-insensitive binding accepts a differently-cased column name where the // case-sensitive (default) binding rejects it. Observed through the strict-range // validation, which binds the row filter using the configured case sensitivity. TEST_F(OverwriteFilesTest, CaseSensitivityAffectsFilterBinding) { - // Case-sensitive (default): the filter references "X" which does not match column "x". { ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); op->OverwriteByRowFilter(Expressions::Equal("X", Literal::Long(1L))); op->AddFile(MakeDataFile("/data/cs.parquet", 1L)); op->ValidateAddedFilesMatchOverwriteFilter(); - // Binding "X" against schema {x, y, z} fails case-sensitively. auto result = op->Validate(*table_->metadata(), /*snapshot=*/nullptr); EXPECT_FALSE(result.has_value()); } - // Case-insensitive: "X" binds to column "x" and the in-range file validates. { ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); op->WithCaseSensitivity(false); @@ -836,9 +722,7 @@ TEST_F(OverwriteFilesTest, CaseSensitivityAffectsFilterBinding) { } // Null arguments to the pointer-taking builder methods surface at Commit() without -// crashing (Req 12.1, 12.2; Property 4). The ErrorCollector aggregates deferred builder -// errors and surfaces them as a kValidationFailed error at Commit() that preserves the -// underlying invalid-argument message. +// crashing. TEST_F(OverwriteFilesTest, NullAddFileRejectedAtCommit) { ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); op->AddFile(nullptr); @@ -871,21 +755,17 @@ TEST_F(OverwriteFilesTest, NullConflictDetectionFilterRejectedAtCommit) { EXPECT_THAT(result, HasErrorMessage("Invalid conflict detection filter: null")); } -// The builder chain continues without crashing after a null argument is recorded. TEST_F(OverwriteFilesTest, NullArgumentDoesNotCrashBuilderChain) { ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); (*op).AddFile(nullptr).AddFile(file_a_).OverwriteByRowFilter( Expressions::Equal("x", Literal::Long(1L))); - // The recorded error still surfaces at commit. auto result = op->Commit(); EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed)); EXPECT_THAT(result, HasErrorMessage("Invalid data file: null")); } // A null element cannot enter a DataFileSet / DeleteFileSet (insert() rejects nullptr), -// so DeleteFiles({null...}, {null...}) is a no-op rather than an error. The deferred -// null-element rejection inside DeleteFiles is therefore a defensive guard that is -// unreachable through the public set API; this test documents that observable behavior. +// so DeleteFiles({null...}, {null...}) is a no-op rather than an error. TEST_F(OverwriteFilesTest, DeleteFilesNullElementsCannotEnterSets) { DataFileSet data_files; data_files.insert(std::shared_ptr{nullptr}); @@ -896,14 +776,13 @@ TEST_F(OverwriteFilesTest, DeleteFilesNullElementsCannotEnterSets) { ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); op->DeleteFiles(data_files, delete_files); - // No data files were deleted => still a bare overwrite, and commit succeeds. EXPECT_EQ(op->operation(), DataOperation::kOverwrite); EXPECT_THAT(op->Commit(), IsOk()); } // ValidateFromSnapshot accepts a non-negative id (0 is in the generated id range // [0, INT64_MAX]) and rejects a negative id (including kInvalidSnapshotId == -1) as a -// deferred error surfaced at Commit(). Req 6.1. +// deferred error surfaced at Commit(). TEST_F(OverwriteFilesTest, ValidateFromSnapshotRejectsNegativeSnapshotId) { ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); op->AddFile(file_a_).ValidateFromSnapshot(-1); @@ -913,25 +792,16 @@ TEST_F(OverwriteFilesTest, ValidateFromSnapshotRejectsNegativeSnapshotId) { } TEST_F(OverwriteFilesTest, ValidateFromSnapshotAcceptsZeroSnapshotId) { - // 0 is a legal generated snapshot id (the generator masks with int64_t::max()), so it - // must not be rejected at the builder stage. With no concurrency validation enabled, - // the starting id is merely recorded and the commit succeeds. ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); op->AddFile(file_a_).ValidateFromSnapshot(0); EXPECT_THAT(op->Commit(), IsOk()); } -// ===================================================================================== -// Property-style tests (parameterized via loops; no PBT library is available). -// ===================================================================================== - -// Property 2 (builder forwarding) + Property 3 (delete dual-tracking), Task 2.4. -// // AddedDataFiles() / deleted_data_files_ are not publicly observable, so the properties // are checked indirectly through operation(): for any non-null file, AddFile-only yields // `append` (the file was added and the row filter is untouched), while DeleteFile-only // and DeleteFiles-only (data portion) yield `delete` (the file was registered for -// deletion and DeletesDataFiles() became true). Validates Req 2.1, 2.2, 3.1-3.5. +// deletion and DeletesDataFiles() became true). TEST_F(OverwriteFilesTest, PropertyBuilderForwardingAndDualTracking) { const std::vector> files = { MakeDataFile("/data/p0.parquet", 1L), @@ -942,19 +812,16 @@ TEST_F(OverwriteFilesTest, PropertyBuilderForwardingAndDualTracking) { for (const auto& file : files) { { - // AddFile preserves "no deletes" => append, proving the row filter is unchanged. ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); op->AddFile(file); EXPECT_EQ(op->operation(), DataOperation::kAppend) << file->file_path; } { - // DeleteFile dual-tracks => DeletesDataFiles() true, no adds => delete. ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); op->DeleteFile(file); EXPECT_EQ(op->operation(), DataOperation::kDelete) << file->file_path; } { - // Bulk DeleteFiles data portion behaves like repeated DeleteFile. ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); DataFileSet data_files; data_files.insert(file); @@ -964,9 +831,6 @@ TEST_F(OverwriteFilesTest, PropertyBuilderForwardingAndDualTracking) { } } -// Property 4 (null rejection), Task 2.5: every pointer-taking builder mutator records a -// deferred error that surfaces as an InvalidArgument-class error at Commit() without -// crashing. Validates Req 12.1, 12.2, 12.3, 12.4. TEST_F(OverwriteFilesTest, PropertyNullArgumentRejection) { using Mutator = std::function; const std::vector mutators = { @@ -979,13 +843,10 @@ TEST_F(OverwriteFilesTest, PropertyNullArgumentRejection) { for (const auto& mutate : mutators) { ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); mutate(*op); - // Deferred builder errors are aggregated and surfaced as kValidationFailed at commit. EXPECT_THAT(op->Commit(), IsError(ErrorKind::kValidationFailed)); } } -// Property 1 (operation() reflects content), Task 3.2: exhaustive truth table over -// {adds, deletes (explicit or via row filter)}. Validates Req 5.1-5.4. TEST_F(OverwriteFilesTest, PropertyOperationTruthTable) { struct Case { bool add; @@ -994,13 +855,34 @@ TEST_F(OverwriteFilesTest, PropertyOperationTruthTable) { std::string expected; }; const std::vector cases = { - {/*add=*/true, /*delete_file=*/false, /*row_filter=*/false, DataOperation::kAppend}, - {false, true, false, DataOperation::kDelete}, - {false, false, true, DataOperation::kDelete}, // row filter counts as a delete - {true, true, false, DataOperation::kOverwrite}, - {true, false, true, DataOperation::kOverwrite}, - {false, true, true, DataOperation::kDelete}, // deletes only (no adds) => delete - {false, false, false, DataOperation::kOverwrite}, // neither + {.add = true, + .delete_file = false, + .row_filter = false, + .expected = DataOperation::kAppend}, + {.add = false, + .delete_file = true, + .row_filter = false, + .expected = DataOperation::kDelete}, + {.add = false, + .delete_file = false, + .row_filter = true, + .expected = DataOperation::kDelete}, // row filter counts as a delete + {.add = true, + .delete_file = true, + .row_filter = false, + .expected = DataOperation::kOverwrite}, + {.add = true, + .delete_file = false, + .row_filter = true, + .expected = DataOperation::kOverwrite}, + {.add = false, + .delete_file = true, + .row_filter = true, + .expected = DataOperation::kDelete}, // deletes only + {.add = false, + .delete_file = false, + .row_filter = false, + .expected = DataOperation::kOverwrite}, // neither }; int index = 0; @@ -1021,18 +903,11 @@ TEST_F(OverwriteFilesTest, PropertyOperationTruthTable) { } } -// Property 5 (conflict filter resolution), Task 4.4. -// // DataConflictDetectionFilter() is private, so its three resolution outcomes are observed // indirectly through ValidateNoConflictingData against a competing concurrent add of // file_b (partition x=2): -// * explicit filter set -> the explicit filter is used (overrides the row filter); -// * row filter only -> the row filter is used; -// * file-replacement present -> AlwaysTrue (any concurrent add conflicts). -// Validates Req 7.1, 7.2, 7.3. +// explicit filter, row filter only, and explicit file replacement. TEST_F(OverwriteFilesTest, PropertyConflictFilterResolution) { - // Resolution case 2 (row filter only): row filter x=1 does NOT match the concurrent - // x=2 add -> no conflict. { const int64_t first_id = CommitFileA(); CommitFastAppend(file_b_); @@ -1044,8 +919,6 @@ TEST_F(OverwriteFilesTest, PropertyConflictFilterResolution) { EXPECT_THAT(op->Validate(*table_->metadata(), table_->current_snapshot().value()), IsOk()); } - // Resolution case 2 (row filter only): row filter x=2 DOES match -> conflict, proving - // the row filter (not AlwaysFalse / AlwaysTrue blindly) is what was used. { SetUp(); // fresh table const int64_t first_id = CommitFileA(); @@ -1058,8 +931,6 @@ TEST_F(OverwriteFilesTest, PropertyConflictFilterResolution) { EXPECT_THAT(op->Validate(*table_->metadata(), table_->current_snapshot().value()), IsError(ErrorKind::kValidationFailed)); } - // Resolution case 1 (explicit filter overrides row filter): row filter x=1 would NOT - // conflict, but the explicit conflict filter x=2 does -> conflict. { SetUp(); const int64_t first_id = CommitFileA(); @@ -1073,16 +944,13 @@ TEST_F(OverwriteFilesTest, PropertyConflictFilterResolution) { EXPECT_THAT(op->Validate(*table_->metadata(), table_->current_snapshot().value()), IsError(ErrorKind::kValidationFailed)); } - // Resolution case 3 (file replacement -> AlwaysTrue): with an explicit DeleteFile and - // no explicit conflict filter, ANY concurrent add conflicts (here file_b in x=2, even - // though the row filter is x=1). { SetUp(); const int64_t first_id = CommitFileA(); CommitFastAppend(file_b_); ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); op->OverwriteByRowFilter(Expressions::Equal("x", Literal::Long(1L))); - op->DeleteFile(file_a_); // makes deleted_data_files_ non-empty => AlwaysTrue + op->DeleteFile(file_a_); op->AddFile(MakeDataFile("/data/r3.parquet", 1L)); op->ValidateFromSnapshot(first_id); op->ValidateNoConflictingData(); diff --git a/src/iceberg/update/overwrite_files.cc b/src/iceberg/update/overwrite_files.cc index 8283c4529..4e5dbfeef 100644 --- a/src/iceberg/update/overwrite_files.cc +++ b/src/iceberg/update/overwrite_files.cc @@ -56,17 +56,12 @@ OverwriteFiles::~OverwriteFiles() = default; OverwriteFiles& OverwriteFiles::AddFile(const std::shared_ptr& file) { ICEBERG_BUILDER_CHECK(file != nullptr, "Invalid data file: null"); - // Forward to the inherited staging path. Any error (e.g. missing partition spec - // ID) is captured by the ErrorCollector and surfaced at Commit() rather than - // dropped. RowFilter() and deleted_data_files_ are intentionally left unchanged. ICEBERG_BUILDER_RETURN_IF_ERROR(AddDataFile(file)); return *this; } OverwriteFiles& OverwriteFiles::DeleteFile(const std::shared_ptr& file) { ICEBERG_BUILDER_CHECK(file != nullptr, "Invalid data file: null"); - // Dual-track: record the file for delete-conflict validation AND register it for - // removal via the inherited pipeline. deleted_data_files_.insert(file); ICEBERG_BUILDER_RETURN_IF_ERROR(DeleteDataFile(file)); return *this; @@ -74,20 +69,13 @@ OverwriteFiles& OverwriteFiles::DeleteFile(const std::shared_ptr& file OverwriteFiles& OverwriteFiles::DeleteFiles(const DataFileSet& data_files_to_delete, const DeleteFileSet& delete_files_to_delete) { - // Bulk equivalent of repeated DeleteFile(...) plus explicit delete-file removal. Empty - // sets are no-ops; the set types handle deduplication of repeated entries. - // - // Because both sets hold std::shared_ptr, content is validated here so a data - // file cannot be registered as a delete file (or vice versa): a data file must have - // content kData, and a delete file must be a position/equality delete (content != - // kData). This restores the type safety the C++ shared DataFile representation cannot - // enforce at compile time. + // Both sets use DataFile pointers, so validate content before forwarding to the + // data-file and delete-file removal paths. for (const auto& file : data_files_to_delete) { ICEBERG_BUILDER_CHECK(file != nullptr, "Invalid data file: null"); ICEBERG_BUILDER_CHECK(file->content == DataFile::Content::kData, "Invalid data file to delete: {} has delete-file content", file->file_path); - // Dual-track: record for delete-conflict validation AND register for removal. deleted_data_files_.insert(file); ICEBERG_BUILDER_RETURN_IF_ERROR(DeleteDataFile(file)); } @@ -103,20 +91,11 @@ OverwriteFiles& OverwriteFiles::DeleteFiles(const DataFileSet& data_files_to_del OverwriteFiles& OverwriteFiles::OverwriteByRowFilter(std::shared_ptr expr) { ICEBERG_BUILDER_CHECK(expr != nullptr, "Invalid row filter expression: null"); - // Forward to the inherited filter path: this sets RowFilter() and forwards the - // expression to both the data and delete filter managers. Any error is captured by the - // ErrorCollector and surfaced at Commit(). ICEBERG_BUILDER_RETURN_IF_ERROR(DeleteByRowFilter(std::move(expr))); return *this; } OverwriteFiles& OverwriteFiles::ValidateFromSnapshot(int64_t snapshot_id) { - // Snapshot ids are non-negative: SnapshotUtil::GenerateSnapshotId() masks with - // int64_t::max() and so yields a value in [0, INT64_MAX], while kInvalidSnapshotId is - // -1. A negative id therefore can never identify a real snapshot and is rejected via - // the deferred ErrorCollector (surfaced at Commit()); 0 is accepted because it is in - // the generator's range. A negative id is always a misuse, so we keep an early, - // clearer guard against the obviously-invalid sentinel/negative values. ICEBERG_BUILDER_CHECK(snapshot_id >= 0, "Invalid snapshot id: {}", snapshot_id); starting_snapshot_id_ = snapshot_id; return *this; @@ -130,41 +109,27 @@ OverwriteFiles& OverwriteFiles::ConflictDetectionFilter( } OverwriteFiles& OverwriteFiles::WithCaseSensitivity(bool case_sensitive) { - // Forward to the protected MergingSnapshotUpdate::CaseSensitive(bool) setter (which - // returns void); the public name differs to avoid the public/protected name clash and - // keep a fluent return type. CaseSensitive(case_sensitive); return *this; } OverwriteFiles& OverwriteFiles::ValidateNoConflictingData() { - // Enable concurrent-data validation and require every explicitly-deleted old file to - // be hit during manifest filtering. validate_no_conflicting_data_ = true; FailMissingDeletePaths(); return *this; } OverwriteFiles& OverwriteFiles::ValidateNoConflictingDeletes() { - // Enable concurrent-delete validation and require every explicitly-deleted old file to - // be hit during manifest filtering. validate_no_conflicting_deletes_ = true; FailMissingDeletePaths(); return *this; } OverwriteFiles& OverwriteFiles::ValidateAddedFilesMatchOverwriteFilter() { - // Enable strict validation that every added data file is fully contained in the - // overwrite range. The precondition (a row filter must be set) is enforced in - // Validate(...). validate_added_files_match_overwrite_filter_ = true; return *this; } -// Classify the snapshot operation from the current builder content. DeletesDataFiles() -// is true for both an explicit DeleteFile(...) and a pure OverwriteByRowFilter(...) (the -// latter via ManifestFilterManager::ContainsDeletes()), so OverwriteByRowFilter + AddFile -// correctly classifies as overwrite without any RowFilter()-based fallback. std::string OverwriteFiles::operation() { if (DeletesDataFiles() && !AddsDataFiles()) { return DataOperation::kDelete; @@ -175,16 +140,8 @@ std::string OverwriteFiles::operation() { return DataOperation::kOverwrite; } -// Select the conflict-detection filter. Precedence: (1) the explicitly-set -// conflict_detection_filter_; otherwise -// (2) the row filter when it is set and not AlwaysFalse and no explicit data files were -// registered for deletion (the pure overwriteByRowFilter case, where the row filter -// already describes the conflicting range); otherwise (3) the conservative AlwaysTrue, -// under which any newly-added data file counts as a conflict (file-replacement or mixed -// mode). RowFilter() defaults to the AlwaysFalse singleton when unset, so comparing -// against the Expressions::AlwaysFalse() singleton by pointer identity (consistent with -// how the filter managers track delete_expr_) correctly treats the unset case as "no -// row filter". +// Pure row-filter overwrites use the row filter as the data conflict filter. Explicit +// file replacement is more conservative unless the caller supplies a narrower filter. std::shared_ptr OverwriteFiles::DataConflictDetectionFilter() const { if (conflict_detection_filter_ != nullptr) { return conflict_detection_filter_; @@ -196,16 +153,8 @@ std::shared_ptr OverwriteFiles::DataConflictDetectionFilter() const return Expressions::AlwaysTrue(); } -// Run the enabled overwrite-specific concurrency checks before commit. The branches are -// sequential and independent. The first failing check aborts the commit; on success we -// return OK and let the inherited Apply(...) build the snapshot. Status OverwriteFiles::Validate(const TableMetadata& current_metadata, const std::shared_ptr& snapshot) { - // 0. Strict added-file range precondition: when - // ValidateAddedFilesMatchOverwriteFilter() - // is enabled, a row filter is required to define the overwrite range. RowFilter() - // defaults to the AlwaysFalse singleton when unset, so an unset or AlwaysFalse - // filter means there is no range to validate against and the commit must fail. if (validate_added_files_match_overwrite_filter_) { if (RowFilter() == nullptr || RowFilter() == Expressions::AlwaysFalse()) { return ValidationFailed( @@ -215,20 +164,13 @@ Status OverwriteFiles::Validate(const TableMetadata& current_metadata, ValidateAddedFilesMatchOverwriteFilterImpl(current_metadata)); } - // 1. Concurrent newly-added data files: fail if any snapshot after the starting point - // added a data file matching the resolved conflict-detection filter. if (validate_no_conflicting_data_) { ICEBERG_RETURN_UNEXPECTED(ValidateAddedDataFiles( current_metadata, starting_snapshot_id_, DataConflictDetectionFilter(), snapshot, ctx_->table->io(), IsCaseSensitive())); } - // 2. Concurrent deletes: the two sub-checks are independent and both may fire. if (validate_no_conflicting_deletes_) { - // Path A: a row filter is in play (set and not the AlwaysFalse singleton). Fail if a - // new delete file matches the range, or if a data file in the range was concurrently - // removed. Prefer the explicit conflict_detection_filter_ when set, else the row - // filter that describes the overwrite range. if (RowFilter() != nullptr && RowFilter() != Expressions::AlwaysFalse()) { auto delete_filter = conflict_detection_filter_ != nullptr ? conflict_detection_filter_ @@ -241,12 +183,6 @@ Status OverwriteFiles::Validate(const TableMetadata& current_metadata, snapshot, ctx_->table->io(), IsCaseSensitive())); } - // Path B: explicit old data files were registered for replacement. Fail if a new - // delete file matching the conflict-detection filter covers any of them. Use the - // STATIC data_filter overload, passing the raw conflict_detection_filter_ (which may - // be nullptr — a nullptr filter means no filter, so all delete files are considered). - // The 3rd argument is an Expression, so this overload is unambiguous and needs no - // member-function-pointer cast. if (!deleted_data_files_.empty()) { ICEBERG_RETURN_UNEXPECTED(ValidateNoNewDeletesForDataFiles( current_metadata, starting_snapshot_id_, conflict_detection_filter_, @@ -258,54 +194,26 @@ Status OverwriteFiles::Validate(const TableMetadata& current_metadata, } // Every added data file must be fully contained in the overwrite range defined by -// RowFilter(). The validation resolves a single partition spec via DataSpec() and then, -// for each added file, an inclusive partition projection provides a fast negative (reject -// when the file's partition cannot possibly fall in range), a strict partition projection -// provides a fast positive (accept when the whole partition is guaranteed in range), and -// a file-level StrictMetricsEvaluator provides the fallback proof. Metrics that are -// missing or insufficient yield a non-matching result and therefore fail validation -// (conservative: never silently accept an unprovable file). -// -// DataSpec() returns InvalidArgument when zero data files were added (empty-added-files -// is rejected) and when more than one partition spec is represented among the added files -// (multi-spec is rejected). DataSpec() reads the spec from the producer's base metadata; -// the projections use the table schema from metadata.Schema(). -// -// Implementation notes: -// * `Evaluator::Make` takes a `Schema`, and the partition type is obtained via -// `PartitionSpec::PartitionType(schema)` (which needs the table schema and returns a -// `StructType`). We therefore wrap the partition `StructType`'s fields in a `Schema` -// and bind the projected expression against it. -// * `StrictMetricsEvaluator::Make` binds its expression internally, and `Binder::Bind` -// rejects an already-bound predicate. So the bound filter is used only for the -// projections (`ProjectionEvaluator::Project` requires a bound expression), while the -// UNBOUND `RowFilter()` is handed to `StrictMetricsEvaluator::Make`. +// RowFilter(). Partition projection handles whole-partition proofs; strict metrics are +// used only when the partition alone is not enough. Status OverwriteFiles::ValidateAddedFilesMatchOverwriteFilterImpl( const TableMetadata& metadata) { ICEBERG_ASSIGN_OR_RAISE(auto schema, metadata.Schema()); - // Single spec for all added files. This rejects both the empty-added-files case and - // the multi-spec case via InvalidArgument. ICEBERG_ASSIGN_OR_RAISE(auto spec, DataSpec()); - // Bind the row filter once for projection (Project requires a bound expression); build - // the strict metrics evaluator once over the UNBOUND row filter (it binds internally). + // Project requires a bound expression; StrictMetricsEvaluator binds internally. ICEBERG_ASSIGN_OR_RAISE(auto bound_filter, Binder::Bind(*schema, RowFilter(), IsCaseSensitive())); ICEBERG_ASSIGN_OR_RAISE( auto strict_metrics_evaluator, StrictMetricsEvaluator::Make(RowFilter(), schema, IsCaseSensitive())); - // Build ONE inclusive and ONE strict partition-value evaluator from the single spec. - // The Evaluators bind on construction and do not retain the partition schema, so the - // local schema below may safely go out of scope. ICEBERG_ASSIGN_OR_RAISE(auto partition_type, spec->PartitionType(*schema)); auto partition_fields = partition_type->fields(); Schema partition_schema( std::vector(partition_fields.begin(), partition_fields.end())); - // Inclusive projection (fast negative): project the bound filter into the partition - // space and build an evaluator over the spec's partition values. auto inclusive_projection = Projections::Inclusive(*spec, *schema, IsCaseSensitive()); ICEBERG_ASSIGN_OR_RAISE(auto inclusive_expr, inclusive_projection->Project(bound_filter)); @@ -313,7 +221,6 @@ Status OverwriteFiles::ValidateAddedFilesMatchOverwriteFilterImpl( auto inclusive_evaluator, Evaluator::Make(partition_schema, inclusive_expr, IsCaseSensitive())); - // Strict projection (fast positive). auto strict_projection = Projections::Strict(*spec, *schema, IsCaseSensitive()); ICEBERG_ASSIGN_OR_RAISE(auto strict_expr, strict_projection->Project(bound_filter)); ICEBERG_ASSIGN_OR_RAISE( @@ -326,8 +233,6 @@ Status OverwriteFiles::ValidateAddedFilesMatchOverwriteFilterImpl( "Cannot validate added files match overwrite filter: null data file"); } - // Step 1: inclusive projection — fast negative. If the file's partition cannot match - // the (inclusively projected) filter, the file is definitively outside the range. ICEBERG_ASSIGN_OR_RAISE(bool inclusive_match, inclusive_evaluator->Evaluate(file->partition)); if (!inclusive_match) { @@ -337,17 +242,12 @@ Status OverwriteFiles::ValidateAddedFilesMatchOverwriteFilterImpl( file->file_path); } - // Step 2: strict projection — fast positive. If the whole partition is guaranteed in - // range, the file is accepted without inspecting metrics. ICEBERG_ASSIGN_OR_RAISE(bool strict_match, strict_evaluator->Evaluate(file->partition)); if (strict_match) { continue; } - // Step 3: file-level proof via strict metrics. A false result — including the case - // where metrics are missing or insufficient — fails validation conservatively rather - // than silently accepting a file that cannot be proved fully in range. ICEBERG_ASSIGN_OR_RAISE(bool metrics_match, strict_metrics_evaluator->Evaluate(*file)); if (!metrics_match) { diff --git a/src/iceberg/update/overwrite_files.h b/src/iceberg/update/overwrite_files.h index 59f25914d..1c1979108 100644 --- a/src/iceberg/update/overwrite_files.h +++ b/src/iceberg/update/overwrite_files.h @@ -36,18 +36,11 @@ namespace iceberg { /// \brief Overwrite data files in a table. /// -/// OverwriteFiles expresses a logical overwrite transaction: either explicit file -/// replacement (`DeleteFile(old)` + `AddFile(new)`) or range-based replacement -/// (`OverwriteByRowFilter(expr)` + `AddFile(new...)`). It is a concrete subclass of -/// MergingSnapshotUpdate that adds only the overwrite-specific builder surface and the -/// overwrite-specific `Validate(...)` logic, reusing the inherited filter → write → -/// merge pipeline, summary building, commit retry, and cleanup unchanged. +/// Supports explicit file replacement and range-based replacement by row filter. class ICEBERG_EXPORT OverwriteFiles : public MergingSnapshotUpdate { public: /// \brief Create a new OverwriteFiles instance. /// - /// Mirrors FastAppend::Make: validates inputs and returns a heap instance. - /// /// \param table_name The name of the table /// \param ctx The transaction context to use for this update /// \return A Result containing the OverwriteFiles instance or an error @@ -56,8 +49,6 @@ class ICEBERG_EXPORT OverwriteFiles : public MergingSnapshotUpdate { ~OverwriteFiles() override; - // --- Overwrite builder surface (chained, ErrorCollector-deferred) --- - /// \brief Add a new data file to the overwrite. /// /// \param file The data file to add @@ -70,8 +61,7 @@ class ICEBERG_EXPORT OverwriteFiles : public MergingSnapshotUpdate { /// \return Reference to this for method chaining OverwriteFiles& DeleteFile(const std::shared_ptr& file); - /// \brief Bulk equivalent of repeated DeleteFile(...) plus explicit delete-file - /// removal. + /// \brief Remove data files and delete files as part of the overwrite. /// /// \param data_files_to_delete The data files to delete /// \param delete_files_to_delete The delete files to remove @@ -85,8 +75,6 @@ class ICEBERG_EXPORT OverwriteFiles : public MergingSnapshotUpdate { /// \return Reference to this for method chaining OverwriteFiles& OverwriteByRowFilter(std::shared_ptr expr); - // --- Concurrency / correctness controls --- - /// \brief Set the lower bound snapshot id for concurrency scans. /// /// \param snapshot_id The starting snapshot id @@ -117,15 +105,10 @@ class ICEBERG_EXPORT OverwriteFiles : public MergingSnapshotUpdate { /// \brief Set case sensitivity for binding, projection, and metrics evaluation. /// - /// Forwards to the protected MergingSnapshotUpdate::CaseSensitive(bool); the public - /// name differs to avoid the public/protected name clash and keep a fluent return. - /// /// \param case_sensitive Whether matching should be case-sensitive /// \return Reference to this for method chaining OverwriteFiles& WithCaseSensitivity(bool case_sensitive); - // --- SnapshotUpdate / MergingSnapshotUpdate overrides --- - std::string operation() override; Status Validate(const TableMetadata& current_metadata, From b93b4da149bf88cb21c3928f7f4e1502e430fe3d Mon Sep 17 00:00:00 2001 From: "shuxu.li" Date: Sat, 20 Jun 2026 08:37:15 +0800 Subject: [PATCH 4/5] fix(update): Fix OverwriteFiles data/delete content validation for direct AddFile/DeleteFile paths and trim overly verbose test comments. --- src/iceberg/test/overwrite_files_test.cc | 100 ++++++++--------------- src/iceberg/update/overwrite_files.cc | 6 ++ 2 files changed, 42 insertions(+), 64 deletions(-) diff --git a/src/iceberg/test/overwrite_files_test.cc b/src/iceberg/test/overwrite_files_test.cc index 9b4c6c0c4..bbdc06d13 100644 --- a/src/iceberg/test/overwrite_files_test.cc +++ b/src/iceberg/test/overwrite_files_test.cc @@ -50,11 +50,8 @@ namespace iceberg { -// Test harness for OverwriteFiles. -// // The base table (TableMetadataV2ValidMinimal.json) has schema {x: long (id 1), -// y: long (id 2), z: long (id 3)} and a single partition spec (spec id 0) that -// partitions by identity(x). +// y: long (id 2), z: long (id 3)} and partitions by identity(x). class OverwriteFilesTest : public UpdateTestBase { protected: static void SetUpTestSuite() { avro::RegisterAll(); } @@ -73,7 +70,6 @@ class OverwriteFilesTest : public UpdateTestBase { file_b_ = MakeDataFile("/data/file_b.parquet", /*partition_x=*/2L); } - // A plain data file in spec 0 (identity(x)) with the given partition value for x. std::shared_ptr MakeDataFile(const std::string& path, int64_t partition_x, int64_t record_count = 100) { auto f = std::make_shared(); @@ -87,8 +83,7 @@ class OverwriteFilesTest : public UpdateTestBase { return f; } - // A data file carrying column metrics for column y (field id 2): lower/upper bounds - // plus value/null counts, so the StrictMetricsEvaluator can reason about it. + // Add y metrics so StrictMetricsEvaluator can prove row-filter containment. std::shared_ptr MakeDataFileWithYBounds(const std::string& path, int64_t partition_x, int64_t y_lower, int64_t y_upper) { @@ -106,7 +101,6 @@ class OverwriteFilesTest : public UpdateTestBase { return f; } - // An equality delete file in spec 0 (partition x = partition_x). std::shared_ptr MakeEqualityDeleteFile(const std::string& path, int64_t partition_x) { auto f = MakeDeleteFile(path, partition_x); @@ -115,9 +109,7 @@ class OverwriteFilesTest : public UpdateTestBase { return f; } - // Write a delete manifest containing the given delete files, with the snapshot id and - // sequence number assigned on each entry (so the manifest list writer does not need to - // inherit them). + // Entries carry assigned snapshot and sequence numbers. Result WriteDeleteManifest( const std::string& path, const std::vector>& files, int64_t snapshot_id, int64_t sequence_number) { @@ -137,8 +129,6 @@ class OverwriteFilesTest : public UpdateTestBase { return writer->ToManifestFile(); } - // Build a synthetic snapshot from the given manifests (mirrors the merging-update test - // harness). Used to inject a concurrent commit between read and validate. Result> MakeSyntheticSnapshot( std::string operation, int64_t snapshot_id, std::optional parent_snapshot_id, int64_t sequence_number, @@ -161,10 +151,7 @@ class OverwriteFilesTest : public UpdateTestBase { return std::shared_ptr(std::move(snapshot)); } - // Inject a concurrent snapshot that adds an equality delete file covering partition - // `partition_x`, layered on top of `parent`. Returns the metadata containing the new - // snapshot (as current) plus the new snapshot itself, so a subsequent Validate(...) can - // scan the range (parent, new] for new deletes. + // Build metadata with a synthetic concurrent equality delete after `parent`. struct ConcurrentDelete { std::shared_ptr metadata; std::shared_ptr snapshot; @@ -199,7 +186,6 @@ class OverwriteFilesTest : public UpdateTestBase { return table_->NewOverwrite(); } - // Commit file_a_ with FastAppend and refresh the table; returns its snapshot id. int64_t CommitFileA() { auto fa = table_->NewFastAppend(); EXPECT_TRUE(fa.has_value()); @@ -211,7 +197,6 @@ class OverwriteFilesTest : public UpdateTestBase { return snap.value()->snapshot_id; } - // Append a single file via FastAppend and refresh; returns the new snapshot. std::shared_ptr CommitFastAppend(const std::shared_ptr& file) { auto fa = table_->NewFastAppend(); EXPECT_TRUE(fa.has_value()); @@ -406,11 +391,7 @@ TEST_F(OverwriteFilesTest, CommitCustomSummaryProperty) { EXPECT_EQ(snapshot->summary.at("custom-prop"), "custom-value"); } -// A DataFileSet + DeleteFileSet forwards data files to DeleteDataFile and delete files -// to DeleteDeleteFile; the committed snapshot reflects the data-file removal. (The -// delete file is forwarded to DeleteDeleteFile; with no matching committed delete file -// present its removal is a harmless no-op, mirroring the inherited missing-delete -// behavior.) +// With no matching committed delete file, deleting `del_file` is a harmless no-op. TEST_F(OverwriteFilesTest, BulkDeleteFilesRemovesDataAndDeleteFiles) { { ICEBERG_UNWRAP_OR_FAIL(auto seed, NewOverwrite()); @@ -475,7 +456,30 @@ TEST_F(OverwriteFilesTest, BulkDeleteFilesEquivalentToRepeatedDeleteFile) { EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kTotalDataFiles), "0"); } -// DeleteFiles validates content because both input sets store DataFile pointers. +// OverwriteFiles validates content because the C++ API stores data and delete files in +// DataFile pointers, while Java uses separate DataFile/DeleteFile types. +TEST_F(OverwriteFilesTest, AddFileRejectsDeleteFileContent) { + auto del_file = MakeDeleteFile("/delete/del_as_data.parquet", 1L); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->AddFile(del_file); + auto result = op->Commit(); + EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(result, HasErrorMessage("Invalid data file to add")); + EXPECT_THAT(result, HasErrorMessage("has delete-file content")); +} + +TEST_F(OverwriteFilesTest, DeleteFileRejectsDeleteFileContent) { + auto del_file = MakeDeleteFile("/delete/del_as_delete.parquet", 1L); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->DeleteFile(del_file); + auto result = op->Commit(); + EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(result, HasErrorMessage("Invalid data file to delete")); + EXPECT_THAT(result, HasErrorMessage("has delete-file content")); +} + TEST_F(OverwriteFilesTest, BulkDeleteFilesRejectsDeleteFileInDataSet) { auto del_file = MakeDeleteFile("/delete/del_a.parquet", 1L); // content = positionDeletes @@ -511,8 +515,6 @@ TEST_F(OverwriteFilesTest, BulkDeleteFilesAcceptsEqualityDeleteInDeleteSet) { EXPECT_THAT(op->Commit(), IsOk()); } -// ValidateNoConflictingData: a competing FastAppend that added a data file matching the -// resolved conflict-detection filter makes the overwrite commit fail. TEST_F(OverwriteFilesTest, ValidateNoConflictingDataDetectsConflictingAdd) { const int64_t first_id = CommitFileA(); CommitFastAppend(file_b_); @@ -543,8 +545,6 @@ TEST_F(OverwriteFilesTest, ValidateNoConflictingDataAllowsNonConflictingChange) EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kOperation), expected_operation); } -// ValidateNoConflictingDeletes: a competing snapshot that deleted a data file in the -// overwrite range makes the commit fail. TEST_F(OverwriteFilesTest, ValidateNoConflictingDeletesDetectsConflictingDelete) { const int64_t first_id = CommitFileA(); @@ -563,10 +563,8 @@ TEST_F(OverwriteFilesTest, ValidateNoConflictingDeletesDetectsConflictingDelete) EXPECT_THAT(op->Commit(), IsError(ErrorKind::kValidationFailed)); } -// ValidateNoConflictingDeletes allows a non-conflicting concurrent append to commit. TEST_F(OverwriteFilesTest, ValidateNoConflictingDeletesAllowsNonConflictingChange) { const int64_t first_id = CommitFileA(); - // Competing append in partition x=2 (no deletes in the x=1 range). CommitFastAppend(file_b_); ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); @@ -577,8 +575,7 @@ TEST_F(OverwriteFilesTest, ValidateNoConflictingDeletesAllowsNonConflictingChang EXPECT_THAT(op->Commit(), IsOk()); } -// Explicit replaced-file validation applies ConflictDetectionFilter to concurrent delete -// files that cover replaced data files. +// Explicit replaced-file validation checks concurrent deletes covering replaced files. TEST_F(OverwriteFilesTest, PathBExplicitDeletesDetectsConcurrentDeleteWithoutFilter) { CommitFileA(); ICEBERG_UNWRAP_OR_FAIL(auto first_snapshot, table_->current_snapshot()); @@ -595,8 +592,7 @@ TEST_F(OverwriteFilesTest, PathBExplicitDeletesDetectsConcurrentDeleteWithoutFil IsError(ErrorKind::kValidationFailed)); } -// A conflict filter that does not cover the replaced file's partition narrows the scope, -// so the concurrent delete is filtered out and validation succeeds. +// A narrower conflict filter can exclude the concurrent delete. TEST_F(OverwriteFilesTest, PathBExplicitDeletesConflictFilterNarrowsScope) { CommitFileA(); ICEBERG_UNWRAP_OR_FAIL(auto first_snapshot, table_->current_snapshot()); @@ -613,10 +609,6 @@ TEST_F(OverwriteFilesTest, PathBExplicitDeletesConflictFilterNarrowsScope) { EXPECT_THAT(op->Validate(*concurrent.metadata, concurrent.snapshot), IsOk()); } -// These exercise OverwriteFiles::Validate(...) directly (the same entry point the commit -// kernel invokes), which is sufficient and deterministic: the strict-range branch does -// not depend on concurrent snapshots. - TEST_F(OverwriteFilesTest, StrictRangeAcceptedByStrictProjection) { ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); op->OverwriteByRowFilter(Expressions::Equal("x", Literal::Long(1L))); @@ -625,8 +617,6 @@ TEST_F(OverwriteFilesTest, StrictRangeAcceptedByStrictProjection) { EXPECT_THAT(op->Validate(*table_->metadata(), /*snapshot=*/nullptr), IsOk()); } -// Strict partition projection is insufficient (filter on a non-partition column) but the -// StrictMetricsEvaluator proves containment from the file's bounds. TEST_F(OverwriteFilesTest, StrictRangeAcceptedByStrictMetrics) { ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); op->OverwriteByRowFilter(Expressions::Equal("y", Literal::Long(5L))); @@ -636,7 +626,6 @@ TEST_F(OverwriteFilesTest, StrictRangeAcceptedByStrictMetrics) { EXPECT_THAT(op->Validate(*table_->metadata(), /*snapshot=*/nullptr), IsOk()); } -// Neither the strict projection nor the metrics can prove containment. TEST_F(OverwriteFilesTest, StrictRangeRejectedWhenNotProvable) { ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); op->OverwriteByRowFilter(Expressions::Equal("y", Literal::Long(5L))); @@ -664,10 +653,8 @@ TEST_F(OverwriteFilesTest, StrictRangeRequiresRowFilter) { IsError(ErrorKind::kValidationFailed)); } -// ValidateAddedFilesMatchOverwriteFilter resolves one data spec for all added files. TEST_F(OverwriteFilesTest, StrictRangeRejectsMultiplePartitionSpecs) { - // Add a second partition spec before creating the builder, - // so the producer's base metadata can resolve both specs when files are staged. + // Add the second spec before creating the builder so staged files can resolve it. ICEBERG_UNWRAP_OR_FAIL( auto spec1, PartitionSpec::Make(*schema_, /*spec_id=*/1, {PartitionField(/*source_id=*/1, /*field_id=*/1001, @@ -699,9 +686,7 @@ TEST_F(OverwriteFilesTest, StrictRangeRejectsEmptyAddedFiles) { IsError(ErrorKind::kInvalidArgument)); } -// Case-insensitive binding accepts a differently-cased column name where the -// case-sensitive (default) binding rejects it. Observed through the strict-range -// validation, which binds the row filter using the configured case sensitivity. +// Strict-range validation binds the row filter with the configured case sensitivity. TEST_F(OverwriteFilesTest, CaseSensitivityAffectsFilterBinding) { { ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); @@ -721,8 +706,6 @@ TEST_F(OverwriteFilesTest, CaseSensitivityAffectsFilterBinding) { } } -// Null arguments to the pointer-taking builder methods surface at Commit() without -// crashing. TEST_F(OverwriteFilesTest, NullAddFileRejectedAtCommit) { ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); op->AddFile(nullptr); @@ -764,8 +747,7 @@ TEST_F(OverwriteFilesTest, NullArgumentDoesNotCrashBuilderChain) { EXPECT_THAT(result, HasErrorMessage("Invalid data file: null")); } -// A null element cannot enter a DataFileSet / DeleteFileSet (insert() rejects nullptr), -// so DeleteFiles({null...}, {null...}) is a no-op rather than an error. +// DataFileSet/DeleteFileSet reject nullptr on insert. TEST_F(OverwriteFilesTest, DeleteFilesNullElementsCannotEnterSets) { DataFileSet data_files; data_files.insert(std::shared_ptr{nullptr}); @@ -780,9 +762,7 @@ TEST_F(OverwriteFilesTest, DeleteFilesNullElementsCannotEnterSets) { EXPECT_THAT(op->Commit(), IsOk()); } -// ValidateFromSnapshot accepts a non-negative id (0 is in the generated id range -// [0, INT64_MAX]) and rejects a negative id (including kInvalidSnapshotId == -1) as a -// deferred error surfaced at Commit(). +// Snapshot id 0 is valid; negative ids are rejected as builder errors. TEST_F(OverwriteFilesTest, ValidateFromSnapshotRejectsNegativeSnapshotId) { ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); op->AddFile(file_a_).ValidateFromSnapshot(-1); @@ -797,11 +777,6 @@ TEST_F(OverwriteFilesTest, ValidateFromSnapshotAcceptsZeroSnapshotId) { EXPECT_THAT(op->Commit(), IsOk()); } -// AddedDataFiles() / deleted_data_files_ are not publicly observable, so the properties -// are checked indirectly through operation(): for any non-null file, AddFile-only yields -// `append` (the file was added and the row filter is untouched), while DeleteFile-only -// and DeleteFiles-only (data portion) yield `delete` (the file was registered for -// deletion and DeletesDataFiles() became true). TEST_F(OverwriteFilesTest, PropertyBuilderForwardingAndDualTracking) { const std::vector> files = { MakeDataFile("/data/p0.parquet", 1L), @@ -903,10 +878,7 @@ TEST_F(OverwriteFilesTest, PropertyOperationTruthTable) { } } -// DataConflictDetectionFilter() is private, so its three resolution outcomes are observed -// indirectly through ValidateNoConflictingData against a competing concurrent add of -// file_b (partition x=2): -// explicit filter, row filter only, and explicit file replacement. +// Exercise explicit filter, row-filter-only, and explicit file replacement resolution. TEST_F(OverwriteFilesTest, PropertyConflictFilterResolution) { { const int64_t first_id = CommitFileA(); diff --git a/src/iceberg/update/overwrite_files.cc b/src/iceberg/update/overwrite_files.cc index 4e5dbfeef..8b493babb 100644 --- a/src/iceberg/update/overwrite_files.cc +++ b/src/iceberg/update/overwrite_files.cc @@ -56,12 +56,18 @@ OverwriteFiles::~OverwriteFiles() = default; OverwriteFiles& OverwriteFiles::AddFile(const std::shared_ptr& file) { ICEBERG_BUILDER_CHECK(file != nullptr, "Invalid data file: null"); + ICEBERG_BUILDER_CHECK(file->content == DataFile::Content::kData, + "Invalid data file to add: {} has delete-file content", + file->file_path); ICEBERG_BUILDER_RETURN_IF_ERROR(AddDataFile(file)); return *this; } OverwriteFiles& OverwriteFiles::DeleteFile(const std::shared_ptr& file) { ICEBERG_BUILDER_CHECK(file != nullptr, "Invalid data file: null"); + ICEBERG_BUILDER_CHECK(file->content == DataFile::Content::kData, + "Invalid data file to delete: {} has delete-file content", + file->file_path); deleted_data_files_.insert(file); ICEBERG_BUILDER_RETURN_IF_ERROR(DeleteDataFile(file)); return *this; From 23699bb411fba82266cb3693d78e1a2fa9964a1d Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Wed, 24 Jun 2026 10:26:22 +0800 Subject: [PATCH 5/5] polish comments and logic to align with java --- src/iceberg/table.cc | 6 +- src/iceberg/table.h | 2 + src/iceberg/test/overwrite_files_test.cc | 472 +++++------------- src/iceberg/test/table_test.cc | 1 + src/iceberg/update/delete_files.h | 47 +- src/iceberg/update/fast_append.h | 37 +- src/iceberg/update/merge_append.h | 28 +- src/iceberg/update/merging_snapshot_update.cc | 7 +- src/iceberg/update/merging_snapshot_update.h | 30 +- src/iceberg/update/overwrite_files.cc | 155 +++--- src/iceberg/update/overwrite_files.h | 144 ++++-- src/iceberg/update/row_delta.h | 13 +- src/iceberg/update/snapshot_update.h | 43 +- 13 files changed, 423 insertions(+), 562 deletions(-) diff --git a/src/iceberg/table.cc b/src/iceberg/table.cc index 1365c0565..9dbc5acf7 100644 --- a/src/iceberg/table.cc +++ b/src/iceberg/table.cc @@ -35,8 +35,8 @@ #include "iceberg/update/expire_snapshots.h" #include "iceberg/update/fast_append.h" #include "iceberg/update/merge_append.h" -#include "iceberg/update/row_delta.h" #include "iceberg/update/overwrite_files.h" +#include "iceberg/update/row_delta.h" #include "iceberg/update/set_snapshot.h" #include "iceberg/update/snapshot_manager.h" #include "iceberg/update/update_location.h" @@ -356,6 +356,10 @@ Result> StaticTable::NewRowDelta() { return NotSupported("Cannot create a row delta for a static table"); } +Result> StaticTable::NewOverwrite() { + return NotSupported("Cannot create an overwrite for a static table"); +} + Result> StaticTable::NewSnapshotManager() { return NotSupported("Cannot create a snapshot manager for a static table"); } diff --git a/src/iceberg/table.h b/src/iceberg/table.h index 536c97ad7..64ed21ef8 100644 --- a/src/iceberg/table.h +++ b/src/iceberg/table.h @@ -261,6 +261,8 @@ class ICEBERG_EXPORT StaticTable : public Table { Result> NewRowDelta() override; + Result> NewOverwrite() override; + Result> NewSnapshotManager() override; private: diff --git a/src/iceberg/test/overwrite_files_test.cc b/src/iceberg/test/overwrite_files_test.cc index bbdc06d13..5d4d851f1 100644 --- a/src/iceberg/test/overwrite_files_test.cc +++ b/src/iceberg/test/overwrite_files_test.cc @@ -19,7 +19,6 @@ #include "iceberg/update/overwrite_files.h" -#include #include #include #include @@ -30,9 +29,6 @@ #include "iceberg/avro/avro_register.h" #include "iceberg/constants.h" #include "iceberg/expression/expressions.h" -#include "iceberg/manifest/manifest_entry.h" -#include "iceberg/manifest/manifest_list.h" -#include "iceberg/manifest/manifest_writer.h" #include "iceberg/partition_field.h" #include "iceberg/partition_spec.h" #include "iceberg/row/partition_values.h" @@ -45,6 +41,7 @@ #include "iceberg/transaction.h" #include "iceberg/transform.h" #include "iceberg/update/fast_append.h" +#include "iceberg/update/row_delta.h" #include "iceberg/util/data_file_set.h" #include "iceberg/util/macros.h" @@ -109,77 +106,12 @@ class OverwriteFilesTest : public UpdateTestBase { return f; } - // Entries carry assigned snapshot and sequence numbers. - Result WriteDeleteManifest( - const std::string& path, const std::vector>& files, - int64_t snapshot_id, int64_t sequence_number) { - ICEBERG_ASSIGN_OR_RAISE( - auto writer, - ManifestWriter::MakeWriter(/*format_version=*/2, snapshot_id, path, file_io_, - spec_, schema_, ManifestContent::kDeletes)); - for (const auto& f : files) { - ManifestEntry entry; - entry.status = ManifestStatus::kAdded; - entry.snapshot_id = snapshot_id; - entry.sequence_number = sequence_number; - entry.data_file = f; - ICEBERG_RETURN_UNEXPECTED(writer->WriteAddedEntry(entry)); - } - ICEBERG_RETURN_UNEXPECTED(writer->Close()); - return writer->ToManifestFile(); - } - - Result> MakeSyntheticSnapshot( - std::string operation, int64_t snapshot_id, - std::optional parent_snapshot_id, int64_t sequence_number, - const std::vector& manifests) { - auto manifest_list_path = table_location_ + "/metadata/manifest-list-" + - std::to_string(snapshot_id) + ".avro"; - ICEBERG_ASSIGN_OR_RAISE( - auto writer, - ManifestListWriter::MakeWriter(table_->metadata()->format_version, snapshot_id, - parent_snapshot_id, manifest_list_path, file_io_, - sequence_number)); - ICEBERG_RETURN_UNEXPECTED(writer->AddAll(manifests)); - ICEBERG_RETURN_UNEXPECTED(writer->Close()); - - ICEBERG_ASSIGN_OR_RAISE( - auto snapshot, - Snapshot::Make(sequence_number, snapshot_id, parent_snapshot_id, TimePointMs{}, - std::move(operation), {}, table_->metadata()->current_schema_id, - manifest_list_path)); - return std::shared_ptr(std::move(snapshot)); - } - - // Build metadata with a synthetic concurrent equality delete after `parent`. - struct ConcurrentDelete { - std::shared_ptr metadata; - std::shared_ptr snapshot; - }; - ConcurrentDelete InjectConcurrentEqualityDelete(const std::shared_ptr& parent, - const std::string& delete_path, - int64_t partition_x) { - ConcurrentDelete out; + void CommitEqualityDelete(const std::string& delete_path, int64_t partition_x) { auto del_file = MakeEqualityDeleteFile(delete_path, partition_x); - const int64_t new_snapshot_id = parent->snapshot_id + 1000; - const int64_t new_sequence_number = parent->sequence_number + 1; - auto manifest = - WriteDeleteManifest(table_location_ + "/metadata/concurrent-del-manifest.avro", - {del_file}, new_snapshot_id, new_sequence_number); - EXPECT_TRUE(manifest.has_value()); - auto snap = MakeSyntheticSnapshot(DataOperation::kOverwrite, new_snapshot_id, - parent->snapshot_id, new_sequence_number, - {manifest.value()}); - if (!snap.has_value()) { - ADD_FAILURE() << "MakeSyntheticSnapshot failed: " << snap.error().message; - } - EXPECT_TRUE(snap.has_value()); - out.snapshot = snap.value(); - out.metadata = std::make_shared(*table_->metadata()); - out.metadata->snapshots.push_back(out.snapshot); - out.metadata->current_snapshot_id = out.snapshot->snapshot_id; - out.metadata->last_sequence_number = out.snapshot->sequence_number; - return out; + ICEBERG_UNWRAP_OR_FAIL(auto row_delta, table_->NewRowDelta()); + row_delta->AddDeletes(del_file); + EXPECT_THAT(row_delta->Commit(), IsOk()); + EXPECT_THAT(table_->Refresh(), IsOk()); } Result> NewOverwrite() { @@ -214,12 +146,7 @@ class OverwriteFilesTest : public UpdateTestBase { std::shared_ptr file_b_; }; -TEST_F(OverwriteFilesTest, TableNewOverwriteReturnsBuilder) { - ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); - ASSERT_NE(op, nullptr); -} - -TEST_F(OverwriteFilesTest, TransactionNewOverwriteReturnsBuilder) { +TEST_F(OverwriteFilesTest, TxnNewOverwrite) { ICEBERG_UNWRAP_OR_FAIL(auto txn, Transaction::Make(table_, TransactionKind::kUpdate)); ICEBERG_UNWRAP_OR_FAIL(auto op, txn->NewOverwrite()); ASSERT_NE(op, nullptr); @@ -234,61 +161,7 @@ TEST_F(OverwriteFilesTest, TransactionNewOverwriteReturnsBuilder) { DataOperation::kOverwrite); } -TEST_F(OverwriteFilesTest, BuilderMethodsReturnSelfAndChain) { - ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); - auto* self = op.get(); - - EXPECT_EQ(&op->AddFile(file_a_), self); - EXPECT_EQ(&op->DeleteFile(file_b_), self); - EXPECT_EQ(&op->OverwriteByRowFilter(Expressions::Equal("x", Literal::Long(1L))), self); - EXPECT_EQ(&op->ValidateFromSnapshot(42), self); - EXPECT_EQ(&op->ConflictDetectionFilter(Expressions::Equal("x", Literal::Long(1L))), - self); - EXPECT_EQ(&op->ValidateNoConflictingData(), self); - EXPECT_EQ(&op->ValidateNoConflictingDeletes(), self); - EXPECT_EQ(&op->ValidateAddedFilesMatchOverwriteFilter(), self); - EXPECT_EQ(&op->WithCaseSensitivity(false), self); - - OverwriteFiles& chained = - (*op) - .AddFile(MakeDataFile("/data/chain.parquet", 1L)) - .OverwriteByRowFilter(Expressions::Equal("x", Literal::Long(1L))) - .ValidateNoConflictingData(); - EXPECT_EQ(&chained, self); -} - -TEST_F(OverwriteFilesTest, OperationAddOnlyIsAppend) { - ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); - op->AddFile(file_a_); - EXPECT_EQ(op->operation(), DataOperation::kAppend); -} - -TEST_F(OverwriteFilesTest, OperationDeleteOnlyIsDelete) { - ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); - op->DeleteFile(file_a_); - EXPECT_EQ(op->operation(), DataOperation::kDelete); -} - -TEST_F(OverwriteFilesTest, OperationAddAndDeleteIsOverwrite) { - ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); - op->DeleteFile(file_a_); - op->AddFile(file_b_); - EXPECT_EQ(op->operation(), DataOperation::kOverwrite); -} - -TEST_F(OverwriteFilesTest, OperationPureRowFilterAndAddIsOverwrite) { - ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); - op->OverwriteByRowFilter(Expressions::Equal("x", Literal::Long(1L))); - op->AddFile(file_b_); - EXPECT_EQ(op->operation(), DataOperation::kOverwrite); -} - -TEST_F(OverwriteFilesTest, OperationNeitherIsOverwrite) { - ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); - EXPECT_EQ(op->operation(), DataOperation::kOverwrite); -} - -TEST_F(OverwriteFilesTest, CommitDeleteAndAddIsOverwrite) { +TEST_F(OverwriteFilesTest, DeleteAndAddCommit) { CommitFileA(); ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); @@ -305,7 +178,7 @@ TEST_F(OverwriteFilesTest, CommitDeleteAndAddIsOverwrite) { EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kDeletedDataFiles), "1"); } -TEST_F(OverwriteFilesTest, CommitRowFilterAndAddIsOverwrite) { +TEST_F(OverwriteFilesTest, RowFilterAndAddCommit) { CommitFileA(); ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); @@ -319,7 +192,7 @@ TEST_F(OverwriteFilesTest, CommitRowFilterAndAddIsOverwrite) { DataOperation::kOverwrite); } -TEST_F(OverwriteFilesTest, CommitEmptyOverwrite) { +TEST_F(OverwriteFilesTest, EmptyCommit) { CommitFileA(); ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); @@ -332,7 +205,7 @@ TEST_F(OverwriteFilesTest, CommitEmptyOverwrite) { DataOperation::kOverwrite); } -TEST_F(OverwriteFilesTest, CommitDeduplicatesDuplicateAddAndDelete) { +TEST_F(OverwriteFilesTest, DeduplicatesFiles) { CommitFileA(); ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); @@ -349,7 +222,7 @@ TEST_F(OverwriteFilesTest, CommitDeduplicatesDuplicateAddAndDelete) { EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kDeletedDataFiles), "1"); } -TEST_F(OverwriteFilesTest, CommitStageOnlyDoesNotAdvanceCurrentSnapshot) { +TEST_F(OverwriteFilesTest, StageOnly) { const int64_t base_snapshot_id = CommitFileA(); const size_t base_snapshot_count = table_->metadata()->snapshots.size(); @@ -366,7 +239,7 @@ TEST_F(OverwriteFilesTest, CommitStageOnlyDoesNotAdvanceCurrentSnapshot) { EXPECT_GT(table_->metadata()->snapshots.size(), base_snapshot_count); } -TEST_F(OverwriteFilesTest, CommitToTargetBranch) { +TEST_F(OverwriteFilesTest, TargetBranch) { CommitFileA(); ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); @@ -378,7 +251,7 @@ TEST_F(OverwriteFilesTest, CommitToTargetBranch) { EXPECT_TRUE(table_->metadata()->refs.contains("audit")); } -TEST_F(OverwriteFilesTest, CommitCustomSummaryProperty) { +TEST_F(OverwriteFilesTest, CustomSummary) { CommitFileA(); ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); @@ -392,7 +265,7 @@ TEST_F(OverwriteFilesTest, CommitCustomSummaryProperty) { } // With no matching committed delete file, deleting `del_file` is a harmless no-op. -TEST_F(OverwriteFilesTest, BulkDeleteFilesRemovesDataAndDeleteFiles) { +TEST_F(OverwriteFilesTest, BulkDeleteCommit) { { ICEBERG_UNWRAP_OR_FAIL(auto seed, NewOverwrite()); seed->AddFile(file_a_); @@ -419,22 +292,7 @@ TEST_F(OverwriteFilesTest, BulkDeleteFilesRemovesDataAndDeleteFiles) { EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kAddedDataFiles), "1"); } -TEST_F(OverwriteFilesTest, BulkDeleteFilesEmptySetsAreNoOp) { - ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); - op->DeleteFiles(DataFileSet{}, DeleteFileSet{}); - EXPECT_EQ(op->operation(), DataOperation::kOverwrite); // neither adds nor deletes -} - -TEST_F(OverwriteFilesTest, BulkDeleteFilesDataPortionMarksDelete) { - ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); - DataFileSet data_files; - data_files.insert(file_a_); - data_files.insert(file_b_); - op->DeleteFiles(data_files, DeleteFileSet{}); - EXPECT_EQ(op->operation(), DataOperation::kDelete); -} - -TEST_F(OverwriteFilesTest, BulkDeleteFilesEquivalentToRepeatedDeleteFile) { +TEST_F(OverwriteFilesTest, BulkDeleteData) { { ICEBERG_UNWRAP_OR_FAIL(auto seed, NewOverwrite()); seed->AddFile(file_a_); @@ -458,7 +316,7 @@ TEST_F(OverwriteFilesTest, BulkDeleteFilesEquivalentToRepeatedDeleteFile) { // OverwriteFiles validates content because the C++ API stores data and delete files in // DataFile pointers, while Java uses separate DataFile/DeleteFile types. -TEST_F(OverwriteFilesTest, AddFileRejectsDeleteFileContent) { +TEST_F(OverwriteFilesTest, AddRejectsDeleteContent) { auto del_file = MakeDeleteFile("/delete/del_as_data.parquet", 1L); ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); @@ -469,7 +327,7 @@ TEST_F(OverwriteFilesTest, AddFileRejectsDeleteFileContent) { EXPECT_THAT(result, HasErrorMessage("has delete-file content")); } -TEST_F(OverwriteFilesTest, DeleteFileRejectsDeleteFileContent) { +TEST_F(OverwriteFilesTest, DeleteRejectsDeleteContent) { auto del_file = MakeDeleteFile("/delete/del_as_delete.parquet", 1L); ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); @@ -480,7 +338,7 @@ TEST_F(OverwriteFilesTest, DeleteFileRejectsDeleteFileContent) { EXPECT_THAT(result, HasErrorMessage("has delete-file content")); } -TEST_F(OverwriteFilesTest, BulkDeleteFilesRejectsDeleteFileInDataSet) { +TEST_F(OverwriteFilesTest, BulkRejectsDeleteAsData) { auto del_file = MakeDeleteFile("/delete/del_a.parquet", 1L); // content = positionDeletes DataFileSet data_files; @@ -493,7 +351,7 @@ TEST_F(OverwriteFilesTest, BulkDeleteFilesRejectsDeleteFileInDataSet) { EXPECT_THAT(result, HasErrorMessage("has delete-file content")); } -TEST_F(OverwriteFilesTest, BulkDeleteFilesRejectsDataFileInDeleteSet) { +TEST_F(OverwriteFilesTest, BulkRejectsDataAsDelete) { DeleteFileSet delete_files; delete_files.insert(file_a_); // content = kData @@ -504,7 +362,7 @@ TEST_F(OverwriteFilesTest, BulkDeleteFilesRejectsDataFileInDeleteSet) { EXPECT_THAT(result, HasErrorMessage("has data-file content")); } -TEST_F(OverwriteFilesTest, BulkDeleteFilesAcceptsEqualityDeleteInDeleteSet) { +TEST_F(OverwriteFilesTest, BulkAcceptsEqualityDelete) { auto eq_delete = MakeEqualityDeleteFile("/delete/eq_a.parquet", 1L); DeleteFileSet delete_files; delete_files.insert(eq_delete); @@ -515,37 +373,7 @@ TEST_F(OverwriteFilesTest, BulkDeleteFilesAcceptsEqualityDeleteInDeleteSet) { EXPECT_THAT(op->Commit(), IsOk()); } -TEST_F(OverwriteFilesTest, ValidateNoConflictingDataDetectsConflictingAdd) { - const int64_t first_id = CommitFileA(); - CommitFastAppend(file_b_); - - ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); - op->OverwriteByRowFilter(Expressions::Equal("x", Literal::Long(2L))); - op->AddFile(MakeDataFile("/data/replacement_x2.parquet", 2L)); - op->ValidateFromSnapshot(first_id); - op->ValidateNoConflictingData(); - EXPECT_THAT(op->Commit(), IsError(ErrorKind::kValidationFailed)); -} - -TEST_F(OverwriteFilesTest, ValidateNoConflictingDataAllowsNonConflictingChange) { - const int64_t first_id = CommitFileA(); - CommitFastAppend(file_b_); - - ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); - op->OverwriteByRowFilter(Expressions::Equal("x", Literal::Long(1L))); - op->AddFile(MakeDataFile("/data/replacement_x1.parquet", 1L)); - op->ValidateFromSnapshot(first_id); - op->ValidateNoConflictingData(); - op->ValidateNoConflictingDeletes(); - const std::string expected_operation = op->operation(); - EXPECT_THAT(op->Commit(), IsOk()); - - EXPECT_THAT(table_->Refresh(), IsOk()); - ICEBERG_UNWRAP_OR_FAIL(auto snapshot, table_->current_snapshot()); - EXPECT_EQ(snapshot->summary.at(SnapshotSummaryFields::kOperation), expected_operation); -} - -TEST_F(OverwriteFilesTest, ValidateNoConflictingDeletesDetectsConflictingDelete) { +TEST_F(OverwriteFilesTest, NoConflictingDeletesFails) { const int64_t first_id = CommitFileA(); { @@ -563,7 +391,7 @@ TEST_F(OverwriteFilesTest, ValidateNoConflictingDeletesDetectsConflictingDelete) EXPECT_THAT(op->Commit(), IsError(ErrorKind::kValidationFailed)); } -TEST_F(OverwriteFilesTest, ValidateNoConflictingDeletesAllowsNonConflictingChange) { +TEST_F(OverwriteFilesTest, NoConflictingDeletesPasses) { const int64_t first_id = CommitFileA(); CommitFastAppend(file_b_); @@ -576,29 +404,24 @@ TEST_F(OverwriteFilesTest, ValidateNoConflictingDeletesAllowsNonConflictingChang } // Explicit replaced-file validation checks concurrent deletes covering replaced files. -TEST_F(OverwriteFilesTest, PathBExplicitDeletesDetectsConcurrentDeleteWithoutFilter) { +TEST_F(OverwriteFilesTest, ExplicitDeleteConflict) { CommitFileA(); ICEBERG_UNWRAP_OR_FAIL(auto first_snapshot, table_->current_snapshot()); - - auto concurrent = InjectConcurrentEqualityDelete( - first_snapshot, "/delete/concurrent_x1.parquet", /*partition_x=*/1L); + CommitEqualityDelete("/delete/concurrent_x1.parquet", /*partition_x=*/1L); ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); op->DeleteFile(file_a_); op->AddFile(MakeDataFile("/data/rewrite_x1.parquet", 1L)); op->ValidateFromSnapshot(first_snapshot->snapshot_id); op->ValidateNoConflictingDeletes(); - EXPECT_THAT(op->Validate(*concurrent.metadata, concurrent.snapshot), - IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(op->Commit(), IsError(ErrorKind::kValidationFailed)); } // A narrower conflict filter can exclude the concurrent delete. -TEST_F(OverwriteFilesTest, PathBExplicitDeletesConflictFilterNarrowsScope) { +TEST_F(OverwriteFilesTest, ExplicitDeleteFilterScope) { CommitFileA(); ICEBERG_UNWRAP_OR_FAIL(auto first_snapshot, table_->current_snapshot()); - - auto concurrent = InjectConcurrentEqualityDelete( - first_snapshot, "/delete/concurrent_x1.parquet", /*partition_x=*/1L); + CommitEqualityDelete("/delete/concurrent_x1.parquet", /*partition_x=*/1L); ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); op->DeleteFile(file_a_); @@ -606,54 +429,60 @@ TEST_F(OverwriteFilesTest, PathBExplicitDeletesConflictFilterNarrowsScope) { op->ValidateFromSnapshot(first_snapshot->snapshot_id); op->ConflictDetectionFilter(Expressions::Equal("x", Literal::Long(2L))); op->ValidateNoConflictingDeletes(); - EXPECT_THAT(op->Validate(*concurrent.metadata, concurrent.snapshot), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); } -TEST_F(OverwriteFilesTest, StrictRangeAcceptedByStrictProjection) { +TEST_F(OverwriteFilesTest, StrictRangeByProjection) { ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); op->OverwriteByRowFilter(Expressions::Equal("x", Literal::Long(1L))); op->AddFile(MakeDataFile("/data/in_partition.parquet", 1L)); op->ValidateAddedFilesMatchOverwriteFilter(); - EXPECT_THAT(op->Validate(*table_->metadata(), /*snapshot=*/nullptr), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); } -TEST_F(OverwriteFilesTest, StrictRangeAcceptedByStrictMetrics) { +TEST_F(OverwriteFilesTest, StrictRangeByMetrics) { ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); op->OverwriteByRowFilter(Expressions::Equal("y", Literal::Long(5L))); // y bounds [5, 5] prove every row has y == 5. op->AddFile(MakeDataFileWithYBounds("/data/y_eq_5.parquet", 1L, 5L, 5L)); op->ValidateAddedFilesMatchOverwriteFilter(); - EXPECT_THAT(op->Validate(*table_->metadata(), /*snapshot=*/nullptr), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); } -TEST_F(OverwriteFilesTest, StrictRangeRejectedWhenNotProvable) { +TEST_F(OverwriteFilesTest, StrictRangeRejectsPartialMetrics) { ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); op->OverwriteByRowFilter(Expressions::Equal("y", Literal::Long(5L))); // y bounds [1, 10] do not prove every row has y == 5. op->AddFile(MakeDataFileWithYBounds("/data/y_range.parquet", 1L, 1L, 10L)); op->ValidateAddedFilesMatchOverwriteFilter(); - EXPECT_THAT(op->Validate(*table_->metadata(), /*snapshot=*/nullptr), - IsError(ErrorKind::kValidationFailed)); + auto result = op->Commit(); + EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(result, + HasErrorMessage("Cannot append file with rows that do not match filter")); } -TEST_F(OverwriteFilesTest, StrictRangeRejectsFileOutsidePartitionRange) { +TEST_F(OverwriteFilesTest, StrictRangeRejectsOutsidePartition) { ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); op->OverwriteByRowFilter(Expressions::Equal("x", Literal::Long(1L))); op->AddFile(MakeDataFile("/data/wrong_partition.parquet", /*partition_x=*/2L)); op->ValidateAddedFilesMatchOverwriteFilter(); - EXPECT_THAT(op->Validate(*table_->metadata(), /*snapshot=*/nullptr), - IsError(ErrorKind::kValidationFailed)); + auto result = op->Commit(); + EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(result, + HasErrorMessage("Cannot append file with rows that do not match filter")); } -TEST_F(OverwriteFilesTest, StrictRangeRequiresRowFilter) { +TEST_F(OverwriteFilesTest, StrictRangeRequiresFilter) { ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); op->AddFile(MakeDataFile("/data/no_filter.parquet", 1L)); op->ValidateAddedFilesMatchOverwriteFilter(); - EXPECT_THAT(op->Validate(*table_->metadata(), /*snapshot=*/nullptr), - IsError(ErrorKind::kValidationFailed)); + auto result = op->Commit(); + EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(result, + HasErrorMessage("Cannot append file with rows that do not match filter")); } -TEST_F(OverwriteFilesTest, StrictRangeRejectsMultiplePartitionSpecs) { +TEST_F(OverwriteFilesTest, StrictRangeRejectsMultipleSpecs) { // Add the second spec before creating the builder so staged files can resolve it. ICEBERG_UNWRAP_OR_FAIL( auto spec1, PartitionSpec::Make(*schema_, /*spec_id=*/1, @@ -674,39 +503,41 @@ TEST_F(OverwriteFilesTest, StrictRangeRejectsMultiplePartitionSpecs) { op->AddFile(file_spec0); op->AddFile(file_spec1); op->ValidateAddedFilesMatchOverwriteFilter(); - EXPECT_THAT(op->Validate(*table_->metadata(), /*snapshot=*/nullptr), - IsError(ErrorKind::kInvalidArgument)); + auto result = op->Commit(); + EXPECT_THAT(result, IsError(ErrorKind::kInvalidArgument)); + EXPECT_THAT(result, HasErrorMessage("Cannot return a single partition spec")); } -TEST_F(OverwriteFilesTest, StrictRangeRejectsEmptyAddedFiles) { +TEST_F(OverwriteFilesTest, StrictRangeRejectsNoAdds) { ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); op->OverwriteByRowFilter(Expressions::Equal("x", Literal::Long(1L))); op->ValidateAddedFilesMatchOverwriteFilter(); - EXPECT_THAT(op->Validate(*table_->metadata(), /*snapshot=*/nullptr), - IsError(ErrorKind::kInvalidArgument)); + auto result = op->Commit(); + EXPECT_THAT(result, IsError(ErrorKind::kInvalidArgument)); + EXPECT_THAT(result, HasErrorMessage("Cannot determine partition specs")); } // Strict-range validation binds the row filter with the configured case sensitivity. -TEST_F(OverwriteFilesTest, CaseSensitivityAffectsFilterBinding) { +TEST_F(OverwriteFilesTest, StrictRangeCaseSensitivity) { { ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); op->OverwriteByRowFilter(Expressions::Equal("X", Literal::Long(1L))); op->AddFile(MakeDataFile("/data/cs.parquet", 1L)); op->ValidateAddedFilesMatchOverwriteFilter(); - auto result = op->Validate(*table_->metadata(), /*snapshot=*/nullptr); + auto result = op->Commit(); EXPECT_FALSE(result.has_value()); } { ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); - op->WithCaseSensitivity(false); + op->CaseSensitive(false); op->OverwriteByRowFilter(Expressions::Equal("X", Literal::Long(1L))); op->AddFile(MakeDataFile("/data/ci.parquet", 1L)); op->ValidateAddedFilesMatchOverwriteFilter(); - EXPECT_THAT(op->Validate(*table_->metadata(), /*snapshot=*/nullptr), IsOk()); + EXPECT_THAT(op->Commit(), IsOk()); } } -TEST_F(OverwriteFilesTest, NullAddFileRejectedAtCommit) { +TEST_F(OverwriteFilesTest, NullAddFileRejected) { ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); op->AddFile(nullptr); auto result = op->Commit(); @@ -714,7 +545,7 @@ TEST_F(OverwriteFilesTest, NullAddFileRejectedAtCommit) { EXPECT_THAT(result, HasErrorMessage("Invalid data file: null")); } -TEST_F(OverwriteFilesTest, NullDeleteFileRejectedAtCommit) { +TEST_F(OverwriteFilesTest, NullDeleteFileRejected) { ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); op->DeleteFile(nullptr); auto result = op->Commit(); @@ -722,7 +553,7 @@ TEST_F(OverwriteFilesTest, NullDeleteFileRejectedAtCommit) { EXPECT_THAT(result, HasErrorMessage("Invalid data file: null")); } -TEST_F(OverwriteFilesTest, NullOverwriteByRowFilterRejectedAtCommit) { +TEST_F(OverwriteFilesTest, NullRowFilterRejected) { ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); op->OverwriteByRowFilter(nullptr); auto result = op->Commit(); @@ -730,7 +561,7 @@ TEST_F(OverwriteFilesTest, NullOverwriteByRowFilterRejectedAtCommit) { EXPECT_THAT(result, HasErrorMessage("Invalid row filter expression: null")); } -TEST_F(OverwriteFilesTest, NullConflictDetectionFilterRejectedAtCommit) { +TEST_F(OverwriteFilesTest, NullConflictFilterRejected) { ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); op->ConflictDetectionFilter(nullptr); auto result = op->Commit(); @@ -738,32 +569,7 @@ TEST_F(OverwriteFilesTest, NullConflictDetectionFilterRejectedAtCommit) { EXPECT_THAT(result, HasErrorMessage("Invalid conflict detection filter: null")); } -TEST_F(OverwriteFilesTest, NullArgumentDoesNotCrashBuilderChain) { - ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); - (*op).AddFile(nullptr).AddFile(file_a_).OverwriteByRowFilter( - Expressions::Equal("x", Literal::Long(1L))); - auto result = op->Commit(); - EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed)); - EXPECT_THAT(result, HasErrorMessage("Invalid data file: null")); -} - -// DataFileSet/DeleteFileSet reject nullptr on insert. -TEST_F(OverwriteFilesTest, DeleteFilesNullElementsCannotEnterSets) { - DataFileSet data_files; - data_files.insert(std::shared_ptr{nullptr}); - DeleteFileSet delete_files; - delete_files.insert(std::shared_ptr{nullptr}); - EXPECT_TRUE(data_files.empty()); - EXPECT_TRUE(delete_files.empty()); - - ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); - op->DeleteFiles(data_files, delete_files); - EXPECT_EQ(op->operation(), DataOperation::kOverwrite); - EXPECT_THAT(op->Commit(), IsOk()); -} - -// Snapshot id 0 is valid; negative ids are rejected as builder errors. -TEST_F(OverwriteFilesTest, ValidateFromSnapshotRejectsNegativeSnapshotId) { +TEST_F(OverwriteFilesTest, RejectsNegativeSnapshotId) { ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); op->AddFile(file_a_).ValidateFromSnapshot(-1); auto result = op->Commit(); @@ -771,58 +577,13 @@ TEST_F(OverwriteFilesTest, ValidateFromSnapshotRejectsNegativeSnapshotId) { EXPECT_THAT(result, HasErrorMessage("Invalid snapshot id")); } -TEST_F(OverwriteFilesTest, ValidateFromSnapshotAcceptsZeroSnapshotId) { +TEST_F(OverwriteFilesTest, AcceptsZeroSnapshotId) { ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); op->AddFile(file_a_).ValidateFromSnapshot(0); EXPECT_THAT(op->Commit(), IsOk()); } -TEST_F(OverwriteFilesTest, PropertyBuilderForwardingAndDualTracking) { - const std::vector> files = { - MakeDataFile("/data/p0.parquet", 1L), - MakeDataFile("/data/p1.parquet", 2L), - MakeDataFile("/data/p2.parquet", 3L), - MakeDataFile("/data/p3.parquet", 7L), - }; - - for (const auto& file : files) { - { - ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); - op->AddFile(file); - EXPECT_EQ(op->operation(), DataOperation::kAppend) << file->file_path; - } - { - ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); - op->DeleteFile(file); - EXPECT_EQ(op->operation(), DataOperation::kDelete) << file->file_path; - } - { - ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); - DataFileSet data_files; - data_files.insert(file); - op->DeleteFiles(data_files, DeleteFileSet{}); - EXPECT_EQ(op->operation(), DataOperation::kDelete) << file->file_path; - } - } -} - -TEST_F(OverwriteFilesTest, PropertyNullArgumentRejection) { - using Mutator = std::function; - const std::vector mutators = { - [](OverwriteFiles& op) { op.AddFile(nullptr); }, - [](OverwriteFiles& op) { op.DeleteFile(nullptr); }, - [](OverwriteFiles& op) { op.OverwriteByRowFilter(nullptr); }, - [](OverwriteFiles& op) { op.ConflictDetectionFilter(nullptr); }, - }; - - for (const auto& mutate : mutators) { - ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); - mutate(*op); - EXPECT_THAT(op->Commit(), IsError(ErrorKind::kValidationFailed)); - } -} - -TEST_F(OverwriteFilesTest, PropertyOperationTruthTable) { +TEST_F(OverwriteFilesTest, OperationMatrix) { struct Case { bool add; bool delete_file; @@ -878,57 +639,54 @@ TEST_F(OverwriteFilesTest, PropertyOperationTruthTable) { } } -// Exercise explicit filter, row-filter-only, and explicit file replacement resolution. -TEST_F(OverwriteFilesTest, PropertyConflictFilterResolution) { - { - const int64_t first_id = CommitFileA(); - CommitFastAppend(file_b_); - ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); - op->OverwriteByRowFilter(Expressions::Equal("x", Literal::Long(1L))); - op->AddFile(MakeDataFile("/data/r2_ok.parquet", 1L)); - op->ValidateFromSnapshot(first_id); - op->ValidateNoConflictingData(); - EXPECT_THAT(op->Validate(*table_->metadata(), table_->current_snapshot().value()), - IsOk()); - } - { - SetUp(); // fresh table - const int64_t first_id = CommitFileA(); - CommitFastAppend(file_b_); - ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); - op->OverwriteByRowFilter(Expressions::Equal("x", Literal::Long(2L))); - op->AddFile(MakeDataFile("/data/r2_conflict.parquet", 2L)); - op->ValidateFromSnapshot(first_id); - op->ValidateNoConflictingData(); - EXPECT_THAT(op->Validate(*table_->metadata(), table_->current_snapshot().value()), - IsError(ErrorKind::kValidationFailed)); - } - { - SetUp(); - const int64_t first_id = CommitFileA(); - CommitFastAppend(file_b_); - ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); - op->OverwriteByRowFilter(Expressions::Equal("x", Literal::Long(1L))); - op->ConflictDetectionFilter(Expressions::Equal("x", Literal::Long(2L))); - op->AddFile(MakeDataFile("/data/r1.parquet", 1L)); - op->ValidateFromSnapshot(first_id); - op->ValidateNoConflictingData(); - EXPECT_THAT(op->Validate(*table_->metadata(), table_->current_snapshot().value()), - IsError(ErrorKind::kValidationFailed)); - } - { - SetUp(); - const int64_t first_id = CommitFileA(); - CommitFastAppend(file_b_); - ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); - op->OverwriteByRowFilter(Expressions::Equal("x", Literal::Long(1L))); - op->DeleteFile(file_a_); - op->AddFile(MakeDataFile("/data/r3.parquet", 1L)); - op->ValidateFromSnapshot(first_id); - op->ValidateNoConflictingData(); - EXPECT_THAT(op->Validate(*table_->metadata(), table_->current_snapshot().value()), - IsError(ErrorKind::kValidationFailed)); - } +TEST_F(OverwriteFilesTest, DefaultConflictFilter) { + const int64_t first_id = CommitFileA(); + CommitFastAppend(file_b_); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->OverwriteByRowFilter(Expressions::Equal("x", Literal::Long(1L))); + op->AddFile(MakeDataFile("/data/r2_ok.parquet", 1L)); + op->ValidateFromSnapshot(first_id); + op->ValidateNoConflictingData(); + EXPECT_THAT(op->Commit(), IsOk()); +} + +TEST_F(OverwriteFilesTest, ConflictFilterMatchesAdd) { + const int64_t first_id = CommitFileA(); + CommitFastAppend(file_b_); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->OverwriteByRowFilter(Expressions::Equal("x", Literal::Long(2L))); + op->AddFile(MakeDataFile("/data/r2_conflict.parquet", 2L)); + op->ValidateFromSnapshot(first_id); + op->ValidateNoConflictingData(); + EXPECT_THAT(op->Commit(), IsError(ErrorKind::kValidationFailed)); +} + +TEST_F(OverwriteFilesTest, ConflictFilterUsesExplicitFilter) { + const int64_t first_id = CommitFileA(); + CommitFastAppend(file_b_); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->OverwriteByRowFilter(Expressions::Equal("x", Literal::Long(1L))); + op->ConflictDetectionFilter(Expressions::Equal("x", Literal::Long(2L))); + op->AddFile(MakeDataFile("/data/r1.parquet", 1L)); + op->ValidateFromSnapshot(first_id); + op->ValidateNoConflictingData(); + EXPECT_THAT(op->Commit(), IsError(ErrorKind::kValidationFailed)); +} + +TEST_F(OverwriteFilesTest, ExplicitReplaceConflicts) { + const int64_t first_id = CommitFileA(); + CommitFastAppend(file_b_); + + ICEBERG_UNWRAP_OR_FAIL(auto op, NewOverwrite()); + op->OverwriteByRowFilter(Expressions::Equal("x", Literal::Long(1L))); + op->DeleteFile(file_a_); + op->AddFile(MakeDataFile("/data/r3.parquet", 1L)); + op->ValidateFromSnapshot(first_id); + op->ValidateNoConflictingData(); + EXPECT_THAT(op->Commit(), IsError(ErrorKind::kValidationFailed)); } } // namespace iceberg diff --git a/src/iceberg/test/table_test.cc b/src/iceberg/test/table_test.cc index d7ebe4a0a..ef21f12d6 100644 --- a/src/iceberg/test/table_test.cc +++ b/src/iceberg/test/table_test.cc @@ -163,6 +163,7 @@ TEST(StaticTableTest, NewMutatingOperationsAreNotSupported) { EXPECT_THAT(table->NewMergeAppend(), IsError(ErrorKind::kNotSupported)); EXPECT_THAT(table->NewDeleteFiles(), IsError(ErrorKind::kNotSupported)); EXPECT_THAT(table->NewRowDelta(), IsError(ErrorKind::kNotSupported)); + EXPECT_THAT(table->NewOverwrite(), IsError(ErrorKind::kNotSupported)); EXPECT_THAT(table->NewSnapshotManager(), IsError(ErrorKind::kNotSupported)); } diff --git a/src/iceberg/update/delete_files.h b/src/iceberg/update/delete_files.h index 1be08e35b..7e567830e 100644 --- a/src/iceberg/update/delete_files.h +++ b/src/iceberg/update/delete_files.h @@ -34,27 +34,58 @@ namespace iceberg { /// \brief API for deleting data files from a table. /// -/// This accumulates data-file deletions, produces a new snapshot, and commits that -/// snapshot as current. File paths are matched exactly against table metadata values; -/// equivalent but differently-normalized URIs are not considered matches. +/// This API accumulates file deletions, produces a new Snapshot of the table, +/// and commits that snapshot as current. When committing, these changes are +/// applied to the latest table snapshot. Commit conflicts are resolved by +/// applying the changes to the new latest snapshot and reattempting the commit. +/// +/// File paths are matched exactly against table metadata values; equivalent but +/// differently-normalized URIs are not considered matches. class ICEBERG_EXPORT DeleteFiles : public MergingSnapshotUpdate { public: static Result> Make( std::string table_name, std::shared_ptr ctx); - /// \brief Delete a data-file path from the table. + /// \brief Delete a file by path from the underlying table. + /// + /// \param path A path to remove from the table. + /// \return This DeleteFiles for method chaining. DeleteFiles& DeleteFile(std::string_view path); - /// \brief Delete a data file tracked by object identity and path. + /// \brief Delete a file tracked by a DataFile from the underlying table. + /// + /// \param file A DataFile to remove from the table. + /// \return This DeleteFiles for method chaining. DeleteFiles& DeleteFile(const std::shared_ptr& file); - /// \brief Delete files whose rows all match the given expression. + /// \brief Delete files that match an expression on data rows from the table. + /// + /// A file is selected to be deleted by the expression if it could contain any + /// rows that match the expression. Candidate files are selected using an + /// inclusive partition projection. These candidate files are deleted if all of + /// the rows in the file must match the expression, determined by the + /// expression's strict partition projection. This guarantees that files are + /// deleted if and only if all rows in the file must match the expression. + /// + /// Files that may contain some rows that match the expression and some rows + /// that do not will result in a validation error. + /// + /// \param expr An expression on rows in the table. + /// \return This DeleteFiles for method chaining. DeleteFiles& DeleteFromRowFilter(std::shared_ptr expr); - /// \brief Set case sensitivity for expression binding. + /// \brief Enable or disable case-sensitive expression binding for validations. + /// + /// \param case_sensitive Whether expression binding should be case sensitive. + /// \return This DeleteFiles for method chaining. DeleteFiles& CaseSensitive(bool case_sensitive); - /// \brief Validate that explicitly requested deleted files still exist. + /// \brief Enable validation that deleted files still exist. + /// + /// If this method is called, any files that are part of the deletion must + /// still exist when committing the operation. + /// + /// \return This DeleteFiles for method chaining. DeleteFiles& ValidateFilesExist(); std::string operation() override; diff --git a/src/iceberg/update/fast_append.h b/src/iceberg/update/fast_append.h index 07018989c..f5e1ceaba 100644 --- a/src/iceberg/update/fast_append.h +++ b/src/iceberg/update/fast_append.h @@ -34,36 +34,43 @@ namespace iceberg { -/// \brief Appending new files in a table. +/// \brief API for appending new files in a table. /// -/// FastAppend is optimized for appending new data files to a table, it creates new -/// manifest files for the added data without compacting or rewriting existing manifests, -/// making it faster for write-heavy workloads. +/// This API accumulates file additions, produces a new Snapshot of the table, +/// and commits that snapshot as current. When committing, these changes are +/// applied to the latest table snapshot. Commit conflicts are resolved by +/// applying the changes to the new latest snapshot and reattempting the commit. +/// +/// FastAppend is optimized for appending new data files to a table. It creates +/// new manifest files for the added data without compacting or rewriting +/// existing manifests. class ICEBERG_EXPORT FastAppend : public SnapshotUpdate { public: /// \brief Create a new FastAppend instance. /// /// \param table_name The name of the table - /// \param transaction The transaction to use for this update + /// \param ctx The transaction context to use for this update /// \return A Result containing the FastAppend instance or an error static Result> Make( std::string table_name, std::shared_ptr ctx); - /// \brief Append a data file to this update. + /// \brief Append a DataFile to the table. /// - /// \param file The data file to append - /// \return Reference to this for method chaining + /// \param file A data file. + /// \return This FastAppend for method chaining. FastAppend& AppendFile(const std::shared_ptr& file); - /// \brief Append a manifest file to this update. + /// \brief Append a ManifestFile to the table. + /// + /// The manifest must contain only appended files. All files in the manifest + /// are appended to the table in the snapshot created by this update. /// - /// The manifest must only contain added files (no existing or deleted files). - /// If the manifest doesn't have a snapshot ID assigned and snapshot ID inheritance - /// is enabled, it will be used directly. Otherwise, it will be copied with the - /// new snapshot ID. + /// If the manifest doesn't have a snapshot ID assigned and snapshot ID + /// inheritance is enabled, it will be used directly. Otherwise, it will be + /// copied with the new snapshot ID. /// - /// \param manifest The manifest file to append - /// \return Reference to this for method chaining + /// \param manifest A manifest file of files to append. + /// \return This FastAppend for method chaining. FastAppend& AppendManifest(const ManifestFile& manifest); std::string operation() override; diff --git a/src/iceberg/update/merge_append.h b/src/iceberg/update/merge_append.h index 8122a262c..cd4f4acbb 100644 --- a/src/iceberg/update/merge_append.h +++ b/src/iceberg/update/merge_append.h @@ -31,10 +31,15 @@ namespace iceberg { -/// \brief Append files while merging manifests when table properties allow it. +/// \brief API for appending new files in a table while merging manifests. /// -/// MergeAppend uses the shared MergingSnapshotUpdate pipeline, so it can compact -/// newly written and existing manifests into fewer manifests during commit. +/// This API accumulates file additions, produces a new Snapshot of the table, +/// and commits that snapshot as current. When committing, these changes are +/// applied to the latest table snapshot. Commit conflicts are resolved by +/// applying the changes to the new latest snapshot and reattempting the commit. +/// +/// MergeAppend uses the shared MergingSnapshotUpdate pipeline, so it can merge +/// newly written and existing manifests according to table properties. class ICEBERG_EXPORT MergeAppend : public MergingSnapshotUpdate { public: /// \brief Create a new MergeAppend instance. @@ -45,19 +50,20 @@ class ICEBERG_EXPORT MergeAppend : public MergingSnapshotUpdate { static Result> Make( std::string table_name, std::shared_ptr ctx); - /// \brief Append a data file to this update. + /// \brief Append a DataFile to the table. /// - /// \param file The data file to append - /// \return Reference to this for method chaining + /// \param file A data file. + /// \return This MergeAppend for method chaining. MergeAppend& AppendFile(const std::shared_ptr& file); - /// \brief Append a data manifest to this update. + /// \brief Append a ManifestFile to the table. /// - /// The manifest must contain only added files. Snapshot ID and sequence number - /// assignment happen during commit. + /// The manifest must contain only appended files. All files in the manifest + /// are appended to the table in the snapshot created by this update. + /// Snapshot ID and sequence number assignment happen during commit. /// - /// \param manifest The manifest file to append - /// \return Reference to this for method chaining + /// \param manifest A manifest file of files to append. + /// \return This MergeAppend for method chaining. MergeAppend& AppendManifest(const ManifestFile& manifest); std::string operation() override; diff --git a/src/iceberg/update/merging_snapshot_update.cc b/src/iceberg/update/merging_snapshot_update.cc index c4d108dcb..8f88633ba 100644 --- a/src/iceberg/update/merging_snapshot_update.cc +++ b/src/iceberg/update/merging_snapshot_update.cc @@ -617,12 +617,13 @@ void MergingSnapshotUpdate::SetSummaryProperty(const std::string& property, Result> MergingSnapshotUpdate::DataSpec() const { if (new_data_files_by_spec_.empty()) { - return InvalidArgument("DataSpec() called before any data file was added"); + return InvalidArgument( + "Cannot determine partition specs: no data files have been added"); } if (new_data_files_by_spec_.size() > 1) { return InvalidArgument( - "DataSpec() requires exactly one partition spec; got {} different specs", - new_data_files_by_spec_.size()); + "Cannot return a single partition spec: data files with different partition " + "specs have been added"); } return base().PartitionSpecById(new_data_files_by_spec_.begin()->first); } diff --git a/src/iceberg/update/merging_snapshot_update.h b/src/iceberg/update/merging_snapshot_update.h index fc3987ee1..d11a9b1f9 100644 --- a/src/iceberg/update/merging_snapshot_update.h +++ b/src/iceberg/update/merging_snapshot_update.h @@ -40,12 +40,17 @@ namespace iceberg { -/// \brief Abstract base class for all merge-based snapshot write operations. +/// \brief Abstract base class for merge-based snapshot write operations. /// -/// Provides the complete filter → write → merge pipeline that all merge-based -/// operations (MergeAppend, OverwriteFiles, RowDelta, ReplacePartitions, -/// RewriteFiles) share. Subclasses only need to implement `operation()` and -/// call the protected primitive API to describe what changes to make. +/// This class is the C++ counterpart of Java's MergingSnapshotProducer. It +/// provides the shared filter, write, and merge pipeline used by operations +/// that accumulate file additions and deletions, produce a new Snapshot, and +/// commit that snapshot as current. Commit conflicts are resolved by applying +/// the same staged changes to the new latest snapshot and reattempting the +/// commit. +/// +/// Subclasses describe their changes by calling the protected primitive API and +/// implement `operation()` to set the snapshot operation summary. /// /// The Apply() pipeline: /// 1. Filter data manifests (via data_filter_manager_) @@ -98,9 +103,9 @@ class ICEBERG_EXPORT MergingSnapshotUpdate : public SnapshotUpdate { /// \brief Add all files in a pre-existing data manifest to the new snapshot. /// - /// The manifest must contain DATA content. If snapshot ID inheritance is - /// enabled and the manifest has no snapshot ID assigned, it is used directly; - /// otherwise it is copied with the current snapshot ID. + /// The manifest must contain only appended data files. If snapshot ID + /// inheritance is enabled and the manifest has no snapshot ID assigned, it is + /// used directly; otherwise it is copied with the current snapshot ID. Status AddManifest(ManifestFile manifest); /// \brief Register a data file (by object) to be deleted from the table. @@ -116,14 +121,15 @@ class ICEBERG_EXPORT MergingSnapshotUpdate : public SnapshotUpdate { /// \brief Register an expression to delete matching rows. /// - /// Both data and delete filter managers receive the expression: delete files that - /// match the row filter can also be removed because those rows will be deleted. + /// Both data and delete filter managers receive the expression: delete files + /// that match the row filter can also be removed because those rows will be + /// deleted. Status DeleteByRowFilter(std::shared_ptr expr); /// \brief Register a partition to be dropped. /// - /// Both data and delete filter managers receive the partition drop, since dropping - /// data in a partition also drops all delete files in that partition. + /// Both data and delete filter managers receive the partition drop, since + /// dropping data in a partition also drops all delete files in that partition. Status DropPartition(int32_t spec_id, PartitionValues partition); /// \brief Fail if any registered delete path is not found in any manifest. diff --git a/src/iceberg/update/overwrite_files.cc b/src/iceberg/update/overwrite_files.cc index 8b493babb..cba6e028c 100644 --- a/src/iceberg/update/overwrite_files.cc +++ b/src/iceberg/update/overwrite_files.cc @@ -21,7 +21,6 @@ #include -#include "iceberg/expression/binder.h" #include "iceberg/expression/evaluator.h" #include "iceberg/expression/expressions.h" #include "iceberg/expression/projections.h" @@ -114,19 +113,19 @@ OverwriteFiles& OverwriteFiles::ConflictDetectionFilter( return *this; } -OverwriteFiles& OverwriteFiles::WithCaseSensitivity(bool case_sensitive) { - CaseSensitive(case_sensitive); +OverwriteFiles& OverwriteFiles::CaseSensitive(bool case_sensitive) { + MergingSnapshotUpdate::CaseSensitive(case_sensitive); return *this; } OverwriteFiles& OverwriteFiles::ValidateNoConflictingData() { - validate_no_conflicting_data_ = true; + validate_new_data_files_ = true; FailMissingDeletePaths(); return *this; } OverwriteFiles& OverwriteFiles::ValidateNoConflictingDeletes() { - validate_no_conflicting_deletes_ = true; + validate_new_deletes_ = true; FailMissingDeletePaths(); return *this; } @@ -146,46 +145,88 @@ std::string OverwriteFiles::operation() { return DataOperation::kOverwrite; } -// Pure row-filter overwrites use the row filter as the data conflict filter. Explicit -// file replacement is more conservative unless the caller supplies a narrower filter. std::shared_ptr OverwriteFiles::DataConflictDetectionFilter() const { if (conflict_detection_filter_ != nullptr) { return conflict_detection_filter_; } - if (RowFilter() != nullptr && RowFilter() != Expressions::AlwaysFalse() && - deleted_data_files_.empty()) { - return RowFilter(); + if (auto filter = RowFilter(); filter != nullptr && + filter != Expressions::AlwaysFalse() && + deleted_data_files_.empty()) { + return filter; } return Expressions::AlwaysTrue(); } Status OverwriteFiles::Validate(const TableMetadata& current_metadata, const std::shared_ptr& snapshot) { + auto row_filter = RowFilter(); + if (validate_added_files_match_overwrite_filter_) { - if (RowFilter() == nullptr || RowFilter() == Expressions::AlwaysFalse()) { - return ValidationFailed( - "Cannot validate added files match overwrite filter: row filter is not set"); + ICEBERG_ASSIGN_OR_RAISE(auto spec, DataSpec()); + ICEBERG_ASSIGN_OR_RAISE(auto schema, current_metadata.Schema()); + ICEBERG_ASSIGN_OR_RAISE(auto partition_type, spec->PartitionType(*schema)); + auto partition_schema = partition_type->ToSchema(); + + ICEBERG_ASSIGN_OR_RAISE( + auto inclusive_expr, + Projections::Inclusive(*spec, *schema, IsCaseSensitive())->Project(row_filter)); + ICEBERG_ASSIGN_OR_RAISE( + auto inclusive_evaluator, + Evaluator::Make(*partition_schema, inclusive_expr, IsCaseSensitive())); + + ICEBERG_ASSIGN_OR_RAISE( + auto strict_expr, + Projections::Strict(*spec, *schema, IsCaseSensitive())->Project(row_filter)); + ICEBERG_ASSIGN_OR_RAISE( + auto strict_evaluator, + Evaluator::Make(*partition_schema, strict_expr, IsCaseSensitive())); + + ICEBERG_ASSIGN_OR_RAISE( + auto metrics_evaluator, + StrictMetricsEvaluator::Make(row_filter, schema, IsCaseSensitive())); + + // the real test is that the strict or metrics test matches the file, indicating that + // all records in the file match the filter. inclusive is used to avoid testing the + // metrics, which is more complicated + const auto file_test = [&](const DataFile& file) -> Result { + ICEBERG_ASSIGN_OR_RAISE(bool inclusive_match, + inclusive_evaluator->Evaluate(file.partition)); + if (!inclusive_match) { + return false; + } + ICEBERG_ASSIGN_OR_RAISE(bool strict_match, + strict_evaluator->Evaluate(file.partition)); + if (strict_match) { + return true; + } + return metrics_evaluator->Evaluate(file); + }; + + for (const auto& file : AddedDataFiles()) { + ICEBERG_ASSIGN_OR_RAISE(bool matches_filter, file_test(*file)); + if (!matches_filter) { + return ValidationFailed( + "Cannot append file with rows that do not match filter: {}: {}", + row_filter->ToString(), file->file_path); + } } - ICEBERG_RETURN_UNEXPECTED( - ValidateAddedFilesMatchOverwriteFilterImpl(current_metadata)); } - if (validate_no_conflicting_data_) { + if (validate_new_data_files_) { ICEBERG_RETURN_UNEXPECTED(ValidateAddedDataFiles( current_metadata, starting_snapshot_id_, DataConflictDetectionFilter(), snapshot, ctx_->table->io(), IsCaseSensitive())); } - if (validate_no_conflicting_deletes_) { - if (RowFilter() != nullptr && RowFilter() != Expressions::AlwaysFalse()) { - auto delete_filter = conflict_detection_filter_ != nullptr - ? conflict_detection_filter_ - : RowFilter(); + if (validate_new_deletes_) { + if (row_filter != nullptr && row_filter != Expressions::AlwaysFalse()) { + auto filter = + conflict_detection_filter_ != nullptr ? conflict_detection_filter_ : row_filter; ICEBERG_RETURN_UNEXPECTED( - ValidateNoNewDeleteFiles(current_metadata, starting_snapshot_id_, delete_filter, + ValidateNoNewDeleteFiles(current_metadata, starting_snapshot_id_, filter, snapshot, ctx_->table->io(), IsCaseSensitive())); ICEBERG_RETURN_UNEXPECTED( - ValidateDeletedDataFiles(current_metadata, starting_snapshot_id_, delete_filter, + ValidateDeletedDataFiles(current_metadata, starting_snapshot_id_, filter, snapshot, ctx_->table->io(), IsCaseSensitive())); } @@ -199,72 +240,4 @@ Status OverwriteFiles::Validate(const TableMetadata& current_metadata, return {}; } -// Every added data file must be fully contained in the overwrite range defined by -// RowFilter(). Partition projection handles whole-partition proofs; strict metrics are -// used only when the partition alone is not enough. -Status OverwriteFiles::ValidateAddedFilesMatchOverwriteFilterImpl( - const TableMetadata& metadata) { - ICEBERG_ASSIGN_OR_RAISE(auto schema, metadata.Schema()); - - ICEBERG_ASSIGN_OR_RAISE(auto spec, DataSpec()); - - // Project requires a bound expression; StrictMetricsEvaluator binds internally. - ICEBERG_ASSIGN_OR_RAISE(auto bound_filter, - Binder::Bind(*schema, RowFilter(), IsCaseSensitive())); - ICEBERG_ASSIGN_OR_RAISE( - auto strict_metrics_evaluator, - StrictMetricsEvaluator::Make(RowFilter(), schema, IsCaseSensitive())); - - ICEBERG_ASSIGN_OR_RAISE(auto partition_type, spec->PartitionType(*schema)); - auto partition_fields = partition_type->fields(); - Schema partition_schema( - std::vector(partition_fields.begin(), partition_fields.end())); - - auto inclusive_projection = Projections::Inclusive(*spec, *schema, IsCaseSensitive()); - ICEBERG_ASSIGN_OR_RAISE(auto inclusive_expr, - inclusive_projection->Project(bound_filter)); - ICEBERG_ASSIGN_OR_RAISE( - auto inclusive_evaluator, - Evaluator::Make(partition_schema, inclusive_expr, IsCaseSensitive())); - - auto strict_projection = Projections::Strict(*spec, *schema, IsCaseSensitive()); - ICEBERG_ASSIGN_OR_RAISE(auto strict_expr, strict_projection->Project(bound_filter)); - ICEBERG_ASSIGN_OR_RAISE( - auto strict_evaluator, - Evaluator::Make(partition_schema, strict_expr, IsCaseSensitive())); - - for (const auto& file : AddedDataFiles()) { - if (file == nullptr) { - return ValidationFailed( - "Cannot validate added files match overwrite filter: null data file"); - } - - ICEBERG_ASSIGN_OR_RAISE(bool inclusive_match, - inclusive_evaluator->Evaluate(file->partition)); - if (!inclusive_match) { - return ValidationFailed( - "Cannot commit file {}: added file does not match overwrite filter (outside " - "the overwrite range)", - file->file_path); - } - - ICEBERG_ASSIGN_OR_RAISE(bool strict_match, - strict_evaluator->Evaluate(file->partition)); - if (strict_match) { - continue; - } - - ICEBERG_ASSIGN_OR_RAISE(bool metrics_match, - strict_metrics_evaluator->Evaluate(*file)); - if (!metrics_match) { - return ValidationFailed( - "Cannot commit file {}: added file is not fully contained in the overwrite " - "filter", - file->file_path); - } - } - - return {}; -} - } // namespace iceberg diff --git a/src/iceberg/update/overwrite_files.h b/src/iceberg/update/overwrite_files.h index 1c1979108..f76263cad 100644 --- a/src/iceberg/update/overwrite_files.h +++ b/src/iceberg/update/overwrite_files.h @@ -34,9 +34,23 @@ namespace iceberg { -/// \brief Overwrite data files in a table. +/// \brief API for overwriting files in a table. /// -/// Supports explicit file replacement and range-based replacement by row filter. +/// This API accumulates file additions and produces a new Snapshot of the table by +/// replacing all deleted files with the set of additions. This operation is used +/// to implement idempotent writes that always replace a section of a table with +/// new data or update/delete operations that eagerly overwrite files. +/// +/// Overwrites can be validated. The default validation mode is idempotent, +/// meaning the overwrite is correct and should be committed regardless of other +/// concurrent changes to the table. For example, this can be used for replacing +/// all data for day D with query results. Alternatively, this API can be +/// configured for overwriting certain files with their filtered versions while +/// ensuring no new data that would need to be filtered has been added. +/// +/// When committing, these changes are applied to the latest table snapshot. +/// Commit conflicts are resolved by applying the changes to the new latest +/// snapshot and reattempting the commit. class ICEBERG_EXPORT OverwriteFiles : public MergingSnapshotUpdate { public: /// \brief Create a new OverwriteFiles instance. @@ -49,68 +63,123 @@ class ICEBERG_EXPORT OverwriteFiles : public MergingSnapshotUpdate { ~OverwriteFiles() override; - /// \brief Add a new data file to the overwrite. + /// \brief Add a DataFile to the table. /// - /// \param file The data file to add - /// \return Reference to this for method chaining + /// \param file A data file. + /// \return This OverwriteFiles for method chaining. OverwriteFiles& AddFile(const std::shared_ptr& file); - /// \brief Delete an existing data file as part of the overwrite. + /// \brief Delete a DataFile from the table. /// - /// \param file The data file to delete - /// \return Reference to this for method chaining + /// \param file A data file. + /// \return This OverwriteFiles for method chaining. OverwriteFiles& DeleteFile(const std::shared_ptr& file); - /// \brief Remove data files and delete files as part of the overwrite. + /// \brief Delete a set of data files from the table with their respective + /// delete files. /// - /// \param data_files_to_delete The data files to delete - /// \param delete_files_to_delete The delete files to remove - /// \return Reference to this for method chaining + /// \param data_files_to_delete The data files to be deleted from the table. + /// \param delete_files_to_delete The delete files corresponding to the data + /// files to be deleted from the table. + /// \return This OverwriteFiles for method chaining. OverwriteFiles& DeleteFiles(const DataFileSet& data_files_to_delete, const DeleteFileSet& delete_files_to_delete); - /// \brief Overwrite all rows matching the given expression. + /// \brief Delete files that match an expression on data rows from the table. + /// + /// A file is selected to be deleted by the expression if it could contain any + /// rows that match the expression. Candidate files are selected using an + /// inclusive partition projection. These candidate files are deleted if all of + /// the rows in the file must match the expression, determined by the + /// expression's strict partition projection. This guarantees that files are + /// deleted if and only if all rows in the file must match the expression. /// - /// \param expr The row filter expression defining the overwrite range - /// \return Reference to this for method chaining + /// Files that may contain some rows that match the expression and some rows + /// that do not will result in a validation error. + /// + /// \param expr An expression on rows in the table. + /// \return This OverwriteFiles for method chaining. OverwriteFiles& OverwriteByRowFilter(std::shared_ptr expr); - /// \brief Set the lower bound snapshot id for concurrency scans. + /// \brief Set the snapshot ID used in any reads for this operation. + /// + /// Validations check changes after this snapshot ID. If the from snapshot is + /// not set, all ancestor snapshots through the table's initial snapshot are + /// validated. /// - /// \param snapshot_id The starting snapshot id - /// \return Reference to this for method chaining + /// \param snapshot_id A snapshot ID. + /// \return This OverwriteFiles for method chaining. OverwriteFiles& ValidateFromSnapshot(int64_t snapshot_id); - /// \brief Set an explicit conflict-detection filter. + /// \brief Set a conflict detection filter used to validate concurrently added + /// data and delete files. /// - /// \param expr The conflict-detection filter expression - /// \return Reference to this for method chaining + /// \param expr An expression on rows in the table. + /// \return This OverwriteFiles for method chaining. OverwriteFiles& ConflictDetectionFilter(std::shared_ptr expr); - /// \brief Enable validation that no concurrent commit added conflicting data files. + /// \brief Enable validation that data added concurrently does not conflict + /// with this commit's operation. /// - /// \return Reference to this for method chaining + /// This method should be called while committing non-idempotent overwrite + /// operations. If a concurrent operation commits a new file after the data was + /// read and that file might contain rows matching the specified conflict + /// detection filter, the overwrite operation detects this and fails. + /// + /// Calling this method with a correct conflict detection filter is required to + /// maintain isolation for non-idempotent overwrite operations. + /// + /// Validation uses the conflict detection filter passed to + /// ConflictDetectionFilter() and applies to operations that happened after the + /// snapshot passed to ValidateFromSnapshot(). If the conflict detection filter + /// is not set, any new data added concurrently will fail this overwrite + /// operation. + /// + /// \return This OverwriteFiles for method chaining. OverwriteFiles& ValidateNoConflictingData(); - /// \brief Enable validation that no concurrent commit added conflicting deletes. + /// \brief Enable validation that deletes that happened concurrently do not + /// conflict with this commit's operation. + /// + /// Validating concurrent deletes is required during non-idempotent overwrite + /// operations. If a concurrent operation deletes data in one of the files being + /// overwritten, the overwrite operation must be aborted as it may undelete rows + /// that were removed concurrently. /// - /// \return Reference to this for method chaining + /// Calling this method with a correct conflict detection filter is required to + /// maintain isolation for non-idempotent overwrite operations. + /// + /// Validation uses the conflict detection filter passed to + /// ConflictDetectionFilter() and applies to operations that happened after the + /// snapshot passed to ValidateFromSnapshot(). If the conflict detection filter + /// is not set, this operation will use the row filter provided in + /// OverwriteByRowFilter() to check for new delete files and will ensure there + /// are no conflicting deletes for data files removed via DeleteFile(). + /// + /// \return This OverwriteFiles for method chaining. OverwriteFiles& ValidateNoConflictingDeletes(); - /// \brief Enable strict validation that every added file is fully within the - /// overwrite range. + /// \brief Signal that each file added to the table must match the overwrite + /// expression. /// - /// \return Reference to this for method chaining + /// If this method is called, each added file is validated on commit to ensure + /// that it matches the overwrite row filter. This is used to ensure that + /// writes are idempotent: files cannot be added during a commit that would not + /// be removed if the operation were run a second time. + /// + /// \return This OverwriteFiles for method chaining. OverwriteFiles& ValidateAddedFilesMatchOverwriteFilter(); - /// \brief Set case sensitivity for binding, projection, and metrics evaluation. + /// \brief Enable or disable case-sensitive expression binding for validations + /// that accept expressions. /// - /// \param case_sensitive Whether matching should be case-sensitive - /// \return Reference to this for method chaining - OverwriteFiles& WithCaseSensitivity(bool case_sensitive); + /// \param case_sensitive Whether expression binding should be case sensitive. + /// \return This OverwriteFiles for method chaining. + OverwriteFiles& CaseSensitive(bool case_sensitive); std::string operation() override; + protected: Status Validate(const TableMetadata& current_metadata, const std::shared_ptr& snapshot) override; @@ -121,15 +190,12 @@ class ICEBERG_EXPORT OverwriteFiles : public MergingSnapshotUpdate { /// \brief Select the conflict-detection filter from the configured state. std::shared_ptr DataConflictDetectionFilter() const; - /// \brief Verify every added data file is fully contained in the row filter. - Status ValidateAddedFilesMatchOverwriteFilterImpl(const TableMetadata& metadata); - - std::optional starting_snapshot_id_; - std::shared_ptr conflict_detection_filter_; DataFileSet deleted_data_files_; - bool validate_no_conflicting_data_ = false; - bool validate_no_conflicting_deletes_ = false; bool validate_added_files_match_overwrite_filter_ = false; + std::optional starting_snapshot_id_; + std::shared_ptr conflict_detection_filter_; + bool validate_new_data_files_ = false; + bool validate_new_deletes_ = false; }; } // namespace iceberg diff --git a/src/iceberg/update/row_delta.h b/src/iceberg/update/row_delta.h index ddb54d836..bf58a048c 100644 --- a/src/iceberg/update/row_delta.h +++ b/src/iceberg/update/row_delta.h @@ -39,7 +39,7 @@ namespace iceberg { /// \brief API for encoding row-level changes to a table. /// /// This API accumulates data and delete file changes, produces a new Snapshot -/// of the table, and commits that snapshot as current. +/// of the table, and commits that snapshot as the current. /// /// When committing, these changes are applied to the latest table snapshot. /// Commit conflicts are resolved by applying the changes to the new latest @@ -50,25 +50,25 @@ class ICEBERG_EXPORT RowDelta : public MergingSnapshotUpdate { static Result> Make(std::string table_name, std::shared_ptr ctx); - /// \brief Add a data file to the table. + /// \brief Add a DataFile to the table. /// /// \param inserts A data file of rows to insert. /// \return This RowDelta for method chaining. RowDelta& AddRows(const std::shared_ptr& inserts); - /// \brief Add a delete file to the table. + /// \brief Add a DeleteFile to the table. /// /// \param deletes A delete file of rows to delete. /// \return This RowDelta for method chaining. RowDelta& AddDeletes(const std::shared_ptr& deletes); - /// \brief Remove a data file from the table. + /// \brief Remove a DataFile from the table. /// /// \param file A data file. /// \return This RowDelta for method chaining. RowDelta& RemoveRows(const std::shared_ptr& file); - /// \brief Remove a rewritten delete file from the table. + /// \brief Remove a rewritten DeleteFile from the table. /// /// \param deletes A delete file that can be removed from the table. /// \return This RowDelta for method chaining. @@ -114,7 +114,8 @@ class ICEBERG_EXPORT RowDelta : public MergingSnapshotUpdate { /// \return This RowDelta for method chaining. RowDelta& ValidateDeletedFiles(); - /// \brief Set a conflict detection filter used to validate added files. + /// \brief Set a conflict detection filter used to validate concurrently added + /// data and delete files. /// /// If not called, a true literal is used as the conflict detection filter. /// diff --git a/src/iceberg/update/snapshot_update.h b/src/iceberg/update/snapshot_update.h index 7397034b3..b2c92bfb1 100644 --- a/src/iceberg/update/snapshot_update.h +++ b/src/iceberg/update/snapshot_update.h @@ -36,10 +36,11 @@ namespace iceberg { -/// \brief Base class for operations that produce snapshots. +/// \brief API for table changes that produce snapshots. /// -/// This class provides common functionality for creating new snapshots, -/// including manifest list writing and cleanup. +/// This class contains common methods for all updates that create a new table +/// Snapshot. It also provides the shared implementation for snapshot ID +/// assignment, manifest list writing, retry-safe apply, and cleanup. class ICEBERG_EXPORT SnapshotUpdate : public PendingUpdate { public: /// \brief Result of applying a snapshot update @@ -56,9 +57,9 @@ class ICEBERG_EXPORT SnapshotUpdate : public PendingUpdate { /// \brief Set a callback to delete files instead of the table's default. /// - /// \param delete_func A function used to delete file locations - /// \return Reference to this for method chaining - /// \note Cannot be called more than once + /// \param delete_func A function used to delete file locations. + /// \return This update for method chaining. + /// \note Cannot be called more than once. auto& DeleteWith(this auto& self, std::function delete_func) { if (self.delete_func_) { @@ -69,18 +70,21 @@ class ICEBERG_EXPORT SnapshotUpdate : public PendingUpdate { return self; } - /// \brief Stage a snapshot in table metadata, but not update the current snapshot id. + /// \brief Stage a snapshot in table metadata, but do not make it current. /// - /// \return Reference to this for method chaining + /// The snapshot is assigned an ID and added to table metadata. The table's + /// current snapshot ID is not updated. + /// + /// \return This update for method chaining. auto& StageOnly(this auto& self) { self.stage_only_ = true; return self; } - /// \brief Perform operations on a particular branch + /// \brief Perform operations on a particular branch. /// - /// \param branch Which is name of SnapshotRef of type branch - /// \return Reference to this for method chaining + /// \param branch The name of a SnapshotRef of type branch. + /// \return This update for method chaining. auto& SetTargetBranch(this auto& self, const std::string& branch) { if (branch.empty()) [[unlikely]] { return self.AddError(ErrorKind::kInvalidArgument, "Branch name cannot be empty"); @@ -99,11 +103,11 @@ class ICEBERG_EXPORT SnapshotUpdate : public PendingUpdate { return self; } - /// \brief Set a summary property. + /// \brief Set a summary property in the snapshot produced by this update. /// - /// \param property The property name - /// \param value The property value - /// \return Reference to this for method chaining + /// \param property A String property name. + /// \param value A String property value. + /// \return This update for method chaining. auto& Set(this auto& self, const std::string& property, const std::string& value) { static_cast(self).SetSummaryProperty(property, value); return self; @@ -111,11 +115,12 @@ class ICEBERG_EXPORT SnapshotUpdate : public PendingUpdate { /// \brief Apply the update's changes to create a new snapshot. /// - /// This method validates the changes, applies them to the metadata, - /// and creates a new snapshot without committing it. The snapshot - /// is stored internally and can be accessed after Apply() succeeds. + /// This method validates the changes, applies them to the current base + /// metadata, and creates a new snapshot without committing it. Commit retries + /// call Apply() again with refreshed metadata so the same changes can be + /// applied to the new latest snapshot. /// - /// \return A result containing the new snapshot, or an error + /// \return A result containing the new snapshot, or an error. Result Apply(); /// \brief Finalize the snapshot update, cleaning up any uncommitted files.