From 5d534db0fff289e1031aaeb493b2f75a9068fe18 Mon Sep 17 00:00:00 2001 From: zhouhongfeng Date: Fri, 26 Jun 2026 10:36:44 +0800 Subject: [PATCH 01/27] fix: FileBatchReader returns discontinuous batch, and change GetPreviousBatchFirstRowId to GetGlobalRowId --- .../shared_shredding_file_reader.cpp | 4 +- .../shredding/shared_shredding_file_reader.h | 2 +- .../bitmap/apply_bitmap_index_batch_reader.h | 14 +-- .../reader/delegating_prefetch_reader.h | 4 +- .../prefetch_file_batch_reader_impl.cpp | 67 +++++++++---- .../reader/prefetch_file_batch_reader_impl.h | 6 +- .../prefetch_file_batch_reader_impl_test.cpp | 31 +++--- .../apply_deletion_vector_batch_reader.h | 16 ++- .../complete_row_tracking_fields_reader.cpp | 8 +- .../io/complete_row_tracking_fields_reader.h | 4 +- src/paimon/core/io/field_mapping_reader.h | 4 +- .../io/key_value_data_file_record_reader.cpp | 8 +- .../io/key_value_data_file_record_reader.h | 7 +- .../format/avro/avro_file_batch_reader.h | 4 +- .../avro/avro_file_batch_reader_test.cpp | 22 ++--- .../format/blob/blob_file_batch_reader.h | 6 +- .../blob/blob_file_batch_reader_test.cpp | 14 ++- .../format/lance/lance_file_batch_reader.h | 6 +- .../lance/lance_format_reader_writer_test.cpp | 13 ++- src/paimon/format/orc/orc_file_batch_reader.h | 4 +- .../format/orc/orc_file_batch_reader_test.cpp | 14 +-- .../parquet/parquet_file_batch_reader.cpp | 58 ++++++++++- .../parquet/parquet_file_batch_reader.h | 18 +++- .../parquet_file_batch_reader_test.cpp | 99 +++++++++++++++++-- src/paimon/format/parquet/row_ranges.cpp | 6 ++ src/paimon/format/parquet/row_ranges.h | 2 + .../testing/mock/mock_file_batch_reader.h | 4 +- 27 files changed, 310 insertions(+), 135 deletions(-) diff --git a/src/paimon/common/data/shredding/shared_shredding_file_reader.cpp b/src/paimon/common/data/shredding/shared_shredding_file_reader.cpp index 4c6bbc0ca..c9c249caf 100644 --- a/src/paimon/common/data/shredding/shared_shredding_file_reader.cpp +++ b/src/paimon/common/data/shredding/shared_shredding_file_reader.cpp @@ -469,8 +469,8 @@ void SharedShreddingFileReader::Close() { reader_->Close(); } -Result SharedShreddingFileReader::GetPreviousBatchFirstRowNumber() const { - return reader_->GetPreviousBatchFirstRowNumber(); +Result SharedShreddingFileReader::GetGlobalRowId(uint64_t batch_row_id) const { + return reader_->GetGlobalRowId(batch_row_id); } Result SharedShreddingFileReader::GetNumberOfRows() const { diff --git a/src/paimon/common/data/shredding/shared_shredding_file_reader.h b/src/paimon/common/data/shredding/shared_shredding_file_reader.h index c89e919e2..7998b7ab1 100644 --- a/src/paimon/common/data/shredding/shared_shredding_file_reader.h +++ b/src/paimon/common/data/shredding/shared_shredding_file_reader.h @@ -46,7 +46,7 @@ class SharedShreddingFileReader : public FileBatchReader { void Close() override; - Result GetPreviousBatchFirstRowNumber() const override; + Result GetGlobalRowId(uint64_t batch_row_id) const override; Result GetNumberOfRows() const override; diff --git a/src/paimon/common/file_index/bitmap/apply_bitmap_index_batch_reader.h b/src/paimon/common/file_index/bitmap/apply_bitmap_index_batch_reader.h index 7e2c9338a..66b75bb3e 100644 --- a/src/paimon/common/file_index/bitmap/apply_bitmap_index_batch_reader.h +++ b/src/paimon/common/file_index/bitmap/apply_bitmap_index_batch_reader.h @@ -80,8 +80,8 @@ class ApplyBitmapIndexBatchReader : public FileBatchReader { return Status::Invalid("ApplyBitmapIndexBatchReader does not support SetReadSchema"); } - Result GetPreviousBatchFirstRowNumber() const override { - return reader_->GetPreviousBatchFirstRowNumber(); + Result GetGlobalRowId(uint64_t batch_row_id) const override { + return reader_->GetGlobalRowId(batch_row_id); } Result GetNumberOfRows() const override { @@ -95,11 +95,11 @@ class ApplyBitmapIndexBatchReader : public FileBatchReader { private: Result Filter(int32_t batch_size) const { RoaringBitmap32 is_valid; - PAIMON_ASSIGN_OR_RAISE(int32_t start_pos, reader_->GetPreviousBatchFirstRowNumber()); - int32_t length = batch_size; - for (auto iter = bitmap_.EqualOrLarger(start_pos); - iter != bitmap_.End() && *iter < start_pos + length; ++iter) { - is_valid.Add(*iter - start_pos); + for (int32_t i = 0; i < batch_size; ++i) { + PAIMON_ASSIGN_OR_RAISE(uint64_t global_row_id, reader_->GetGlobalRowId(i)); + if (bitmap_.Contains(global_row_id)) { + is_valid.Add(i); + } } return is_valid; } diff --git a/src/paimon/common/reader/delegating_prefetch_reader.h b/src/paimon/common/reader/delegating_prefetch_reader.h index fe2e3eda2..de92431cc 100644 --- a/src/paimon/common/reader/delegating_prefetch_reader.h +++ b/src/paimon/common/reader/delegating_prefetch_reader.h @@ -54,8 +54,8 @@ class DelegatingPrefetchReader : public FileBatchReader { return prefetch_reader_->SetReadSchema(read_schema, predicate, selection_bitmap); } - Result GetPreviousBatchFirstRowNumber() const override { - return GetReader()->GetPreviousBatchFirstRowNumber(); + Result GetGlobalRowId(uint64_t batch_row_id) const override { + return GetReader()->GetGlobalRowId(batch_row_id); } Result GetNumberOfRows() const override { diff --git a/src/paimon/common/reader/prefetch_file_batch_reader_impl.cpp b/src/paimon/common/reader/prefetch_file_batch_reader_impl.cpp index da74f3484..cf842583c 100644 --- a/src/paimon/common/reader/prefetch_file_batch_reader_impl.cpp +++ b/src/paimon/common/reader/prefetch_file_batch_reader_impl.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include "arrow/array/array_base.h" @@ -40,6 +41,19 @@ class Schema; namespace paimon { +namespace { + +std::pair ComputeBatchSliceByReadRange( + const std::vector& global_row_ids, const std::pair& read_range) { + auto begin_it = + std::lower_bound(global_row_ids.begin(), global_row_ids.end(), read_range.first); + auto end_it = std::lower_bound(global_row_ids.begin(), global_row_ids.end(), read_range.second); + return {static_cast(std::distance(global_row_ids.begin(), begin_it)), + static_cast(std::distance(global_row_ids.begin(), end_it))}; +} + +} // namespace + Result> PrefetchFileBatchReaderImpl::Create( const std::string& data_file_path, const ReaderBuilder* reader_builder, const std::shared_ptr& fs, uint32_t prefetch_max_parallel_num, int32_t batch_size, @@ -265,6 +279,7 @@ Status PrefetchFileBatchReaderImpl::CleanUp() { read_ranges_.clear(); read_ranges_in_group_.clear(); + current_batch_global_row_ids_.clear(); clean_prefetch_queue(); for (size_t i = 0; i < readers_pos_.size(); i++) { readers_pos_[i]->store(0); @@ -409,42 +424,53 @@ Status PrefetchFileBatchReaderImpl::EnsureReaderPosition( Status PrefetchFileBatchReaderImpl::HandleReadResult( size_t reader_idx, const std::pair& read_range, ReadBatchWithBitmap&& read_batch_with_bitmap) { - PAIMON_ASSIGN_OR_RAISE(uint64_t first_row_number, - readers_[reader_idx]->GetPreviousBatchFirstRowNumber()); auto& prefetch_queue = prefetch_queues_[reader_idx]; if (!BatchReader::IsEofBatch(read_batch_with_bitmap)) { auto& [read_batch, bitmap] = read_batch_with_bitmap; auto& [c_array, c_schema] = read_batch; - - if (first_row_number >= read_range.second) { - // fully out of range, data before first_row_number has been filtered out - readers_pos_[reader_idx]->store(first_row_number); + std::vector global_row_ids; + global_row_ids.reserve(c_array->length); + for (int64_t i = 0; i < c_array->length; ++i) { + PAIMON_ASSIGN_OR_RAISE(uint64_t global_row_id, readers_[reader_idx]->GetGlobalRowId(i)); + global_row_ids.push_back(global_row_id); + } + if (global_row_ids.empty()) { ReaderUtils::ReleaseReadBatch(std::move(read_batch)); return Status::OK(); - } else if (first_row_number + c_array->length > read_range.second) { - // partially out of range, data before read_range.second has been effectively consumed + } + auto [slice_begin, slice_end] = ComputeBatchSliceByReadRange(global_row_ids, read_range); + if (slice_begin >= slice_end) { + readers_pos_[reader_idx]->store(read_range.second); + ReaderUtils::ReleaseReadBatch(std::move(read_batch)); + return Status::OK(); + } else if (slice_begin > 0 || slice_end < c_array->length) { readers_pos_[reader_idx]->store(read_range.second); PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr src_array, arrow::ImportArray(c_array.get(), c_schema.get())); - int32_t target_length = read_range.second - first_row_number; - auto array = src_array->Slice(/*offset=*/0, target_length); + auto array = src_array->Slice(slice_begin, slice_end - slice_begin); PAIMON_RETURN_NOT_OK_FROM_ARROW( arrow::ExportArray(*array, c_array.get(), c_schema.get())); - bitmap.RemoveRange(target_length, src_array->length()); + RoaringBitmap32 sliced_bitmap; + for (auto iter = bitmap.EqualOrLarger(slice_begin); + iter != bitmap.End() && *iter < slice_end; ++iter) { + sliced_bitmap.Add(*iter - slice_begin); + } + bitmap = std::move(sliced_bitmap); + global_row_ids = std::vector(global_row_ids.begin() + slice_begin, + global_row_ids.begin() + slice_end); } else { - // all within the range, data before readers_[reader_idx]->GetNextRowToRead() has been - // effectively consumed readers_pos_[reader_idx]->store(readers_[reader_idx]->GetNextRowToRead()); } if (bitmap.IsEmpty()) { ReaderUtils::ReleaseReadBatch(std::move(read_batch)); return Status::OK(); } - prefetch_queue->push({read_range, std::move(read_batch_with_bitmap), first_row_number}); + prefetch_queue->push( + {read_range, std::move(read_batch_with_bitmap), std::move(global_row_ids)}); } else { std::pair eof_range; PAIMON_ASSIGN_OR_RAISE(eof_range, EofRange()); - prefetch_queue->push({eof_range, std::move(read_batch_with_bitmap), first_row_number}); + prefetch_queue->push({eof_range, std::move(read_batch_with_bitmap), {}}); readers_pos_[reader_idx]->store(std::numeric_limits::max()); } return Status::OK(); @@ -527,7 +553,7 @@ Result PrefetchFileBatchReaderImpl::NextBatchW std::unique_lock lock(working_mutex_); cv_.notify_one(); } - previous_batch_first_row_num_ = prefetch_batch.value().previous_batch_first_row_num; + current_batch_global_row_ids_ = std::move(prefetch_batch.value().global_row_ids); return std::move(prefetch_batch).value().batch; } } @@ -537,7 +563,7 @@ Result PrefetchFileBatchReaderImpl::NextBatchW assert(false); return Status::Invalid("peek batch not suppose to be nullptr"); } - previous_batch_first_row_num_ = peek_batch->previous_batch_first_row_num; + // current_batch_global_row_ids_.clear(); return BatchReader::MakeEofBatchWithBitmap(); } if (value_count == prefetch_queues_.size()) { @@ -571,8 +597,11 @@ Result> PrefetchFileBatchReaderImpl::GetFileSchem return readers_[0]->GetFileSchema(); } -Result PrefetchFileBatchReaderImpl::GetPreviousBatchFirstRowNumber() const { - return previous_batch_first_row_num_; +Result PrefetchFileBatchReaderImpl::GetGlobalRowId(uint64_t batch_row_id) const { + if (batch_row_id >= current_batch_global_row_ids_.size()) { + return std::numeric_limits::max(); + } + return current_batch_global_row_ids_[batch_row_id]; } Result PrefetchFileBatchReaderImpl::GetNumberOfRows() const { diff --git a/src/paimon/common/reader/prefetch_file_batch_reader_impl.h b/src/paimon/common/reader/prefetch_file_batch_reader_impl.h index 5ed9fb352..4d99dc843 100644 --- a/src/paimon/common/reader/prefetch_file_batch_reader_impl.h +++ b/src/paimon/common/reader/prefetch_file_batch_reader_impl.h @@ -76,7 +76,7 @@ class PrefetchFileBatchReaderImpl : public PrefetchFileBatchReader { const std::optional& selection_bitmap) override; Status SeekToRow(uint64_t row_number) override; - Result GetPreviousBatchFirstRowNumber() const override; + Result GetGlobalRowId(uint64_t batch_row_id) const override; Result GetNumberOfRows() const override; uint64_t GetNextRowToRead() const override; void Close() override; @@ -105,7 +105,7 @@ class PrefetchFileBatchReaderImpl : public PrefetchFileBatchReader { struct PrefetchBatch { std::pair read_range; BatchReader::ReadBatchWithBitmap batch; - uint64_t previous_batch_first_row_num; + std::vector global_row_ids; }; PrefetchFileBatchReaderImpl( @@ -160,7 +160,7 @@ class PrefetchFileBatchReaderImpl : public PrefetchFileBatchReader { std::unique_ptr background_thread_; Status read_status_; std::atomic is_shutdown_ = false; - uint64_t previous_batch_first_row_num_ = std::numeric_limits::max(); + std::vector current_batch_global_row_ids_; bool need_prefetch_ = false; bool read_ranges_freshed_ = false; const uint32_t prefetch_queue_capacity_; diff --git a/src/paimon/common/reader/prefetch_file_batch_reader_impl_test.cpp b/src/paimon/common/reader/prefetch_file_batch_reader_impl_test.cpp index 7c1acef32..ef78927ca 100644 --- a/src/paimon/common/reader/prefetch_file_batch_reader_impl_test.cpp +++ b/src/paimon/common/reader/prefetch_file_batch_reader_impl_test.cpp @@ -283,11 +283,10 @@ TEST_F(PrefetchFileBatchReaderImplTest, TestSimple) { /*enable_adaptive_prefetch_strategy=*/false, executor_, /*initialize_read_ranges=*/true, /*prefetch_cache_mode=*/PrefetchCacheMode::ALWAYS, CacheConfig(), GetDefaultPool())); - ASSERT_EQ(reader->GetPreviousBatchFirstRowNumber().value(), -1); + ASSERT_EQ(reader->GetGlobalRowId(0).value(), std::numeric_limits::max()); ASSERT_OK_AND_ASSIGN(auto result_array, ReadResultCollector::CollectResult( reader.get(), /*max simulated data processing time*/ 100)); - ASSERT_EQ(reader->GetPreviousBatchFirstRowNumber().value(), 101); auto expected_array = std::make_shared(data_array); ASSERT_TRUE(result_array->Equals(expected_array)); } @@ -605,11 +604,10 @@ TEST_F(PrefetchFileBatchReaderImplTest, TestReadWithLargeBatchSize) { prefetch_max_parallel_num * 2, /*enable_adaptive_prefetch_strategy=*/false, executor_, /*initialize_read_ranges=*/true, /*prefetch_cache_mode=*/PrefetchCacheMode::ALWAYS, CacheConfig(), GetDefaultPool())); - ASSERT_EQ(reader->GetPreviousBatchFirstRowNumber().value(), -1); + ASSERT_EQ(reader->GetGlobalRowId(0).value(), std::numeric_limits::max()); ASSERT_OK_AND_ASSIGN(auto result_array, ReadResultCollector::CollectResult( reader.get(), /*max simulated data processing time*/ 100)); - ASSERT_EQ(reader->GetPreviousBatchFirstRowNumber().value(), 101); auto expected_array = std::make_shared(data_array); ASSERT_TRUE(result_array->Equals(expected_array)); } @@ -633,11 +631,11 @@ TEST_F(PrefetchFileBatchReaderImplTest, TestPartialReaderSuccessRead) { } arrow::ArrayVector result_array_vector; - ASSERT_EQ(reader->GetPreviousBatchFirstRowNumber().value(), -1); + ASSERT_EQ(reader->GetGlobalRowId(0).value(), std::numeric_limits::max()); ASSERT_OK_AND_ASSIGN(auto batch_with_bitmap, reader->NextBatchWithBitmap()); auto& [batch, bitmap] = batch_with_bitmap; ASSERT_EQ(batch.first->length, bitmap.Cardinality()); - ASSERT_EQ(reader->GetPreviousBatchFirstRowNumber().value(), 0); + ASSERT_EQ(reader->GetGlobalRowId(0).value(), 0); ASSERT_OK_AND_ASSIGN(auto array, ReadResultCollector::GetArray(std::move(batch))); result_array_vector.push_back(array); ASSERT_OK(prefetch_reader->GetReadStatus()); @@ -678,9 +676,9 @@ TEST_F(PrefetchFileBatchReaderImplTest, TestAllReaderFailedWithIOError) { ->SetNextBatchStatus(Status::IOError("mock error")); } - ASSERT_EQ(reader->GetPreviousBatchFirstRowNumber().value(), -1); + ASSERT_EQ(reader->GetGlobalRowId(0).value(), std::numeric_limits::max()); auto batch_result = reader->NextBatchWithBitmap(); - ASSERT_EQ(reader->GetPreviousBatchFirstRowNumber().value(), -1); + ASSERT_EQ(reader->GetGlobalRowId(0).value(), std::numeric_limits::max()); ASSERT_FALSE(batch_result.ok()); ASSERT_TRUE(batch_result.status().IsIOError()); ASSERT_FALSE(prefetch_reader->is_shutdown_); @@ -689,7 +687,7 @@ TEST_F(PrefetchFileBatchReaderImplTest, TestAllReaderFailedWithIOError) { // call NextBatch again, will still return error status auto batch_result2 = reader->NextBatchWithBitmap(); - ASSERT_EQ(reader->GetPreviousBatchFirstRowNumber().value(), -1); + ASSERT_EQ(reader->GetGlobalRowId(0).value(), std::numeric_limits::max()); ASSERT_FALSE(batch_result2.ok()); ASSERT_TRUE(batch_result2.status().IsIOError()); } @@ -706,11 +704,11 @@ TEST_F(PrefetchFileBatchReaderImplTest, TestPrefetchWithEmptyData) { prefetch_max_parallel_num * 2, /*enable_adaptive_prefetch_strategy=*/false, executor_, /*initialize_read_ranges=*/true, /*prefetch_cache_mode=*/PrefetchCacheMode::ALWAYS, CacheConfig(), GetDefaultPool())); - ASSERT_EQ(reader->GetPreviousBatchFirstRowNumber().value(), -1); + ASSERT_EQ(reader->GetGlobalRowId(0).value(), std::numeric_limits::max()); ASSERT_OK_AND_ASSIGN(auto result_array, ReadResultCollector::CollectResult( reader.get(), /*max simulated data processing time*/ 100)); - ASSERT_EQ(reader->GetPreviousBatchFirstRowNumber().value(), 0); + ASSERT_EQ(reader->GetGlobalRowId(0).value(), std::numeric_limits::max()); ASSERT_FALSE(result_array); } @@ -726,11 +724,10 @@ TEST_F(PrefetchFileBatchReaderImplTest, TestCallNextBatchAfterReadingEof) { prefetch_max_parallel_num * 2, /*enable_adaptive_prefetch_strategy=*/false, executor_, /*initialize_read_ranges=*/true, /*prefetch_cache_mode=*/PrefetchCacheMode::ALWAYS, CacheConfig(), GetDefaultPool())); - ASSERT_EQ(reader->GetPreviousBatchFirstRowNumber().value(), -1); + ASSERT_EQ(reader->GetGlobalRowId(0).value(), std::numeric_limits::max()); ASSERT_OK_AND_ASSIGN(auto result_array, ReadResultCollector::CollectResult( reader.get(), /*max simulated data processing time*/ 100)); - ASSERT_EQ(reader->GetPreviousBatchFirstRowNumber().value(), 10); auto expected_array = std::make_shared(data_array); ASSERT_TRUE(result_array->Equals(expected_array)); @@ -832,11 +829,11 @@ TEST_P(PrefetchFileBatchReaderImplTest, TestPrefetchWithPredicatePushdownWithCom PreparePrefetchReader(file_format, schema.get(), predicate, /*selection_bitmap=*/std::nullopt, /*batch_size=*/10, /*prefetch_max_parallel_num=*/3, cache_mode); - ASSERT_EQ(reader->GetPreviousBatchFirstRowNumber().value(), -1); + ASSERT_EQ(reader->GetGlobalRowId(0).value(), std::numeric_limits::max()); ASSERT_OK_AND_ASSIGN(auto result_array, ReadResultCollector::CollectResult( reader.get(), /*max simulated data processing time*/ 100)); - ASSERT_EQ(reader->GetPreviousBatchFirstRowNumber().value(), 90); + ASSERT_EQ(reader->GetGlobalRowId(0).value(), 80); arrow::ArrayVector expected_array_vector; expected_array_vector.push_back(data_array->Slice(0, 30)); @@ -868,11 +865,11 @@ TEST_P(PrefetchFileBatchReaderImplTest, /*selection_bitmap=*/std::nullopt, /*batch_size=*/10, /*prefetch_max_parallel_num=*/3, cache_mode); ASSERT_OK(reader->RefreshReadRanges()); - ASSERT_EQ(reader->GetPreviousBatchFirstRowNumber().value(), -1); + ASSERT_EQ(reader->GetGlobalRowId(0).value(), std::numeric_limits::max()); ASSERT_OK_AND_ASSIGN(auto result_array, ReadResultCollector::CollectResult( reader.get(), /*max simulated data processing time*/ 100)); - ASSERT_EQ(reader->GetPreviousBatchFirstRowNumber().value(), 90); + ASSERT_EQ(reader->GetGlobalRowId(0).value(), 80); arrow::ArrayVector expected_array_vector; expected_array_vector.push_back(data_array->Slice(0, 20)); diff --git a/src/paimon/core/deletionvectors/apply_deletion_vector_batch_reader.h b/src/paimon/core/deletionvectors/apply_deletion_vector_batch_reader.h index 5e2ecf279..f6d824a76 100644 --- a/src/paimon/core/deletionvectors/apply_deletion_vector_batch_reader.h +++ b/src/paimon/core/deletionvectors/apply_deletion_vector_batch_reader.h @@ -82,8 +82,8 @@ class ApplyDeletionVectorBatchReader : public FileBatchReader { return Status::Invalid("ApplyDeletionVectorBatchReader does not support SetReadSchema"); } - Result GetPreviousBatchFirstRowNumber() const override { - return reader_->GetPreviousBatchFirstRowNumber(); + Result GetGlobalRowId(uint64_t batch_row_id) const override { + return reader_->GetGlobalRowId(batch_row_id); } Result GetNumberOfRows() const override { @@ -96,9 +96,15 @@ class ApplyDeletionVectorBatchReader : public FileBatchReader { private: Result Filter(int32_t batch_size) const { - PAIMON_ASSIGN_OR_RAISE(uint64_t previous_batch_first_row_number, - reader_->GetPreviousBatchFirstRowNumber()); - return deletion_vector_->IsValid(previous_batch_first_row_number, batch_size); + RoaringBitmap32 is_valid; + for (int32_t i = 0; i < batch_size; ++i) { + PAIMON_ASSIGN_OR_RAISE(uint64_t global_row_id, reader_->GetGlobalRowId(i)); + PAIMON_ASSIGN_OR_RAISE(bool is_deleted, deletion_vector_->IsDeleted(global_row_id)); + if (!is_deleted) { + is_valid.Add(i); + } + } + return is_valid; } private: diff --git a/src/paimon/core/io/complete_row_tracking_fields_reader.cpp b/src/paimon/core/io/complete_row_tracking_fields_reader.cpp index 2aef9b29f..8ed53615a 100644 --- a/src/paimon/core/io/complete_row_tracking_fields_reader.cpp +++ b/src/paimon/core/io/complete_row_tracking_fields_reader.cpp @@ -86,15 +86,13 @@ CompleteRowTrackingFieldsBatchReader::NextBatchWithBitmap() { std::string row_id_field_name = SpecialFields::RowId().Name(); if (read_schema_->GetFieldIndex(row_id_field_name) != -1) { row_id_array = src_struct_array->GetFieldByName(row_id_field_name); - PAIMON_ASSIGN_OR_RAISE(uint64_t previous_batch_first_row_number, - reader_->GetPreviousBatchFirstRowNumber()); - auto row_id_convert_func = [previous_batch_first_row_number, - this](int32_t idx_in_array) -> Result { + auto row_id_convert_func = [this](int32_t idx_in_array) -> Result { if (first_row_id_ == std::nullopt) { return Status::Invalid( "unexpected: read _ROW_ID special field, but first row id is null in meta"); } - return first_row_id_.value() + previous_batch_first_row_number + idx_in_array; + PAIMON_ASSIGN_OR_RAISE(uint64_t global_row_id, reader_->GetGlobalRowId(idx_in_array)); + return first_row_id_.value() + global_row_id; }; PAIMON_RETURN_NOT_OK(ConvertRowTrackingField(src_struct_array->length(), /*init_value=*/0, row_id_convert_func, &row_id_array)); diff --git a/src/paimon/core/io/complete_row_tracking_fields_reader.h b/src/paimon/core/io/complete_row_tracking_fields_reader.h index cc2f9f7bf..6f49fea90 100644 --- a/src/paimon/core/io/complete_row_tracking_fields_reader.h +++ b/src/paimon/core/io/complete_row_tracking_fields_reader.h @@ -60,8 +60,8 @@ class CompleteRowTrackingFieldsBatchReader : public FileBatchReader { reader_->Close(); } - Result GetPreviousBatchFirstRowNumber() const override { - return reader_->GetPreviousBatchFirstRowNumber(); + Result GetGlobalRowId(uint64_t batch_row_id) const override { + return reader_->GetGlobalRowId(batch_row_id); } Result GetNumberOfRows() const override { diff --git a/src/paimon/core/io/field_mapping_reader.h b/src/paimon/core/io/field_mapping_reader.h index da1897814..39c02dcaf 100644 --- a/src/paimon/core/io/field_mapping_reader.h +++ b/src/paimon/core/io/field_mapping_reader.h @@ -74,8 +74,8 @@ class FieldMappingReader : public FileBatchReader { return Status::Invalid("FieldMappingReader does not support SetReadSchema"); } - Result GetPreviousBatchFirstRowNumber() const override { - return reader_->GetPreviousBatchFirstRowNumber(); + Result GetGlobalRowId(uint64_t batch_row_id) const override { + return reader_->GetGlobalRowId(batch_row_id); } Result GetNumberOfRows() const override { diff --git a/src/paimon/core/io/key_value_data_file_record_reader.cpp b/src/paimon/core/io/key_value_data_file_record_reader.cpp index a4edd04e0..ea5deb5dd 100644 --- a/src/paimon/core/io/key_value_data_file_record_reader.cpp +++ b/src/paimon/core/io/key_value_data_file_record_reader.cpp @@ -81,15 +81,14 @@ Result KeyValueDataFileRecordReader::Iterator::Next() { Result> KeyValueDataFileRecordReader::Iterator::NextWithFilePos() { PAIMON_ASSIGN_OR_RAISE(KeyValue kv, Next()); - return std::make_pair(previous_batch_first_row_number_ + cursor_ - 1, std::move(kv)); + PAIMON_ASSIGN_OR_RAISE(uint64_t global_row_id, reader_->reader_->GetGlobalRowId(cursor_ - 1)); + return std::make_pair(static_cast(global_row_id), std::move(kv)); } Result> KeyValueDataFileRecordReader::NextBatch() { Reset(); PAIMON_ASSIGN_OR_RAISE(BatchReader::ReadBatchWithBitmap batch_with_bitmap, reader_->NextBatchWithBitmap()); - PAIMON_ASSIGN_OR_RAISE(int64_t previous_batch_first_row_number, - reader_->GetPreviousBatchFirstRowNumber()); if (BatchReader::IsEofBatch(batch_with_bitmap)) { // reader eof, just return return std::unique_ptr(); @@ -140,8 +139,7 @@ Result> KeyValueDataFileRecordRe key_ctx_ = std::make_shared(key_fields, pool_); value_ctx_ = std::make_shared(value_fields, pool_); ArrowUtils::TraverseArray(data_batch); - return std::make_unique( - this, previous_batch_first_row_number); + return std::make_unique(this); } void KeyValueDataFileRecordReader::Reset() { diff --git a/src/paimon/core/io/key_value_data_file_record_reader.h b/src/paimon/core/io/key_value_data_file_record_reader.h index c271a3bdb..3fe87a89e 100644 --- a/src/paimon/core/io/key_value_data_file_record_reader.h +++ b/src/paimon/core/io/key_value_data_file_record_reader.h @@ -54,16 +54,13 @@ class KeyValueDataFileRecordReader : public KeyValueRecordReader { class Iterator : public KeyValueRecordReader::Iterator { public: - Iterator(KeyValueDataFileRecordReader* reader, int64_t previous_batch_first_row_number) - : previous_batch_first_row_number_(previous_batch_first_row_number), - reader_(reader), - selection_cardinality_(reader->selection_bitmap_.Cardinality()) {} + explicit Iterator(KeyValueDataFileRecordReader* reader) + : reader_(reader), selection_cardinality_(reader->selection_bitmap_.Cardinality()) {} Result HasNext() const override; Result Next() override; Result> NextWithFilePos(); private: - int64_t previous_batch_first_row_number_; mutable int64_t cursor_ = 0; KeyValueDataFileRecordReader* reader_ = nullptr; int64_t selection_cardinality_ = 0; diff --git a/src/paimon/format/avro/avro_file_batch_reader.h b/src/paimon/format/avro/avro_file_batch_reader.h index 98d5deede..355e849a6 100644 --- a/src/paimon/format/avro/avro_file_batch_reader.h +++ b/src/paimon/format/avro/avro_file_batch_reader.h @@ -45,8 +45,8 @@ class AvroFileBatchReader : public FileBatchReader { Status SetReadSchema(::ArrowSchema* read_schema, const std::shared_ptr& predicate, const std::optional& selection_bitmap) override; - Result GetPreviousBatchFirstRowNumber() const override { - return previous_first_row_; + Result GetGlobalRowId(uint64_t batch_row_id) const override { + return previous_first_row_ + batch_row_id; } Result GetNumberOfRows() const override; diff --git a/src/paimon/format/avro/avro_file_batch_reader_test.cpp b/src/paimon/format/avro/avro_file_batch_reader_test.cpp index f4f052a34..bd2b1e1fd 100644 --- a/src/paimon/format/avro/avro_file_batch_reader_test.cpp +++ b/src/paimon/format/avro/avro_file_batch_reader_test.cpp @@ -327,7 +327,7 @@ TEST_F(AvroFileBatchReaderTest, TestSetReadSchemaRejectNestedSubFieldProjection) "does not support nested sub-field projection"); } -TEST_F(AvroFileBatchReaderTest, TestGetPreviousBatchFirstRowNumber) { +TEST_F(AvroFileBatchReaderTest, TestGetGlobalRowId) { std::string path = paimon::test::GetDataDir() + "/avro/append_simple.db/" "append_simple/bucket-0/" @@ -352,26 +352,25 @@ TEST_F(AvroFileBatchReaderTest, TestGetPreviousBatchFirstRowNumber) { ASSERT_OK_AND_ASSIGN(auto num_rows, reader->GetNumberOfRows()); ASSERT_EQ(4, num_rows); - ASSERT_EQ(std::numeric_limits::max(), - reader->GetPreviousBatchFirstRowNumber().value()); + ASSERT_EQ(std::numeric_limits::max(), reader->GetGlobalRowId(0).value()); ASSERT_OK_AND_ASSIGN(auto batch1, reader->NextBatch()); ArrowArrayRelease(batch1.first.get()); ArrowSchemaRelease(batch1.second.get()); - ASSERT_EQ(0, reader->GetPreviousBatchFirstRowNumber().value()); + ASSERT_EQ(0, reader->GetGlobalRowId(0).value()); ASSERT_OK_AND_ASSIGN(auto batch2, reader->NextBatch()); - ASSERT_EQ(1, reader->GetPreviousBatchFirstRowNumber().value()); + ASSERT_EQ(1, reader->GetGlobalRowId(0).value()); ArrowArrayRelease(batch2.first.get()); ArrowSchemaRelease(batch2.second.get()); ASSERT_OK_AND_ASSIGN(auto batch3, reader->NextBatch()); - ASSERT_EQ(2, reader->GetPreviousBatchFirstRowNumber().value()); + ASSERT_EQ(2, reader->GetGlobalRowId(0).value()); ArrowArrayRelease(batch3.first.get()); ArrowSchemaRelease(batch3.second.get()); ASSERT_OK_AND_ASSIGN(auto batch4, reader->NextBatch()); - ASSERT_EQ(3, reader->GetPreviousBatchFirstRowNumber().value()); + ASSERT_EQ(3, reader->GetGlobalRowId(0).value()); ArrowArrayRelease(batch4.first.get()); ArrowSchemaRelease(batch4.second.get()); ASSERT_OK_AND_ASSIGN(auto batch5, reader->NextBatch()); - ASSERT_EQ(4, reader->GetPreviousBatchFirstRowNumber().value()); + ASSERT_EQ(4, reader->GetGlobalRowId(0).value()); ASSERT_TRUE(BatchReader::IsEofBatch(batch5)); } @@ -397,7 +396,7 @@ TEST_F(AvroFileBatchReaderTest, TestSetReadSchemaResetsReaderToFirstRow) { ASSERT_OK_AND_ASSIGN(auto reader, reader_builder->Build(in)); ASSERT_OK_AND_ASSIGN(auto first_batch, reader->NextBatch()); - ASSERT_EQ(0, reader->GetPreviousBatchFirstRowNumber().value()); + ASSERT_EQ(0, reader->GetGlobalRowId(0).value()); auto first_array = arrow::ImportArray(first_batch.first.get(), first_batch.second.get()).ValueOrDie(); ASSERT_TRUE(first_array->Equals(src_array->Slice(0, 2))) << first_array->ToString(); @@ -407,11 +406,10 @@ TEST_F(AvroFileBatchReaderTest, TestSetReadSchemaResetsReaderToFirstRow) { ASSERT_TRUE(arrow::ExportSchema(*read_schema, c_schema.get()).ok()); ASSERT_OK(reader->SetReadSchema(c_schema.get(), /*predicate=*/nullptr, /*selection_bitmap=*/std::nullopt)); - ASSERT_EQ(std::numeric_limits::max(), - reader->GetPreviousBatchFirstRowNumber().value()); + ASSERT_EQ(std::numeric_limits::max(), reader->GetGlobalRowId(0).value()); ASSERT_OK_AND_ASSIGN(auto projected_batch, reader->NextBatch()); - ASSERT_EQ(0, reader->GetPreviousBatchFirstRowNumber().value()); + ASSERT_EQ(0, reader->GetGlobalRowId(0).value()); auto projected_array = arrow::ImportArray(projected_batch.first.get(), projected_batch.second.get()).ValueOrDie(); auto expected_projected_array = arrow::ipc::internal::json::ArrayFromJSON( diff --git a/src/paimon/format/blob/blob_file_batch_reader.h b/src/paimon/format/blob/blob_file_batch_reader.h index 06287d759..0c04b43f3 100644 --- a/src/paimon/format/blob/blob_file_batch_reader.h +++ b/src/paimon/format/blob/blob_file_batch_reader.h @@ -97,14 +97,14 @@ class BlobFileBatchReader : public FileBatchReader { Result NextBatch() override; - Result GetPreviousBatchFirstRowNumber() const override { + Result GetGlobalRowId(uint64_t batch_row_id) const override { if (all_blob_lengths_.size() != target_blob_lengths_.size()) { return Status::Invalid( - "Cannot call GetPreviousBatchFirstRowNumber in BlobFileBatchReader because, after " + "Cannot call GetGlobalRowId in BlobFileBatchReader because, after " "bitmap pushdown, rows in the array returned by NextBatch are no longer " "contiguous."); } - return previous_batch_first_row_number_; + return previous_batch_first_row_number_ + batch_row_id; } Result GetNumberOfRows() const override { diff --git a/src/paimon/format/blob/blob_file_batch_reader_test.cpp b/src/paimon/format/blob/blob_file_batch_reader_test.cpp index bde27d64d..f3687eb5e 100644 --- a/src/paimon/format/blob/blob_file_batch_reader_test.cpp +++ b/src/paimon/format/blob/blob_file_batch_reader_test.cpp @@ -169,22 +169,21 @@ TEST_F(BlobFileBatchReaderTest, TestRowNumbers) { ASSERT_OK(reader->SetReadSchema(&c_schema, nullptr, std::nullopt)); ASSERT_OK_AND_ASSIGN(auto number_of_rows, reader->GetNumberOfRows()); ASSERT_EQ(3, number_of_rows); - ASSERT_EQ(std::numeric_limits::max(), - reader->GetPreviousBatchFirstRowNumber().value()); + ASSERT_EQ(std::numeric_limits::max(), reader->GetGlobalRowId(0).value()); ASSERT_OK_AND_ASSIGN(auto batch1, reader->NextBatch()); ArrowArrayRelease(batch1.first.get()); ArrowSchemaRelease(batch1.second.get()); - ASSERT_EQ(0, reader->GetPreviousBatchFirstRowNumber().value()); + ASSERT_EQ(0, reader->GetGlobalRowId(0).value()); ASSERT_OK_AND_ASSIGN(auto batch2, reader->NextBatch()); - ASSERT_EQ(1, reader->GetPreviousBatchFirstRowNumber().value()); + ASSERT_EQ(1, reader->GetGlobalRowId(0).value()); ArrowArrayRelease(batch2.first.get()); ArrowSchemaRelease(batch2.second.get()); ASSERT_OK_AND_ASSIGN(auto batch3, reader->NextBatch()); - ASSERT_EQ(2, reader->GetPreviousBatchFirstRowNumber().value()); + ASSERT_EQ(2, reader->GetGlobalRowId(0).value()); ArrowArrayRelease(batch3.first.get()); ArrowSchemaRelease(batch3.second.get()); ASSERT_OK_AND_ASSIGN(auto batch4, reader->NextBatch()); - ASSERT_EQ(3, reader->GetPreviousBatchFirstRowNumber().value()); + ASSERT_EQ(3, reader->GetGlobalRowId(0).value()); ASSERT_TRUE(BatchReader::IsEofBatch(batch4)); } @@ -255,8 +254,7 @@ TEST_P(BlobFileBatchReaderTest, EmptyFile) { ASSERT_OK(reader->SetReadSchema(&c_schema, nullptr, std::nullopt)); ASSERT_OK_AND_ASSIGN(auto number_of_rows, reader->GetNumberOfRows()); ASSERT_EQ(0, number_of_rows); - ASSERT_EQ(std::numeric_limits::max(), - reader->GetPreviousBatchFirstRowNumber().value()); + ASSERT_EQ(std::numeric_limits::max(), reader->GetGlobalRowId(0).value()); ASSERT_OK_AND_ASSIGN(auto batch, reader->NextBatch()); ASSERT_TRUE(BatchReader::IsEofBatch(batch)); } diff --git a/src/paimon/format/lance/lance_file_batch_reader.h b/src/paimon/format/lance/lance_file_batch_reader.h index fb2628035..361561411 100644 --- a/src/paimon/format/lance/lance_file_batch_reader.h +++ b/src/paimon/format/lance/lance_file_batch_reader.h @@ -41,15 +41,15 @@ class LanceFileBatchReader : public FileBatchReader { Result NextBatch() override; - Result GetPreviousBatchFirstRowNumber() const override { + Result GetGlobalRowId(uint64_t batch_row_id) const override { if (!read_row_ids_.empty() && read_row_ids_.size() != num_rows_) { // TODO(xinyu.lxy): support function return Status::Invalid( - "Cannot call GetPreviousBatchFirstRowNumber in LanceFileBatchReader because, after " + "Cannot call GetGlobalRowId in LanceFileBatchReader because, after " "bitmap pushdown, rows in the array returned by NextBatch are no longer " "contiguous."); } - return previous_batch_first_row_num_; + return previous_batch_first_row_num_ + batch_row_id; } Result GetNumberOfRows() const override { diff --git a/src/paimon/format/lance/lance_format_reader_writer_test.cpp b/src/paimon/format/lance/lance_format_reader_writer_test.cpp index b1ad6be73..eab303905 100644 --- a/src/paimon/format/lance/lance_format_reader_writer_test.cpp +++ b/src/paimon/format/lance/lance_format_reader_writer_test.cpp @@ -478,27 +478,26 @@ TEST_F(LanceFileReaderWriterTest, TestPreviousBatchFirstRowNumber) { ASSERT_OK_AND_ASSIGN( std::unique_ptr reader, LanceFileBatchReader::Create(file_path, /*batch_size=*/4, /*batch_readahead=*/2)); - ASSERT_EQ(std::numeric_limits::max(), - reader->GetPreviousBatchFirstRowNumber().value()); + ASSERT_EQ(std::numeric_limits::max(), reader->GetGlobalRowId(0).value()); // first batch row 0-3 ASSERT_OK_AND_ASSIGN(auto read_batch, reader->NextBatch()); ASSERT_OK_AND_ASSIGN(auto read_array, paimon::test::ReadResultCollector::GetArray(std::move(read_batch))); ASSERT_TRUE(read_array->Equals(array->Slice(0, 4))); - ASSERT_EQ(0, reader->GetPreviousBatchFirstRowNumber().value()); + ASSERT_EQ(0, reader->GetGlobalRowId(0).value()); // second batch 4-5 ASSERT_OK_AND_ASSIGN(read_batch, reader->NextBatch()); ASSERT_OK_AND_ASSIGN(read_array, paimon::test::ReadResultCollector::GetArray(std::move(read_batch))); ASSERT_TRUE(read_array->Equals(array->Slice(4, 2))); - ASSERT_EQ(4, reader->GetPreviousBatchFirstRowNumber().value()); + ASSERT_EQ(4, reader->GetGlobalRowId(0).value()); // eof ASSERT_OK_AND_ASSIGN(read_batch, reader->NextBatch()); ASSERT_TRUE(BatchReader::IsEofBatch(read_batch)); - ASSERT_EQ(6, reader->GetPreviousBatchFirstRowNumber().value()); + ASSERT_EQ(6, reader->GetGlobalRowId(0).value()); // test with bitmap pushdown ArrowSchema c_read_schema; @@ -506,8 +505,8 @@ TEST_F(LanceFileReaderWriterTest, TestPreviousBatchFirstRowNumber) { ASSERT_OK(reader->SetReadSchema(&c_read_schema, /*predicate=*/nullptr, /*selection_bitmap=*/RoaringBitmap32::From({0, 3}))); ASSERT_NOK_WITH_MSG( - reader->GetPreviousBatchFirstRowNumber(), - "Cannot call GetPreviousBatchFirstRowNumber in LanceFileBatchReader because, after bitmap " + reader->GetGlobalRowId(0), + "Cannot call GetGlobalRowId in LanceFileBatchReader because, after bitmap " "pushdown, rows in the array returned by NextBatch are no longer contiguous."); } } // namespace paimon::lance::test diff --git a/src/paimon/format/orc/orc_file_batch_reader.h b/src/paimon/format/orc/orc_file_batch_reader.h index c2460f3a7..0e3fade23 100644 --- a/src/paimon/format/orc/orc_file_batch_reader.h +++ b/src/paimon/format/orc/orc_file_batch_reader.h @@ -62,8 +62,8 @@ class OrcFileBatchReader : public PrefetchFileBatchReader { // OrcFileBatchReader. Therefore, we need to hold BatchReader when using output ArrowArray. Result NextBatch() override; - Result GetPreviousBatchFirstRowNumber() const override { - return reader_->GetRowNumber(); + Result GetGlobalRowId(uint64_t batch_row_id) const override { + return reader_->GetRowNumber() + batch_row_id; } Result GetNumberOfRows() const override { diff --git a/src/paimon/format/orc/orc_file_batch_reader_test.cpp b/src/paimon/format/orc/orc_file_batch_reader_test.cpp index de80861b6..57665fa0d 100644 --- a/src/paimon/format/orc/orc_file_batch_reader_test.cpp +++ b/src/paimon/format/orc/orc_file_batch_reader_test.cpp @@ -492,10 +492,10 @@ TEST_P(OrcFileBatchReaderTest, TestNextBatchSimple) { for (auto batch_size : {1, 2, 3, 5, 8, 10}) { auto orc_batch_reader = PrepareOrcFileBatchReader(file_name, &read_schema, batch_size, natural_read_size); - ASSERT_EQ(orc_batch_reader->GetPreviousBatchFirstRowNumber().value(), -1); + ASSERT_EQ(orc_batch_reader->GetGlobalRowId(0).value(), -1); ASSERT_OK_AND_ASSIGN(auto result_array, paimon::test::ReadResultCollector::CollectResult( orc_batch_reader.get())); - ASSERT_EQ(orc_batch_reader->GetPreviousBatchFirstRowNumber().value(), 8); + ASSERT_EQ(orc_batch_reader->GetGlobalRowId(0).value(), 8); orc_batch_reader->Close(); auto expected_array = std::make_shared(struct_array_); ASSERT_TRUE(result_array->Equals(expected_array)); @@ -766,18 +766,18 @@ TEST_F(OrcFileBatchReaderTest, TestReadNoField) { auto orc_batch_reader = PrepareOrcFileBatchReader(file_name, &read_schema, /*batch_size=*/3, /*natural_read_size=*/10); // read 3 rows - ASSERT_EQ(orc_batch_reader->GetPreviousBatchFirstRowNumber().value(), -1); + ASSERT_EQ(orc_batch_reader->GetGlobalRowId(0).value(), -1); ASSERT_OK_AND_ASSIGN(auto batch1, orc_batch_reader->NextBatch()); - ASSERT_EQ(orc_batch_reader->GetPreviousBatchFirstRowNumber().value(), 0); + ASSERT_EQ(orc_batch_reader->GetGlobalRowId(0).value(), 0); // read 3 rows ASSERT_OK_AND_ASSIGN(auto batch2, orc_batch_reader->NextBatch()); - ASSERT_EQ(orc_batch_reader->GetPreviousBatchFirstRowNumber().value(), 3); + ASSERT_EQ(orc_batch_reader->GetGlobalRowId(0).value(), 3); // read 2 rows ASSERT_OK_AND_ASSIGN(auto batch3, orc_batch_reader->NextBatch()); - ASSERT_EQ(orc_batch_reader->GetPreviousBatchFirstRowNumber().value(), 6); + ASSERT_EQ(orc_batch_reader->GetGlobalRowId(0).value(), 6); // read rows with eof ASSERT_OK_AND_ASSIGN(auto batch4, orc_batch_reader->NextBatch()); - ASSERT_EQ(orc_batch_reader->GetPreviousBatchFirstRowNumber().value(), 8); + ASSERT_EQ(orc_batch_reader->GetGlobalRowId(0).value(), 8); ASSERT_TRUE(BatchReader::IsEofBatch(batch4)); orc_batch_reader->Close(); diff --git a/src/paimon/format/parquet/parquet_file_batch_reader.cpp b/src/paimon/format/parquet/parquet_file_batch_reader.cpp index 5e6893c39..16c79dd0e 100644 --- a/src/paimon/format/parquet/parquet_file_batch_reader.cpp +++ b/src/paimon/format/parquet/parquet_file_batch_reader.cpp @@ -39,6 +39,7 @@ #include "paimon/common/metrics/metrics_impl.h" #include "paimon/common/utils/arrow/status_utils.h" #include "paimon/common/utils/options_utils.h" +#include "paimon/common/utils/scope_guard.h" #include "paimon/common/utils/string_utils.h" #include "paimon/core/schema/arrow_schema_validator.h" #include "paimon/core/utils/nested_projection_utils.h" @@ -213,11 +214,13 @@ Status ParquetFileBatchReader::SetReadSchema( target_row_groups.emplace_back(/*rg_index=*/rg_id, /*is_partially_matched=*/true, /*ranges=*/it->second); } else { - target_row_groups.emplace_back(/*rg_index=*/rg_id, - /*is_partially_matched=*/false, - /*ranges=*/RowRanges()); + target_row_groups.emplace_back( + /*rg_index=*/rg_id, + /*is_partially_matched=*/false, + /*ranges=*/RowRanges(Range(0, reader_->GetAllRowGroupRanges()[rg_id].second))); } } + PAIMON_ASSIGN_OR_RAISE(all_row_ranges_, GetAllTargetRowRanges(target_row_groups)); PAIMON_RETURN_NOT_OK(reader_->PrepareForReadingLazy(target_row_groups, column_indices)); } PAIMON_PARQUET_CATCH_AND_RETURN_STATUS("ParquetFileBatchReader::SetReadSchema") @@ -361,6 +364,7 @@ Result ParquetFileBatchReader::NextBatch() { "equal with read schema {}", array->type()->ToString(), read_data_type_->ToString())); } + PAIMON_RETURN_NOT_OK(GenerateRowMapping(array->length())); std::unique_ptr c_array = std::make_unique(); std::unique_ptr c_schema = std::make_unique(); PAIMON_RETURN_NOT_OK_FROM_ARROW(arrow::ExportArray(*array, c_array.get(), c_schema.get())); @@ -526,4 +530,52 @@ Result> ParquetFileBatchReader::ComputeNestedColumnIndices( return indices; } +Result ParquetFileBatchReader::GetAllTargetRowRanges( + const std::vector& target_row_groups) const { + auto all_row_group_ranges = reader_->GetAllRowGroupRanges(); + RowRanges all_ranges; + for (const auto& target_row_group : target_row_groups) { + for (const auto& range : target_row_group.row_ranges.GetRanges()) { + all_ranges.Add( + Range(range.from + all_row_group_ranges[target_row_group.row_group_index].first, + range.to + all_row_group_ranges[target_row_group.row_group_index].first)); + } + } + return all_ranges; +} + +Status ParquetFileBatchReader::GenerateRowMapping(int64_t batch_length) { + const std::vector& all_ranges = all_row_ranges_.GetRanges(); + PAIMON_ASSIGN_OR_RAISE(int64_t batch_start_row, reader_->GetPreviousBatchFirstRowNumber()); + + auto cur_range_it = + std::upper_bound(all_ranges.begin(), all_ranges.end(), batch_start_row, + [](int64_t value, const Range& r) { return value < r.from; }); + if (cur_range_it == all_ranges.begin()) { + return Status::Invalid("No range found!"); + } + --cur_range_it; + if (batch_start_row < cur_range_it->from || batch_start_row > cur_range_it->to) { + return Status::Invalid( + fmt::format("Batch start row {} is not in the current range [{}, {}]!", batch_start_row, + cur_range_it->from, cur_range_it->to)); + } + + std::vector row_mapping; + row_mapping.reserve(batch_length); + int64_t global_row = batch_start_row; + for (int64_t i = 0; i < batch_length; ++i) { + if (global_row > cur_range_it->to) { + ++cur_range_it; + if (cur_range_it == all_ranges.end()) { + return Status::Invalid("Batch length exceeds the total row ranges!"); + } + global_row = cur_range_it->from; + } + row_mapping.push_back(global_row); + global_row++; + } + row_mapping_ = std::move(row_mapping); + return Status::OK(); +} } // namespace paimon::parquet diff --git a/src/paimon/format/parquet/parquet_file_batch_reader.h b/src/paimon/format/parquet/parquet_file_batch_reader.h index 4bd684e8f..c89669632 100644 --- a/src/paimon/format/parquet/parquet_file_batch_reader.h +++ b/src/paimon/format/parquet/parquet_file_batch_reader.h @@ -16,6 +16,8 @@ #pragma once +#include + #include #include #include @@ -94,9 +96,11 @@ class ParquetFileBatchReader : public PrefetchFileBatchReader { Result>> GenReadRanges( bool* need_prefetch) const override; - Result GetPreviousBatchFirstRowNumber() const override { - assert(reader_); - return reader_->GetPreviousBatchFirstRowNumber(); + Result GetGlobalRowId(uint64_t batch_row_id) const override { + if (batch_row_id >= row_mapping_.size()) { + return std::numeric_limits::max(); + } + return row_mapping_[batch_row_id]; } Result GetNumberOfRows() const override { @@ -173,6 +177,9 @@ class ParquetFileBatchReader : public PrefetchFileBatchReader { const std::shared_ptr& read_schema, const std::shared_ptr& file_schema); + Result GetAllTargetRowRanges( + const std::vector& target_row_groups) const; + // precondition: predicate supposed not be empty Result> FilterRowGroupsByPredicate( const std::shared_ptr& predicate, @@ -189,6 +196,8 @@ class ParquetFileBatchReader : public PrefetchFileBatchReader { const std::map& column_name_to_index, const std::vector& src_row_groups); + Status GenerateRowMapping(int64_t batch_length); + private: std::map options_; // hold the lifecycle of arrow memory pool. @@ -204,6 +213,9 @@ class ParquetFileBatchReader : public PrefetchFileBatchReader { uint64_t read_rows_ = 0; uint64_t read_batch_count_ = 0; + + RowRanges all_row_ranges_; + std::vector row_mapping_; }; } // namespace paimon::parquet diff --git a/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp b/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp index ec353f071..4699fdff0 100644 --- a/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp +++ b/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp @@ -168,12 +168,15 @@ class ParquetFileBatchReaderTest : public ::testing::Test, void WriteArray(const std::string& file_path, const std::shared_ptr& src_array, const std::shared_ptr& arrow_schema, int64_t write_batch_size, - bool enable_dictionary, int64_t max_row_group_length) const { + bool enable_dictionary, int64_t max_row_group_length, + int64_t max_page_size = 1024 * 1024 * 1024) const { ASSERT_OK_AND_ASSIGN(std::shared_ptr out, fs_->Create(file_path, /*overwrite=*/true)); ::parquet::WriterProperties::Builder builder; builder.write_batch_size(write_batch_size); builder.max_row_group_length(max_row_group_length); + builder.data_pagesize(max_page_size); + builder.enable_write_page_index(); enable_dictionary ? builder.enable_dictionary() : builder.disable_dictionary(); auto writer_properties = builder.build(); ASSERT_OK_AND_ASSIGN(auto format_writer, ParquetFormatWriter::Create( @@ -229,6 +232,31 @@ class ParquetFileBatchReaderTest : public ::testing::Test, std::shared_ptr struct_array_; }; +static std::shared_ptr MakeSequentialIntData(int32_t num_rows) { + arrow::Int32Builder val_builder; + EXPECT_TRUE(val_builder.Reserve(num_rows).ok()); + for (int32_t i = 0; i < num_rows; ++i) { + val_builder.UnsafeAppend(i); + } + auto val_array = val_builder.Finish().ValueOrDie(); + auto field = arrow::field("f0", arrow::int32()); + return arrow::StructArray::Make({val_array}, {field}).ValueOrDie(); +} + +static Result> CollectOneBatch(ParquetFileBatchReader* reader) { + PAIMON_ASSIGN_OR_RAISE(BatchReader::ReadBatch batch, reader->NextBatch()); + if (BatchReader::IsEofBatch(batch)) { + return std::shared_ptr(nullptr); + } + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr result_array, + arrow::ImportArray(batch.first.get(), batch.second.get())); + auto struct_array = std::dynamic_pointer_cast(result_array); + if (!struct_array) { + return Status::Invalid("CollectOneBatch expected StructArray"); + } + return struct_array; +} + TEST_F(ParquetFileBatchReaderTest, TestParquetMetadataCacheReusesSerializedFooter) { WriteArray(file_path_, struct_array_, schema_, /*write_batch_size=*/struct_array_->length(), /*enable_dictionary=*/false, @@ -447,11 +475,10 @@ TEST_F(ParquetFileBatchReaderTest, TestNextBatchSimple) { auto parquet_batch_reader = PrepareParquetFileBatchReader(file_name, schema_, /*predicate=*/nullptr, /*selection_bitmap=*/std::nullopt, batch_size); - ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFirstRowNumber().value(), + ASSERT_EQ(parquet_batch_reader->GetGlobalRowId(0).value(), std::numeric_limits::max()); ASSERT_OK_AND_ASSIGN(auto result_array, paimon::test::ReadResultCollector::CollectResult( parquet_batch_reader.get())); - ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFirstRowNumber().value(), 6); parquet_batch_reader->Close(); auto expected_array = std::make_shared(struct_array_); ASSERT_TRUE(result_array->Equals(expected_array)); @@ -812,19 +839,19 @@ TEST_F(ParquetFileBatchReaderTest, TestReadNoField) { PrepareParquetFileBatchReader(file_name, read_schema, /*predicate=*/nullptr, /*selection_bitmap=*/std::nullopt, /*batch_size=*/2); // read 2 rows - ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFirstRowNumber().value(), + ASSERT_EQ(parquet_batch_reader->GetGlobalRowId(0).value(), std::numeric_limits::max()); ASSERT_OK_AND_ASSIGN(auto batch1, parquet_batch_reader->NextBatch()); - ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFirstRowNumber().value(), 0); + ASSERT_EQ(parquet_batch_reader->GetGlobalRowId(0).value(), 0); // read 2 rows ASSERT_OK_AND_ASSIGN(auto batch2, parquet_batch_reader->NextBatch()); - ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFirstRowNumber().value(), 2); + ASSERT_EQ(parquet_batch_reader->GetGlobalRowId(0).value(), 2); // read 2 rows ASSERT_OK_AND_ASSIGN(auto batch3, parquet_batch_reader->NextBatch()); - ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFirstRowNumber().value(), 4); + ASSERT_EQ(parquet_batch_reader->GetGlobalRowId(0).value(), 4); // read rows with eof ASSERT_OK_AND_ASSIGN(auto batch4, parquet_batch_reader->NextBatch()); - ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFirstRowNumber().value(), 6); + ASSERT_EQ(parquet_batch_reader->GetGlobalRowId(0).value(), 4); ASSERT_TRUE(BatchReader::IsEofBatch(batch4)); parquet_batch_reader->Close(); @@ -1013,4 +1040,60 @@ TEST_F(ParquetFileBatchReaderTest, TestAddMetadataPerFieldMetadata) { ASSERT_TRUE(data->Equals(*result_array->chunk(0))) << result_array->ToString(); } +TEST_F(ParquetFileBatchReaderTest, TestRowMapping) { + arrow::FieldVector fields = {arrow::field("f0", arrow::int32())}; + auto src_array = MakeSequentialIntData(12); + // data in file rowGroup0:[0, 1, 2, 3, 4, 5] | rowGroup1:[6, 7, 8, 9, 10, 11] + // one row per page + auto arrow_schema = arrow::schema(fields); + WriteArray(file_path_, src_array, arrow_schema, /*write_batch_size=*/1, + /*enable_dictionary=*/true, /*max_row_group_length=*/12, /*max_page_size=*/1); + + // 1<=f0<=3 || 5<=f0<=6 + ASSERT_OK_AND_ASSIGN( + auto predicate, + PredicateBuilder::Or({PredicateBuilder::Between(/*field_index=*/0, /*field_name=*/"f0", + FieldType::INT, Literal(1), Literal(3)), + PredicateBuilder::Between(/*field_index=*/0, /*field_name=*/"f0", + FieldType::INT, Literal(5), Literal(6))})); + + auto parquet_batch_reader = PrepareParquetFileBatchReader( + file_path_, arrow_schema, /*predicate=*/predicate, std::nullopt, /*batch_size=*/2); + + ASSERT_EQ(parquet_batch_reader->GetGlobalRowId(0).value(), + std::numeric_limits::max()); + + ASSERT_OK_AND_ASSIGN(auto batch1, CollectOneBatch(parquet_batch_reader.get())); + auto expected_batch1 = src_array->Slice(1, 2); + ASSERT_TRUE(batch1->Equals(expected_batch1)) << batch1->ToString(); + ASSERT_EQ(parquet_batch_reader->GetGlobalRowId(0).value(), 1); + ASSERT_EQ(parquet_batch_reader->GetGlobalRowId(1).value(), 2); + + // Not adjacent pages + ASSERT_OK_AND_ASSIGN(auto batch2, CollectOneBatch(parquet_batch_reader.get())); + auto expected_batch2 = std::dynamic_pointer_cast( + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), R"([ +[3], +[5] + ])") + .ValueOrDie()); + ASSERT_TRUE(batch2->Equals(expected_batch2)) << batch2->ToString(); + ASSERT_EQ(parquet_batch_reader->GetGlobalRowId(0).value(), 3); + ASSERT_EQ(parquet_batch_reader->GetGlobalRowId(1).value(), 5); + + // Only one record read + ASSERT_OK_AND_ASSIGN(auto batch3, CollectOneBatch(parquet_batch_reader.get())); + auto expected_batch3 = src_array->Slice(6, 1); + ASSERT_TRUE(batch3->Equals(expected_batch3)) << batch3->ToString(); + ASSERT_EQ(parquet_batch_reader->GetGlobalRowId(0).value(), 6); + // out of bound, return max value + ASSERT_EQ(parquet_batch_reader->GetGlobalRowId(1).value(), + std::numeric_limits::max()); + + ASSERT_OK_AND_ASSIGN(auto eof_batch, CollectOneBatch(parquet_batch_reader.get())); + ASSERT_EQ(nullptr, eof_batch); + // previous batch is eof, return last none-eof batch's row id + ASSERT_EQ(parquet_batch_reader->GetGlobalRowId(0).value(), 6); +} + } // namespace paimon::parquet::test diff --git a/src/paimon/format/parquet/row_ranges.cpp b/src/paimon/format/parquet/row_ranges.cpp index 1b03715be..6e20780b8 100644 --- a/src/paimon/format/parquet/row_ranges.cpp +++ b/src/paimon/format/parquet/row_ranges.cpp @@ -104,6 +104,12 @@ void RowRanges::Add(const Range& range) { ranges_.insert(it, merged); } +void RowRanges::Union(const RowRanges& other) { + for (const auto& range : other.ranges_) { + Add(range); + } +} + std::optional RowRanges::MapFilteredIndexToOriginalRow(int64_t filtered_index) const { int64_t accumulated = 0; for (const auto& range : ranges_) { diff --git a/src/paimon/format/parquet/row_ranges.h b/src/paimon/format/parquet/row_ranges.h index 46c3f4d21..f6b41178f 100644 --- a/src/paimon/format/parquet/row_ranges.h +++ b/src/paimon/format/parquet/row_ranges.h @@ -92,6 +92,8 @@ class RowRanges { /// Adds a range to the end of the list, maintaining sorted disjoint ranges. void Add(const Range& range); + void Union(const RowRanges& other); + /// Maps a filtered-result index to the original row index within the row group. /// For example, if RowRanges = {[10,19], [50,59]}, then: /// MapFilteredIndexToOriginalRow(0) = 10 (first row of first range) diff --git a/src/paimon/testing/mock/mock_file_batch_reader.h b/src/paimon/testing/mock/mock_file_batch_reader.h index eb2bc1b59..6eda3b005 100644 --- a/src/paimon/testing/mock/mock_file_batch_reader.h +++ b/src/paimon/testing/mock/mock_file_batch_reader.h @@ -149,8 +149,8 @@ class MockFileBatchReader : public PrefetchFileBatchReader { return metrics; } - Result GetPreviousBatchFirstRowNumber() const override { - return previous_batch_first_row_num_; + Result GetGlobalRowId(uint64_t batch_row_id) const override { + return previous_batch_first_row_num_ + batch_row_id; } Result GetNumberOfRows() const override { From ad034986b2fcc665ca9142fc9b628d73deb000ec Mon Sep 17 00:00:00 2001 From: zhouhongfeng Date: Fri, 26 Jun 2026 10:44:26 +0800 Subject: [PATCH 02/27] style: change interface name --- .../shared_shredding_file_reader.cpp | 5 ++-- .../shredding/shared_shredding_file_reader.h | 2 +- .../bitmap/apply_bitmap_index_batch_reader.h | 6 ++-- .../reader/delegating_prefetch_reader.h | 4 +-- .../prefetch_file_batch_reader_impl.cpp | 6 ++-- .../reader/prefetch_file_batch_reader_impl.h | 2 +- .../prefetch_file_batch_reader_impl_test.cpp | 29 ++++++++++--------- .../apply_deletion_vector_batch_reader.h | 6 ++-- .../complete_row_tracking_fields_reader.cpp | 3 +- .../io/complete_row_tracking_fields_reader.h | 4 +-- src/paimon/core/io/field_mapping_reader.h | 4 +-- .../io/key_value_data_file_record_reader.cpp | 3 +- .../format/avro/avro_file_batch_reader.h | 2 +- .../avro/avro_file_batch_reader_test.cpp | 20 ++++++------- .../format/blob/blob_file_batch_reader.h | 4 +-- .../blob/blob_file_batch_reader_test.cpp | 12 ++++---- .../format/lance/lance_file_batch_reader.h | 4 +-- .../lance/lance_format_reader_writer_test.cpp | 12 ++++---- src/paimon/format/orc/orc_file_batch_reader.h | 2 +- .../format/orc/orc_file_batch_reader_test.cpp | 14 ++++----- .../parquet/parquet_file_batch_reader.h | 2 +- .../parquet_file_batch_reader_test.cpp | 28 +++++++++--------- .../testing/mock/mock_file_batch_reader.h | 2 +- 23 files changed, 91 insertions(+), 85 deletions(-) diff --git a/src/paimon/common/data/shredding/shared_shredding_file_reader.cpp b/src/paimon/common/data/shredding/shared_shredding_file_reader.cpp index c9c249caf..00fe9bd27 100644 --- a/src/paimon/common/data/shredding/shared_shredding_file_reader.cpp +++ b/src/paimon/common/data/shredding/shared_shredding_file_reader.cpp @@ -469,8 +469,9 @@ void SharedShreddingFileReader::Close() { reader_->Close(); } -Result SharedShreddingFileReader::GetGlobalRowId(uint64_t batch_row_id) const { - return reader_->GetGlobalRowId(batch_row_id); +Result SharedShreddingFileReader::GetPreviousBatchGlobalRowId( + uint64_t batch_row_id) const { + return reader_->GetPreviousBatchGlobalRowId(batch_row_id); } Result SharedShreddingFileReader::GetNumberOfRows() const { diff --git a/src/paimon/common/data/shredding/shared_shredding_file_reader.h b/src/paimon/common/data/shredding/shared_shredding_file_reader.h index 7998b7ab1..5003874aa 100644 --- a/src/paimon/common/data/shredding/shared_shredding_file_reader.h +++ b/src/paimon/common/data/shredding/shared_shredding_file_reader.h @@ -46,7 +46,7 @@ class SharedShreddingFileReader : public FileBatchReader { void Close() override; - Result GetGlobalRowId(uint64_t batch_row_id) const override; + Result GetPreviousBatchGlobalRowId(uint64_t batch_row_id) const override; Result GetNumberOfRows() const override; diff --git a/src/paimon/common/file_index/bitmap/apply_bitmap_index_batch_reader.h b/src/paimon/common/file_index/bitmap/apply_bitmap_index_batch_reader.h index 66b75bb3e..8213c4f2a 100644 --- a/src/paimon/common/file_index/bitmap/apply_bitmap_index_batch_reader.h +++ b/src/paimon/common/file_index/bitmap/apply_bitmap_index_batch_reader.h @@ -80,8 +80,8 @@ class ApplyBitmapIndexBatchReader : public FileBatchReader { return Status::Invalid("ApplyBitmapIndexBatchReader does not support SetReadSchema"); } - Result GetGlobalRowId(uint64_t batch_row_id) const override { - return reader_->GetGlobalRowId(batch_row_id); + Result GetPreviousBatchGlobalRowId(uint64_t batch_row_id) const override { + return reader_->GetPreviousBatchGlobalRowId(batch_row_id); } Result GetNumberOfRows() const override { @@ -96,7 +96,7 @@ class ApplyBitmapIndexBatchReader : public FileBatchReader { Result Filter(int32_t batch_size) const { RoaringBitmap32 is_valid; for (int32_t i = 0; i < batch_size; ++i) { - PAIMON_ASSIGN_OR_RAISE(uint64_t global_row_id, reader_->GetGlobalRowId(i)); + PAIMON_ASSIGN_OR_RAISE(uint64_t global_row_id, reader_->GetPreviousBatchGlobalRowId(i)); if (bitmap_.Contains(global_row_id)) { is_valid.Add(i); } diff --git a/src/paimon/common/reader/delegating_prefetch_reader.h b/src/paimon/common/reader/delegating_prefetch_reader.h index de92431cc..64d57a155 100644 --- a/src/paimon/common/reader/delegating_prefetch_reader.h +++ b/src/paimon/common/reader/delegating_prefetch_reader.h @@ -54,8 +54,8 @@ class DelegatingPrefetchReader : public FileBatchReader { return prefetch_reader_->SetReadSchema(read_schema, predicate, selection_bitmap); } - Result GetGlobalRowId(uint64_t batch_row_id) const override { - return GetReader()->GetGlobalRowId(batch_row_id); + Result GetPreviousBatchGlobalRowId(uint64_t batch_row_id) const override { + return GetReader()->GetPreviousBatchGlobalRowId(batch_row_id); } Result GetNumberOfRows() const override { diff --git a/src/paimon/common/reader/prefetch_file_batch_reader_impl.cpp b/src/paimon/common/reader/prefetch_file_batch_reader_impl.cpp index cf842583c..b779d75c5 100644 --- a/src/paimon/common/reader/prefetch_file_batch_reader_impl.cpp +++ b/src/paimon/common/reader/prefetch_file_batch_reader_impl.cpp @@ -431,7 +431,8 @@ Status PrefetchFileBatchReaderImpl::HandleReadResult( std::vector global_row_ids; global_row_ids.reserve(c_array->length); for (int64_t i = 0; i < c_array->length; ++i) { - PAIMON_ASSIGN_OR_RAISE(uint64_t global_row_id, readers_[reader_idx]->GetGlobalRowId(i)); + PAIMON_ASSIGN_OR_RAISE(uint64_t global_row_id, + readers_[reader_idx]->GetPreviousBatchGlobalRowId(i)); global_row_ids.push_back(global_row_id); } if (global_row_ids.empty()) { @@ -597,7 +598,8 @@ Result> PrefetchFileBatchReaderImpl::GetFileSchem return readers_[0]->GetFileSchema(); } -Result PrefetchFileBatchReaderImpl::GetGlobalRowId(uint64_t batch_row_id) const { +Result PrefetchFileBatchReaderImpl::GetPreviousBatchGlobalRowId( + uint64_t batch_row_id) const { if (batch_row_id >= current_batch_global_row_ids_.size()) { return std::numeric_limits::max(); } diff --git a/src/paimon/common/reader/prefetch_file_batch_reader_impl.h b/src/paimon/common/reader/prefetch_file_batch_reader_impl.h index 4d99dc843..908e06b85 100644 --- a/src/paimon/common/reader/prefetch_file_batch_reader_impl.h +++ b/src/paimon/common/reader/prefetch_file_batch_reader_impl.h @@ -76,7 +76,7 @@ class PrefetchFileBatchReaderImpl : public PrefetchFileBatchReader { const std::optional& selection_bitmap) override; Status SeekToRow(uint64_t row_number) override; - Result GetGlobalRowId(uint64_t batch_row_id) const override; + Result GetPreviousBatchGlobalRowId(uint64_t batch_row_id) const override; Result GetNumberOfRows() const override; uint64_t GetNextRowToRead() const override; void Close() override; diff --git a/src/paimon/common/reader/prefetch_file_batch_reader_impl_test.cpp b/src/paimon/common/reader/prefetch_file_batch_reader_impl_test.cpp index ef78927ca..cac4905d0 100644 --- a/src/paimon/common/reader/prefetch_file_batch_reader_impl_test.cpp +++ b/src/paimon/common/reader/prefetch_file_batch_reader_impl_test.cpp @@ -283,7 +283,8 @@ TEST_F(PrefetchFileBatchReaderImplTest, TestSimple) { /*enable_adaptive_prefetch_strategy=*/false, executor_, /*initialize_read_ranges=*/true, /*prefetch_cache_mode=*/PrefetchCacheMode::ALWAYS, CacheConfig(), GetDefaultPool())); - ASSERT_EQ(reader->GetGlobalRowId(0).value(), std::numeric_limits::max()); + ASSERT_EQ(reader->GetPreviousBatchGlobalRowId(0).value(), + std::numeric_limits::max()); ASSERT_OK_AND_ASSIGN(auto result_array, ReadResultCollector::CollectResult( reader.get(), /*max simulated data processing time*/ 100)); @@ -604,7 +605,7 @@ TEST_F(PrefetchFileBatchReaderImplTest, TestReadWithLargeBatchSize) { prefetch_max_parallel_num * 2, /*enable_adaptive_prefetch_strategy=*/false, executor_, /*initialize_read_ranges=*/true, /*prefetch_cache_mode=*/PrefetchCacheMode::ALWAYS, CacheConfig(), GetDefaultPool())); - ASSERT_EQ(reader->GetGlobalRowId(0).value(), std::numeric_limits::max()); + ASSERT_EQ(reader->GetPreviousBatchGlobalRowId(0).value(), std::numeric_limits::max()); ASSERT_OK_AND_ASSIGN(auto result_array, ReadResultCollector::CollectResult( reader.get(), /*max simulated data processing time*/ 100)); @@ -631,11 +632,11 @@ TEST_F(PrefetchFileBatchReaderImplTest, TestPartialReaderSuccessRead) { } arrow::ArrayVector result_array_vector; - ASSERT_EQ(reader->GetGlobalRowId(0).value(), std::numeric_limits::max()); + ASSERT_EQ(reader->GetPreviousBatchGlobalRowId(0).value(), std::numeric_limits::max()); ASSERT_OK_AND_ASSIGN(auto batch_with_bitmap, reader->NextBatchWithBitmap()); auto& [batch, bitmap] = batch_with_bitmap; ASSERT_EQ(batch.first->length, bitmap.Cardinality()); - ASSERT_EQ(reader->GetGlobalRowId(0).value(), 0); + ASSERT_EQ(reader->GetPreviousBatchGlobalRowId(0).value(), 0); ASSERT_OK_AND_ASSIGN(auto array, ReadResultCollector::GetArray(std::move(batch))); result_array_vector.push_back(array); ASSERT_OK(prefetch_reader->GetReadStatus()); @@ -676,9 +677,9 @@ TEST_F(PrefetchFileBatchReaderImplTest, TestAllReaderFailedWithIOError) { ->SetNextBatchStatus(Status::IOError("mock error")); } - ASSERT_EQ(reader->GetGlobalRowId(0).value(), std::numeric_limits::max()); + ASSERT_EQ(reader->GetPreviousBatchGlobalRowId(0).value(), std::numeric_limits::max()); auto batch_result = reader->NextBatchWithBitmap(); - ASSERT_EQ(reader->GetGlobalRowId(0).value(), std::numeric_limits::max()); + ASSERT_EQ(reader->GetPreviousBatchGlobalRowId(0).value(), std::numeric_limits::max()); ASSERT_FALSE(batch_result.ok()); ASSERT_TRUE(batch_result.status().IsIOError()); ASSERT_FALSE(prefetch_reader->is_shutdown_); @@ -687,7 +688,7 @@ TEST_F(PrefetchFileBatchReaderImplTest, TestAllReaderFailedWithIOError) { // call NextBatch again, will still return error status auto batch_result2 = reader->NextBatchWithBitmap(); - ASSERT_EQ(reader->GetGlobalRowId(0).value(), std::numeric_limits::max()); + ASSERT_EQ(reader->GetPreviousBatchGlobalRowId(0).value(), std::numeric_limits::max()); ASSERT_FALSE(batch_result2.ok()); ASSERT_TRUE(batch_result2.status().IsIOError()); } @@ -704,11 +705,11 @@ TEST_F(PrefetchFileBatchReaderImplTest, TestPrefetchWithEmptyData) { prefetch_max_parallel_num * 2, /*enable_adaptive_prefetch_strategy=*/false, executor_, /*initialize_read_ranges=*/true, /*prefetch_cache_mode=*/PrefetchCacheMode::ALWAYS, CacheConfig(), GetDefaultPool())); - ASSERT_EQ(reader->GetGlobalRowId(0).value(), std::numeric_limits::max()); + ASSERT_EQ(reader->GetPreviousBatchGlobalRowId(0).value(), std::numeric_limits::max()); ASSERT_OK_AND_ASSIGN(auto result_array, ReadResultCollector::CollectResult( reader.get(), /*max simulated data processing time*/ 100)); - ASSERT_EQ(reader->GetGlobalRowId(0).value(), std::numeric_limits::max()); + ASSERT_EQ(reader->GetPreviousBatchGlobalRowId(0).value(), std::numeric_limits::max()); ASSERT_FALSE(result_array); } @@ -724,7 +725,7 @@ TEST_F(PrefetchFileBatchReaderImplTest, TestCallNextBatchAfterReadingEof) { prefetch_max_parallel_num * 2, /*enable_adaptive_prefetch_strategy=*/false, executor_, /*initialize_read_ranges=*/true, /*prefetch_cache_mode=*/PrefetchCacheMode::ALWAYS, CacheConfig(), GetDefaultPool())); - ASSERT_EQ(reader->GetGlobalRowId(0).value(), std::numeric_limits::max()); + ASSERT_EQ(reader->GetPreviousBatchGlobalRowId(0).value(), std::numeric_limits::max()); ASSERT_OK_AND_ASSIGN(auto result_array, ReadResultCollector::CollectResult( reader.get(), /*max simulated data processing time*/ 100)); @@ -829,11 +830,11 @@ TEST_P(PrefetchFileBatchReaderImplTest, TestPrefetchWithPredicatePushdownWithCom PreparePrefetchReader(file_format, schema.get(), predicate, /*selection_bitmap=*/std::nullopt, /*batch_size=*/10, /*prefetch_max_parallel_num=*/3, cache_mode); - ASSERT_EQ(reader->GetGlobalRowId(0).value(), std::numeric_limits::max()); + ASSERT_EQ(reader->GetPreviousBatchGlobalRowId(0).value(), std::numeric_limits::max()); ASSERT_OK_AND_ASSIGN(auto result_array, ReadResultCollector::CollectResult( reader.get(), /*max simulated data processing time*/ 100)); - ASSERT_EQ(reader->GetGlobalRowId(0).value(), 80); + ASSERT_EQ(reader->GetPreviousBatchGlobalRowId(0).value(), 80); arrow::ArrayVector expected_array_vector; expected_array_vector.push_back(data_array->Slice(0, 30)); @@ -865,11 +866,11 @@ TEST_P(PrefetchFileBatchReaderImplTest, /*selection_bitmap=*/std::nullopt, /*batch_size=*/10, /*prefetch_max_parallel_num=*/3, cache_mode); ASSERT_OK(reader->RefreshReadRanges()); - ASSERT_EQ(reader->GetGlobalRowId(0).value(), std::numeric_limits::max()); + ASSERT_EQ(reader->GetPreviousBatchGlobalRowId(0).value(), std::numeric_limits::max()); ASSERT_OK_AND_ASSIGN(auto result_array, ReadResultCollector::CollectResult( reader.get(), /*max simulated data processing time*/ 100)); - ASSERT_EQ(reader->GetGlobalRowId(0).value(), 80); + ASSERT_EQ(reader->GetPreviousBatchGlobalRowId(0).value(), 80); arrow::ArrayVector expected_array_vector; expected_array_vector.push_back(data_array->Slice(0, 20)); diff --git a/src/paimon/core/deletionvectors/apply_deletion_vector_batch_reader.h b/src/paimon/core/deletionvectors/apply_deletion_vector_batch_reader.h index f6d824a76..cb2867075 100644 --- a/src/paimon/core/deletionvectors/apply_deletion_vector_batch_reader.h +++ b/src/paimon/core/deletionvectors/apply_deletion_vector_batch_reader.h @@ -82,8 +82,8 @@ class ApplyDeletionVectorBatchReader : public FileBatchReader { return Status::Invalid("ApplyDeletionVectorBatchReader does not support SetReadSchema"); } - Result GetGlobalRowId(uint64_t batch_row_id) const override { - return reader_->GetGlobalRowId(batch_row_id); + Result GetPreviousBatchGlobalRowId(uint64_t batch_row_id) const override { + return reader_->GetPreviousBatchGlobalRowId(batch_row_id); } Result GetNumberOfRows() const override { @@ -98,7 +98,7 @@ class ApplyDeletionVectorBatchReader : public FileBatchReader { Result Filter(int32_t batch_size) const { RoaringBitmap32 is_valid; for (int32_t i = 0; i < batch_size; ++i) { - PAIMON_ASSIGN_OR_RAISE(uint64_t global_row_id, reader_->GetGlobalRowId(i)); + PAIMON_ASSIGN_OR_RAISE(uint64_t global_row_id, reader_->GetPreviousBatchGlobalRowId(i)); PAIMON_ASSIGN_OR_RAISE(bool is_deleted, deletion_vector_->IsDeleted(global_row_id)); if (!is_deleted) { is_valid.Add(i); diff --git a/src/paimon/core/io/complete_row_tracking_fields_reader.cpp b/src/paimon/core/io/complete_row_tracking_fields_reader.cpp index 8ed53615a..54e55b380 100644 --- a/src/paimon/core/io/complete_row_tracking_fields_reader.cpp +++ b/src/paimon/core/io/complete_row_tracking_fields_reader.cpp @@ -91,7 +91,8 @@ CompleteRowTrackingFieldsBatchReader::NextBatchWithBitmap() { return Status::Invalid( "unexpected: read _ROW_ID special field, but first row id is null in meta"); } - PAIMON_ASSIGN_OR_RAISE(uint64_t global_row_id, reader_->GetGlobalRowId(idx_in_array)); + PAIMON_ASSIGN_OR_RAISE(uint64_t global_row_id, + reader_->GetPreviousBatchGlobalRowId(idx_in_array)); return first_row_id_.value() + global_row_id; }; PAIMON_RETURN_NOT_OK(ConvertRowTrackingField(src_struct_array->length(), /*init_value=*/0, diff --git a/src/paimon/core/io/complete_row_tracking_fields_reader.h b/src/paimon/core/io/complete_row_tracking_fields_reader.h index 6f49fea90..b3a42cb69 100644 --- a/src/paimon/core/io/complete_row_tracking_fields_reader.h +++ b/src/paimon/core/io/complete_row_tracking_fields_reader.h @@ -60,8 +60,8 @@ class CompleteRowTrackingFieldsBatchReader : public FileBatchReader { reader_->Close(); } - Result GetGlobalRowId(uint64_t batch_row_id) const override { - return reader_->GetGlobalRowId(batch_row_id); + Result GetPreviousBatchGlobalRowId(uint64_t batch_row_id) const override { + return reader_->GetPreviousBatchGlobalRowId(batch_row_id); } Result GetNumberOfRows() const override { diff --git a/src/paimon/core/io/field_mapping_reader.h b/src/paimon/core/io/field_mapping_reader.h index 39c02dcaf..5a75ba440 100644 --- a/src/paimon/core/io/field_mapping_reader.h +++ b/src/paimon/core/io/field_mapping_reader.h @@ -74,8 +74,8 @@ class FieldMappingReader : public FileBatchReader { return Status::Invalid("FieldMappingReader does not support SetReadSchema"); } - Result GetGlobalRowId(uint64_t batch_row_id) const override { - return reader_->GetGlobalRowId(batch_row_id); + Result GetPreviousBatchGlobalRowId(uint64_t batch_row_id) const override { + return reader_->GetPreviousBatchGlobalRowId(batch_row_id); } Result GetNumberOfRows() const override { diff --git a/src/paimon/core/io/key_value_data_file_record_reader.cpp b/src/paimon/core/io/key_value_data_file_record_reader.cpp index ea5deb5dd..9ccc71486 100644 --- a/src/paimon/core/io/key_value_data_file_record_reader.cpp +++ b/src/paimon/core/io/key_value_data_file_record_reader.cpp @@ -81,7 +81,8 @@ Result KeyValueDataFileRecordReader::Iterator::Next() { Result> KeyValueDataFileRecordReader::Iterator::NextWithFilePos() { PAIMON_ASSIGN_OR_RAISE(KeyValue kv, Next()); - PAIMON_ASSIGN_OR_RAISE(uint64_t global_row_id, reader_->reader_->GetGlobalRowId(cursor_ - 1)); + PAIMON_ASSIGN_OR_RAISE(uint64_t global_row_id, + reader_->reader_->GetPreviousBatchGlobalRowId(cursor_ - 1)); return std::make_pair(static_cast(global_row_id), std::move(kv)); } diff --git a/src/paimon/format/avro/avro_file_batch_reader.h b/src/paimon/format/avro/avro_file_batch_reader.h index 355e849a6..463264f39 100644 --- a/src/paimon/format/avro/avro_file_batch_reader.h +++ b/src/paimon/format/avro/avro_file_batch_reader.h @@ -45,7 +45,7 @@ class AvroFileBatchReader : public FileBatchReader { Status SetReadSchema(::ArrowSchema* read_schema, const std::shared_ptr& predicate, const std::optional& selection_bitmap) override; - Result GetGlobalRowId(uint64_t batch_row_id) const override { + Result GetPreviousBatchGlobalRowId(uint64_t batch_row_id) const override { return previous_first_row_ + batch_row_id; } diff --git a/src/paimon/format/avro/avro_file_batch_reader_test.cpp b/src/paimon/format/avro/avro_file_batch_reader_test.cpp index bd2b1e1fd..c3970bf33 100644 --- a/src/paimon/format/avro/avro_file_batch_reader_test.cpp +++ b/src/paimon/format/avro/avro_file_batch_reader_test.cpp @@ -327,7 +327,7 @@ TEST_F(AvroFileBatchReaderTest, TestSetReadSchemaRejectNestedSubFieldProjection) "does not support nested sub-field projection"); } -TEST_F(AvroFileBatchReaderTest, TestGetGlobalRowId) { +TEST_F(AvroFileBatchReaderTest, TestGetPreviousBatchGlobalRowId) { std::string path = paimon::test::GetDataDir() + "/avro/append_simple.db/" "append_simple/bucket-0/" @@ -352,25 +352,25 @@ TEST_F(AvroFileBatchReaderTest, TestGetGlobalRowId) { ASSERT_OK_AND_ASSIGN(auto num_rows, reader->GetNumberOfRows()); ASSERT_EQ(4, num_rows); - ASSERT_EQ(std::numeric_limits::max(), reader->GetGlobalRowId(0).value()); + ASSERT_EQ(std::numeric_limits::max(), reader->GetPreviousBatchGlobalRowId(0).value()); ASSERT_OK_AND_ASSIGN(auto batch1, reader->NextBatch()); ArrowArrayRelease(batch1.first.get()); ArrowSchemaRelease(batch1.second.get()); - ASSERT_EQ(0, reader->GetGlobalRowId(0).value()); + ASSERT_EQ(0, reader->GetPreviousBatchGlobalRowId(0).value()); ASSERT_OK_AND_ASSIGN(auto batch2, reader->NextBatch()); - ASSERT_EQ(1, reader->GetGlobalRowId(0).value()); + ASSERT_EQ(1, reader->GetPreviousBatchGlobalRowId(0).value()); ArrowArrayRelease(batch2.first.get()); ArrowSchemaRelease(batch2.second.get()); ASSERT_OK_AND_ASSIGN(auto batch3, reader->NextBatch()); - ASSERT_EQ(2, reader->GetGlobalRowId(0).value()); + ASSERT_EQ(2, reader->GetPreviousBatchGlobalRowId(0).value()); ArrowArrayRelease(batch3.first.get()); ArrowSchemaRelease(batch3.second.get()); ASSERT_OK_AND_ASSIGN(auto batch4, reader->NextBatch()); - ASSERT_EQ(3, reader->GetGlobalRowId(0).value()); + ASSERT_EQ(3, reader->GetPreviousBatchGlobalRowId(0).value()); ArrowArrayRelease(batch4.first.get()); ArrowSchemaRelease(batch4.second.get()); ASSERT_OK_AND_ASSIGN(auto batch5, reader->NextBatch()); - ASSERT_EQ(4, reader->GetGlobalRowId(0).value()); + ASSERT_EQ(4, reader->GetPreviousBatchGlobalRowId(0).value()); ASSERT_TRUE(BatchReader::IsEofBatch(batch5)); } @@ -396,7 +396,7 @@ TEST_F(AvroFileBatchReaderTest, TestSetReadSchemaResetsReaderToFirstRow) { ASSERT_OK_AND_ASSIGN(auto reader, reader_builder->Build(in)); ASSERT_OK_AND_ASSIGN(auto first_batch, reader->NextBatch()); - ASSERT_EQ(0, reader->GetGlobalRowId(0).value()); + ASSERT_EQ(0, reader->GetPreviousBatchGlobalRowId(0).value()); auto first_array = arrow::ImportArray(first_batch.first.get(), first_batch.second.get()).ValueOrDie(); ASSERT_TRUE(first_array->Equals(src_array->Slice(0, 2))) << first_array->ToString(); @@ -406,10 +406,10 @@ TEST_F(AvroFileBatchReaderTest, TestSetReadSchemaResetsReaderToFirstRow) { ASSERT_TRUE(arrow::ExportSchema(*read_schema, c_schema.get()).ok()); ASSERT_OK(reader->SetReadSchema(c_schema.get(), /*predicate=*/nullptr, /*selection_bitmap=*/std::nullopt)); - ASSERT_EQ(std::numeric_limits::max(), reader->GetGlobalRowId(0).value()); + ASSERT_EQ(std::numeric_limits::max(), reader->GetPreviousBatchGlobalRowId(0).value()); ASSERT_OK_AND_ASSIGN(auto projected_batch, reader->NextBatch()); - ASSERT_EQ(0, reader->GetGlobalRowId(0).value()); + ASSERT_EQ(0, reader->GetPreviousBatchGlobalRowId(0).value()); auto projected_array = arrow::ImportArray(projected_batch.first.get(), projected_batch.second.get()).ValueOrDie(); auto expected_projected_array = arrow::ipc::internal::json::ArrayFromJSON( diff --git a/src/paimon/format/blob/blob_file_batch_reader.h b/src/paimon/format/blob/blob_file_batch_reader.h index 0c04b43f3..15d6c8075 100644 --- a/src/paimon/format/blob/blob_file_batch_reader.h +++ b/src/paimon/format/blob/blob_file_batch_reader.h @@ -97,10 +97,10 @@ class BlobFileBatchReader : public FileBatchReader { Result NextBatch() override; - Result GetGlobalRowId(uint64_t batch_row_id) const override { + Result GetPreviousBatchGlobalRowId(uint64_t batch_row_id) const override { if (all_blob_lengths_.size() != target_blob_lengths_.size()) { return Status::Invalid( - "Cannot call GetGlobalRowId in BlobFileBatchReader because, after " + "Cannot call GetPreviousBatchGlobalRowId in BlobFileBatchReader because, after " "bitmap pushdown, rows in the array returned by NextBatch are no longer " "contiguous."); } diff --git a/src/paimon/format/blob/blob_file_batch_reader_test.cpp b/src/paimon/format/blob/blob_file_batch_reader_test.cpp index f3687eb5e..afee869b5 100644 --- a/src/paimon/format/blob/blob_file_batch_reader_test.cpp +++ b/src/paimon/format/blob/blob_file_batch_reader_test.cpp @@ -169,21 +169,21 @@ TEST_F(BlobFileBatchReaderTest, TestRowNumbers) { ASSERT_OK(reader->SetReadSchema(&c_schema, nullptr, std::nullopt)); ASSERT_OK_AND_ASSIGN(auto number_of_rows, reader->GetNumberOfRows()); ASSERT_EQ(3, number_of_rows); - ASSERT_EQ(std::numeric_limits::max(), reader->GetGlobalRowId(0).value()); + ASSERT_EQ(std::numeric_limits::max(), reader->GetPreviousBatchGlobalRowId(0).value()); ASSERT_OK_AND_ASSIGN(auto batch1, reader->NextBatch()); ArrowArrayRelease(batch1.first.get()); ArrowSchemaRelease(batch1.second.get()); - ASSERT_EQ(0, reader->GetGlobalRowId(0).value()); + ASSERT_EQ(0, reader->GetPreviousBatchGlobalRowId(0).value()); ASSERT_OK_AND_ASSIGN(auto batch2, reader->NextBatch()); - ASSERT_EQ(1, reader->GetGlobalRowId(0).value()); + ASSERT_EQ(1, reader->GetPreviousBatchGlobalRowId(0).value()); ArrowArrayRelease(batch2.first.get()); ArrowSchemaRelease(batch2.second.get()); ASSERT_OK_AND_ASSIGN(auto batch3, reader->NextBatch()); - ASSERT_EQ(2, reader->GetGlobalRowId(0).value()); + ASSERT_EQ(2, reader->GetPreviousBatchGlobalRowId(0).value()); ArrowArrayRelease(batch3.first.get()); ArrowSchemaRelease(batch3.second.get()); ASSERT_OK_AND_ASSIGN(auto batch4, reader->NextBatch()); - ASSERT_EQ(3, reader->GetGlobalRowId(0).value()); + ASSERT_EQ(3, reader->GetPreviousBatchGlobalRowId(0).value()); ASSERT_TRUE(BatchReader::IsEofBatch(batch4)); } @@ -254,7 +254,7 @@ TEST_P(BlobFileBatchReaderTest, EmptyFile) { ASSERT_OK(reader->SetReadSchema(&c_schema, nullptr, std::nullopt)); ASSERT_OK_AND_ASSIGN(auto number_of_rows, reader->GetNumberOfRows()); ASSERT_EQ(0, number_of_rows); - ASSERT_EQ(std::numeric_limits::max(), reader->GetGlobalRowId(0).value()); + ASSERT_EQ(std::numeric_limits::max(), reader->GetPreviousBatchGlobalRowId(0).value()); ASSERT_OK_AND_ASSIGN(auto batch, reader->NextBatch()); ASSERT_TRUE(BatchReader::IsEofBatch(batch)); } diff --git a/src/paimon/format/lance/lance_file_batch_reader.h b/src/paimon/format/lance/lance_file_batch_reader.h index 361561411..ba8e16cf2 100644 --- a/src/paimon/format/lance/lance_file_batch_reader.h +++ b/src/paimon/format/lance/lance_file_batch_reader.h @@ -41,11 +41,11 @@ class LanceFileBatchReader : public FileBatchReader { Result NextBatch() override; - Result GetGlobalRowId(uint64_t batch_row_id) const override { + Result GetPreviousBatchGlobalRowId(uint64_t batch_row_id) const override { if (!read_row_ids_.empty() && read_row_ids_.size() != num_rows_) { // TODO(xinyu.lxy): support function return Status::Invalid( - "Cannot call GetGlobalRowId in LanceFileBatchReader because, after " + "Cannot call GetPreviousBatchGlobalRowId in LanceFileBatchReader because, after " "bitmap pushdown, rows in the array returned by NextBatch are no longer " "contiguous."); } diff --git a/src/paimon/format/lance/lance_format_reader_writer_test.cpp b/src/paimon/format/lance/lance_format_reader_writer_test.cpp index eab303905..920cd4b0e 100644 --- a/src/paimon/format/lance/lance_format_reader_writer_test.cpp +++ b/src/paimon/format/lance/lance_format_reader_writer_test.cpp @@ -478,26 +478,26 @@ TEST_F(LanceFileReaderWriterTest, TestPreviousBatchFirstRowNumber) { ASSERT_OK_AND_ASSIGN( std::unique_ptr reader, LanceFileBatchReader::Create(file_path, /*batch_size=*/4, /*batch_readahead=*/2)); - ASSERT_EQ(std::numeric_limits::max(), reader->GetGlobalRowId(0).value()); + ASSERT_EQ(std::numeric_limits::max(), reader->GetPreviousBatchGlobalRowId(0).value()); // first batch row 0-3 ASSERT_OK_AND_ASSIGN(auto read_batch, reader->NextBatch()); ASSERT_OK_AND_ASSIGN(auto read_array, paimon::test::ReadResultCollector::GetArray(std::move(read_batch))); ASSERT_TRUE(read_array->Equals(array->Slice(0, 4))); - ASSERT_EQ(0, reader->GetGlobalRowId(0).value()); + ASSERT_EQ(0, reader->GetPreviousBatchGlobalRowId(0).value()); // second batch 4-5 ASSERT_OK_AND_ASSIGN(read_batch, reader->NextBatch()); ASSERT_OK_AND_ASSIGN(read_array, paimon::test::ReadResultCollector::GetArray(std::move(read_batch))); ASSERT_TRUE(read_array->Equals(array->Slice(4, 2))); - ASSERT_EQ(4, reader->GetGlobalRowId(0).value()); + ASSERT_EQ(4, reader->GetPreviousBatchGlobalRowId(0).value()); // eof ASSERT_OK_AND_ASSIGN(read_batch, reader->NextBatch()); ASSERT_TRUE(BatchReader::IsEofBatch(read_batch)); - ASSERT_EQ(6, reader->GetGlobalRowId(0).value()); + ASSERT_EQ(6, reader->GetPreviousBatchGlobalRowId(0).value()); // test with bitmap pushdown ArrowSchema c_read_schema; @@ -505,8 +505,8 @@ TEST_F(LanceFileReaderWriterTest, TestPreviousBatchFirstRowNumber) { ASSERT_OK(reader->SetReadSchema(&c_read_schema, /*predicate=*/nullptr, /*selection_bitmap=*/RoaringBitmap32::From({0, 3}))); ASSERT_NOK_WITH_MSG( - reader->GetGlobalRowId(0), - "Cannot call GetGlobalRowId in LanceFileBatchReader because, after bitmap " + reader->GetPreviousBatchGlobalRowId(0), + "Cannot call GetPreviousBatchGlobalRowId in LanceFileBatchReader because, after bitmap " "pushdown, rows in the array returned by NextBatch are no longer contiguous."); } } // namespace paimon::lance::test diff --git a/src/paimon/format/orc/orc_file_batch_reader.h b/src/paimon/format/orc/orc_file_batch_reader.h index 0e3fade23..3cd870db3 100644 --- a/src/paimon/format/orc/orc_file_batch_reader.h +++ b/src/paimon/format/orc/orc_file_batch_reader.h @@ -62,7 +62,7 @@ class OrcFileBatchReader : public PrefetchFileBatchReader { // OrcFileBatchReader. Therefore, we need to hold BatchReader when using output ArrowArray. Result NextBatch() override; - Result GetGlobalRowId(uint64_t batch_row_id) const override { + Result GetPreviousBatchGlobalRowId(uint64_t batch_row_id) const override { return reader_->GetRowNumber() + batch_row_id; } diff --git a/src/paimon/format/orc/orc_file_batch_reader_test.cpp b/src/paimon/format/orc/orc_file_batch_reader_test.cpp index 57665fa0d..aef2ccbba 100644 --- a/src/paimon/format/orc/orc_file_batch_reader_test.cpp +++ b/src/paimon/format/orc/orc_file_batch_reader_test.cpp @@ -492,10 +492,10 @@ TEST_P(OrcFileBatchReaderTest, TestNextBatchSimple) { for (auto batch_size : {1, 2, 3, 5, 8, 10}) { auto orc_batch_reader = PrepareOrcFileBatchReader(file_name, &read_schema, batch_size, natural_read_size); - ASSERT_EQ(orc_batch_reader->GetGlobalRowId(0).value(), -1); + ASSERT_EQ(orc_batch_reader->GetPreviousBatchGlobalRowId(0).value(), -1); ASSERT_OK_AND_ASSIGN(auto result_array, paimon::test::ReadResultCollector::CollectResult( orc_batch_reader.get())); - ASSERT_EQ(orc_batch_reader->GetGlobalRowId(0).value(), 8); + ASSERT_EQ(orc_batch_reader->GetPreviousBatchGlobalRowId(0).value(), 8); orc_batch_reader->Close(); auto expected_array = std::make_shared(struct_array_); ASSERT_TRUE(result_array->Equals(expected_array)); @@ -766,18 +766,18 @@ TEST_F(OrcFileBatchReaderTest, TestReadNoField) { auto orc_batch_reader = PrepareOrcFileBatchReader(file_name, &read_schema, /*batch_size=*/3, /*natural_read_size=*/10); // read 3 rows - ASSERT_EQ(orc_batch_reader->GetGlobalRowId(0).value(), -1); + ASSERT_EQ(orc_batch_reader->GetPreviousBatchGlobalRowId(0).value(), -1); ASSERT_OK_AND_ASSIGN(auto batch1, orc_batch_reader->NextBatch()); - ASSERT_EQ(orc_batch_reader->GetGlobalRowId(0).value(), 0); + ASSERT_EQ(orc_batch_reader->GetPreviousBatchGlobalRowId(0).value(), 0); // read 3 rows ASSERT_OK_AND_ASSIGN(auto batch2, orc_batch_reader->NextBatch()); - ASSERT_EQ(orc_batch_reader->GetGlobalRowId(0).value(), 3); + ASSERT_EQ(orc_batch_reader->GetPreviousBatchGlobalRowId(0).value(), 3); // read 2 rows ASSERT_OK_AND_ASSIGN(auto batch3, orc_batch_reader->NextBatch()); - ASSERT_EQ(orc_batch_reader->GetGlobalRowId(0).value(), 6); + ASSERT_EQ(orc_batch_reader->GetPreviousBatchGlobalRowId(0).value(), 6); // read rows with eof ASSERT_OK_AND_ASSIGN(auto batch4, orc_batch_reader->NextBatch()); - ASSERT_EQ(orc_batch_reader->GetGlobalRowId(0).value(), 8); + ASSERT_EQ(orc_batch_reader->GetPreviousBatchGlobalRowId(0).value(), 8); ASSERT_TRUE(BatchReader::IsEofBatch(batch4)); orc_batch_reader->Close(); diff --git a/src/paimon/format/parquet/parquet_file_batch_reader.h b/src/paimon/format/parquet/parquet_file_batch_reader.h index c89669632..8dde7e6a3 100644 --- a/src/paimon/format/parquet/parquet_file_batch_reader.h +++ b/src/paimon/format/parquet/parquet_file_batch_reader.h @@ -96,7 +96,7 @@ class ParquetFileBatchReader : public PrefetchFileBatchReader { Result>> GenReadRanges( bool* need_prefetch) const override; - Result GetGlobalRowId(uint64_t batch_row_id) const override { + Result GetPreviousBatchGlobalRowId(uint64_t batch_row_id) const override { if (batch_row_id >= row_mapping_.size()) { return std::numeric_limits::max(); } diff --git a/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp b/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp index 4699fdff0..c92cfc196 100644 --- a/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp +++ b/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp @@ -475,7 +475,7 @@ TEST_F(ParquetFileBatchReaderTest, TestNextBatchSimple) { auto parquet_batch_reader = PrepareParquetFileBatchReader(file_name, schema_, /*predicate=*/nullptr, /*selection_bitmap=*/std::nullopt, batch_size); - ASSERT_EQ(parquet_batch_reader->GetGlobalRowId(0).value(), + ASSERT_EQ(parquet_batch_reader->GetPreviousBatchGlobalRowId(0).value(), std::numeric_limits::max()); ASSERT_OK_AND_ASSIGN(auto result_array, paimon::test::ReadResultCollector::CollectResult( parquet_batch_reader.get())); @@ -839,19 +839,19 @@ TEST_F(ParquetFileBatchReaderTest, TestReadNoField) { PrepareParquetFileBatchReader(file_name, read_schema, /*predicate=*/nullptr, /*selection_bitmap=*/std::nullopt, /*batch_size=*/2); // read 2 rows - ASSERT_EQ(parquet_batch_reader->GetGlobalRowId(0).value(), + ASSERT_EQ(parquet_batch_reader->GetPreviousBatchGlobalRowId(0).value(), std::numeric_limits::max()); ASSERT_OK_AND_ASSIGN(auto batch1, parquet_batch_reader->NextBatch()); - ASSERT_EQ(parquet_batch_reader->GetGlobalRowId(0).value(), 0); + ASSERT_EQ(parquet_batch_reader->GetPreviousBatchGlobalRowId(0).value(), 0); // read 2 rows ASSERT_OK_AND_ASSIGN(auto batch2, parquet_batch_reader->NextBatch()); - ASSERT_EQ(parquet_batch_reader->GetGlobalRowId(0).value(), 2); + ASSERT_EQ(parquet_batch_reader->GetPreviousBatchGlobalRowId(0).value(), 2); // read 2 rows ASSERT_OK_AND_ASSIGN(auto batch3, parquet_batch_reader->NextBatch()); - ASSERT_EQ(parquet_batch_reader->GetGlobalRowId(0).value(), 4); + ASSERT_EQ(parquet_batch_reader->GetPreviousBatchGlobalRowId(0).value(), 4); // read rows with eof ASSERT_OK_AND_ASSIGN(auto batch4, parquet_batch_reader->NextBatch()); - ASSERT_EQ(parquet_batch_reader->GetGlobalRowId(0).value(), 4); + ASSERT_EQ(parquet_batch_reader->GetPreviousBatchGlobalRowId(0).value(), 4); ASSERT_TRUE(BatchReader::IsEofBatch(batch4)); parquet_batch_reader->Close(); @@ -1060,14 +1060,14 @@ TEST_F(ParquetFileBatchReaderTest, TestRowMapping) { auto parquet_batch_reader = PrepareParquetFileBatchReader( file_path_, arrow_schema, /*predicate=*/predicate, std::nullopt, /*batch_size=*/2); - ASSERT_EQ(parquet_batch_reader->GetGlobalRowId(0).value(), + ASSERT_EQ(parquet_batch_reader->GetPreviousBatchGlobalRowId(0).value(), std::numeric_limits::max()); ASSERT_OK_AND_ASSIGN(auto batch1, CollectOneBatch(parquet_batch_reader.get())); auto expected_batch1 = src_array->Slice(1, 2); ASSERT_TRUE(batch1->Equals(expected_batch1)) << batch1->ToString(); - ASSERT_EQ(parquet_batch_reader->GetGlobalRowId(0).value(), 1); - ASSERT_EQ(parquet_batch_reader->GetGlobalRowId(1).value(), 2); + ASSERT_EQ(parquet_batch_reader->GetPreviousBatchGlobalRowId(0).value(), 1); + ASSERT_EQ(parquet_batch_reader->GetPreviousBatchGlobalRowId(1).value(), 2); // Not adjacent pages ASSERT_OK_AND_ASSIGN(auto batch2, CollectOneBatch(parquet_batch_reader.get())); @@ -1078,22 +1078,22 @@ TEST_F(ParquetFileBatchReaderTest, TestRowMapping) { ])") .ValueOrDie()); ASSERT_TRUE(batch2->Equals(expected_batch2)) << batch2->ToString(); - ASSERT_EQ(parquet_batch_reader->GetGlobalRowId(0).value(), 3); - ASSERT_EQ(parquet_batch_reader->GetGlobalRowId(1).value(), 5); + ASSERT_EQ(parquet_batch_reader->GetPreviousBatchGlobalRowId(0).value(), 3); + ASSERT_EQ(parquet_batch_reader->GetPreviousBatchGlobalRowId(1).value(), 5); // Only one record read ASSERT_OK_AND_ASSIGN(auto batch3, CollectOneBatch(parquet_batch_reader.get())); auto expected_batch3 = src_array->Slice(6, 1); ASSERT_TRUE(batch3->Equals(expected_batch3)) << batch3->ToString(); - ASSERT_EQ(parquet_batch_reader->GetGlobalRowId(0).value(), 6); + ASSERT_EQ(parquet_batch_reader->GetPreviousBatchGlobalRowId(0).value(), 6); // out of bound, return max value - ASSERT_EQ(parquet_batch_reader->GetGlobalRowId(1).value(), + ASSERT_EQ(parquet_batch_reader->GetPreviousBatchGlobalRowId(1).value(), std::numeric_limits::max()); ASSERT_OK_AND_ASSIGN(auto eof_batch, CollectOneBatch(parquet_batch_reader.get())); ASSERT_EQ(nullptr, eof_batch); // previous batch is eof, return last none-eof batch's row id - ASSERT_EQ(parquet_batch_reader->GetGlobalRowId(0).value(), 6); + ASSERT_EQ(parquet_batch_reader->GetPreviousBatchGlobalRowId(0).value(), 6); } } // namespace paimon::parquet::test diff --git a/src/paimon/testing/mock/mock_file_batch_reader.h b/src/paimon/testing/mock/mock_file_batch_reader.h index 6eda3b005..6ef6428e1 100644 --- a/src/paimon/testing/mock/mock_file_batch_reader.h +++ b/src/paimon/testing/mock/mock_file_batch_reader.h @@ -149,7 +149,7 @@ class MockFileBatchReader : public PrefetchFileBatchReader { return metrics; } - Result GetGlobalRowId(uint64_t batch_row_id) const override { + Result GetPreviousBatchGlobalRowId(uint64_t batch_row_id) const override { return previous_batch_first_row_num_ + batch_row_id; } From cd7bd44ab39566537719dbd6d608a1a9c728fcce Mon Sep 17 00:00:00 2001 From: zhouhongfeng Date: Fri, 26 Jun 2026 10:46:29 +0800 Subject: [PATCH 03/27] update header files --- include/paimon/reader/file_batch_reader.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/paimon/reader/file_batch_reader.h b/include/paimon/reader/file_batch_reader.h index 272de3c82..586c8cbc7 100644 --- a/include/paimon/reader/file_batch_reader.h +++ b/include/paimon/reader/file_batch_reader.h @@ -46,8 +46,8 @@ class PAIMON_EXPORT FileBatchReader : public BatchReader { using BatchReader::NextBatch; using BatchReader::NextBatchWithBitmap; - /// Get the row number of the first row in the previously read batch. - virtual Result GetPreviousBatchFirstRowNumber() const = 0; + /// Get the global row number of the row in the previously read batch. + virtual Result GetPreviousBatchGlobalRowId(uint64_t batch_row_id) const = 0; /// Get the number of rows in the file. virtual Result GetNumberOfRows() const = 0; From 41f932d3c1eb2712ae4b0a8c231ac22249022cca Mon Sep 17 00:00:00 2001 From: zhouhongfeng Date: Fri, 26 Jun 2026 14:06:49 +0800 Subject: [PATCH 04/27] fix: return Status::Invalid intead of returning max value --- .../prefetch_file_batch_reader_impl.cpp | 11 ++++- .../prefetch_file_batch_reader_impl_test.cpp | 33 +++++++------ .../parquet/parquet_file_batch_reader.cpp | 1 + .../parquet/parquet_file_batch_reader.h | 9 +++- .../parquet_file_batch_reader_test.cpp | 47 ++++++++++--------- 5 files changed, 62 insertions(+), 39 deletions(-) diff --git a/src/paimon/common/reader/prefetch_file_batch_reader_impl.cpp b/src/paimon/common/reader/prefetch_file_batch_reader_impl.cpp index b779d75c5..d22b1d318 100644 --- a/src/paimon/common/reader/prefetch_file_batch_reader_impl.cpp +++ b/src/paimon/common/reader/prefetch_file_batch_reader_impl.cpp @@ -564,7 +564,7 @@ Result PrefetchFileBatchReaderImpl::NextBatchW assert(false); return Status::Invalid("peek batch not suppose to be nullptr"); } - // current_batch_global_row_ids_.clear(); + current_batch_global_row_ids_.clear(); return BatchReader::MakeEofBatchWithBitmap(); } if (value_count == prefetch_queues_.size()) { @@ -600,8 +600,15 @@ Result> PrefetchFileBatchReaderImpl::GetFileSchem Result PrefetchFileBatchReaderImpl::GetPreviousBatchGlobalRowId( uint64_t batch_row_id) const { + if (current_batch_global_row_ids_.size() == 0) { + return Status::Invalid( + "Last batch is not read or last batch is empty, cannot get previous batch global row " + "id"); + } if (batch_row_id >= current_batch_global_row_ids_.size()) { - return std::numeric_limits::max(); + return Status::Invalid( + fmt::format("batch_row_id {} is out of range, last batch row count is {}", batch_row_id, + current_batch_global_row_ids_.size())); } return current_batch_global_row_ids_[batch_row_id]; } diff --git a/src/paimon/common/reader/prefetch_file_batch_reader_impl_test.cpp b/src/paimon/common/reader/prefetch_file_batch_reader_impl_test.cpp index f9713af54..120cd67e6 100644 --- a/src/paimon/common/reader/prefetch_file_batch_reader_impl_test.cpp +++ b/src/paimon/common/reader/prefetch_file_batch_reader_impl_test.cpp @@ -284,8 +284,7 @@ TEST_F(PrefetchFileBatchReaderImplTest, TestSimple) { /*enable_adaptive_prefetch_strategy=*/false, executor_, /*initialize_read_ranges=*/true, /*prefetch_cache_mode=*/PrefetchCacheMode::ALWAYS, CacheConfig(), GetDefaultPool())); - ASSERT_EQ(reader->GetPreviousBatchGlobalRowId(0).value(), - std::numeric_limits::max()); + ASSERT_NOK(reader->GetPreviousBatchGlobalRowId(0)); ASSERT_OK_AND_ASSIGN(auto result_array, ReadResultCollector::CollectResult( reader.get(), /*max simulated data processing time*/ 100)); @@ -606,10 +605,11 @@ TEST_F(PrefetchFileBatchReaderImplTest, TestReadWithLargeBatchSize) { prefetch_max_parallel_num * 2, /*enable_adaptive_prefetch_strategy=*/false, executor_, /*initialize_read_ranges=*/true, /*prefetch_cache_mode=*/PrefetchCacheMode::ALWAYS, CacheConfig(), GetDefaultPool())); - ASSERT_EQ(reader->GetPreviousBatchGlobalRowId(0).value(), std::numeric_limits::max()); + ASSERT_NOK(reader->GetPreviousBatchGlobalRowId(0)); ASSERT_OK_AND_ASSIGN(auto result_array, ReadResultCollector::CollectResult( reader.get(), /*max simulated data processing time*/ 100)); + ASSERT_NOK(reader->GetPreviousBatchGlobalRowId(0)); auto expected_array = std::make_shared(data_array); ASSERT_TRUE(result_array->Equals(expected_array)); } @@ -633,11 +633,13 @@ TEST_F(PrefetchFileBatchReaderImplTest, TestPartialReaderSuccessRead) { } arrow::ArrayVector result_array_vector; - ASSERT_EQ(reader->GetPreviousBatchGlobalRowId(0).value(), std::numeric_limits::max()); + ASSERT_NOK(reader->GetPreviousBatchGlobalRowId(0)); ASSERT_OK_AND_ASSIGN(auto batch_with_bitmap, reader->NextBatchWithBitmap()); auto& [batch, bitmap] = batch_with_bitmap; ASSERT_EQ(batch.first->length, bitmap.Cardinality()); - ASSERT_EQ(reader->GetPreviousBatchGlobalRowId(0).value(), 0); + uint64_t global_row_id = 0; + ASSERT_OK_AND_ASSIGN(global_row_id, reader->GetPreviousBatchGlobalRowId(0)); + ASSERT_EQ(global_row_id, 0); ASSERT_OK_AND_ASSIGN(auto array, ReadResultCollector::GetArray(std::move(batch))); result_array_vector.push_back(array); ASSERT_OK(prefetch_reader->GetReadStatus()); @@ -678,9 +680,9 @@ TEST_F(PrefetchFileBatchReaderImplTest, TestAllReaderFailedWithIOError) { ->SetNextBatchStatus(Status::IOError("mock error")); } - ASSERT_EQ(reader->GetPreviousBatchGlobalRowId(0).value(), std::numeric_limits::max()); + ASSERT_NOK(reader->GetPreviousBatchGlobalRowId(0)); auto batch_result = reader->NextBatchWithBitmap(); - ASSERT_EQ(reader->GetPreviousBatchGlobalRowId(0).value(), std::numeric_limits::max()); + ASSERT_NOK(reader->GetPreviousBatchGlobalRowId(0)); ASSERT_FALSE(batch_result.ok()); ASSERT_TRUE(batch_result.status().IsIOError()); ASSERT_FALSE(prefetch_reader->is_shutdown_); @@ -689,7 +691,7 @@ TEST_F(PrefetchFileBatchReaderImplTest, TestAllReaderFailedWithIOError) { // call NextBatch again, will still return error status auto batch_result2 = reader->NextBatchWithBitmap(); - ASSERT_EQ(reader->GetPreviousBatchGlobalRowId(0).value(), std::numeric_limits::max()); + ASSERT_NOK(reader->GetPreviousBatchGlobalRowId(0)); ASSERT_FALSE(batch_result2.ok()); ASSERT_TRUE(batch_result2.status().IsIOError()); } @@ -706,11 +708,11 @@ TEST_F(PrefetchFileBatchReaderImplTest, TestPrefetchWithEmptyData) { prefetch_max_parallel_num * 2, /*enable_adaptive_prefetch_strategy=*/false, executor_, /*initialize_read_ranges=*/true, /*prefetch_cache_mode=*/PrefetchCacheMode::ALWAYS, CacheConfig(), GetDefaultPool())); - ASSERT_EQ(reader->GetPreviousBatchGlobalRowId(0).value(), std::numeric_limits::max()); + ASSERT_NOK(reader->GetPreviousBatchGlobalRowId(0)); ASSERT_OK_AND_ASSIGN(auto result_array, ReadResultCollector::CollectResult( reader.get(), /*max simulated data processing time*/ 100)); - ASSERT_EQ(reader->GetPreviousBatchGlobalRowId(0).value(), std::numeric_limits::max()); + ASSERT_NOK(reader->GetPreviousBatchGlobalRowId(0)); ASSERT_FALSE(result_array); } @@ -726,7 +728,7 @@ TEST_F(PrefetchFileBatchReaderImplTest, TestCallNextBatchAfterReadingEof) { prefetch_max_parallel_num * 2, /*enable_adaptive_prefetch_strategy=*/false, executor_, /*initialize_read_ranges=*/true, /*prefetch_cache_mode=*/PrefetchCacheMode::ALWAYS, CacheConfig(), GetDefaultPool())); - ASSERT_EQ(reader->GetPreviousBatchGlobalRowId(0).value(), std::numeric_limits::max()); + ASSERT_NOK(reader->GetPreviousBatchGlobalRowId(0)); ASSERT_OK_AND_ASSIGN(auto result_array, ReadResultCollector::CollectResult( reader.get(), /*max simulated data processing time*/ 100)); @@ -735,6 +737,7 @@ TEST_F(PrefetchFileBatchReaderImplTest, TestCallNextBatchAfterReadingEof) { // continue to call NextBatch() after reading eof ASSERT_OK_AND_ASSIGN(auto batch_with_bitmap, reader->NextBatchWithBitmap()); + ASSERT_NOK(reader->GetPreviousBatchGlobalRowId(0)); ASSERT_TRUE(BatchReader::IsEofBatch(batch_with_bitmap)); } @@ -831,11 +834,11 @@ TEST_P(PrefetchFileBatchReaderImplTest, TestPrefetchWithPredicatePushdownWithCom PreparePrefetchReader(file_format, schema.get(), predicate, /*selection_bitmap=*/std::nullopt, /*batch_size=*/10, /*prefetch_max_parallel_num=*/3, cache_mode); - ASSERT_EQ(reader->GetPreviousBatchGlobalRowId(0).value(), std::numeric_limits::max()); + ASSERT_NOK(reader->GetPreviousBatchGlobalRowId(0)); ASSERT_OK_AND_ASSIGN(auto result_array, ReadResultCollector::CollectResult( reader.get(), /*max simulated data processing time*/ 100)); - ASSERT_EQ(reader->GetPreviousBatchGlobalRowId(0).value(), 80); + ASSERT_NOK(reader->GetPreviousBatchGlobalRowId(0)); arrow::ArrayVector expected_array_vector; expected_array_vector.push_back(data_array->Slice(0, 30)); @@ -867,11 +870,11 @@ TEST_P(PrefetchFileBatchReaderImplTest, /*selection_bitmap=*/std::nullopt, /*batch_size=*/10, /*prefetch_max_parallel_num=*/3, cache_mode); ASSERT_OK(reader->RefreshReadRanges()); - ASSERT_EQ(reader->GetPreviousBatchGlobalRowId(0).value(), std::numeric_limits::max()); + ASSERT_NOK(reader->GetPreviousBatchGlobalRowId(0)); ASSERT_OK_AND_ASSIGN(auto result_array, ReadResultCollector::CollectResult( reader.get(), /*max simulated data processing time*/ 100)); - ASSERT_EQ(reader->GetPreviousBatchGlobalRowId(0).value(), 80); + ASSERT_NOK(reader->GetPreviousBatchGlobalRowId(0)); arrow::ArrayVector expected_array_vector; expected_array_vector.push_back(data_array->Slice(0, 20)); diff --git a/src/paimon/format/parquet/parquet_file_batch_reader.cpp b/src/paimon/format/parquet/parquet_file_batch_reader.cpp index 16c79dd0e..e6617ecab 100644 --- a/src/paimon/format/parquet/parquet_file_batch_reader.cpp +++ b/src/paimon/format/parquet/parquet_file_batch_reader.cpp @@ -346,6 +346,7 @@ Result ParquetFileBatchReader::NextBatch() { try { PAIMON_ASSIGN_OR_RAISE(std::shared_ptr batch, reader_->Next()); if (batch == nullptr) { + row_mapping_.clear(); return BatchReader::MakeEofBatch(); } PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr array, diff --git a/src/paimon/format/parquet/parquet_file_batch_reader.h b/src/paimon/format/parquet/parquet_file_batch_reader.h index 8dde7e6a3..4d2743eef 100644 --- a/src/paimon/format/parquet/parquet_file_batch_reader.h +++ b/src/paimon/format/parquet/parquet_file_batch_reader.h @@ -97,8 +97,15 @@ class ParquetFileBatchReader : public PrefetchFileBatchReader { bool* need_prefetch) const override; Result GetPreviousBatchGlobalRowId(uint64_t batch_row_id) const override { + if (row_mapping_.size() == 0) { + return Status::Invalid( + "Last batch is not read or last batch is empty, cannot get previous batch global " + "row id"); + } if (batch_row_id >= row_mapping_.size()) { - return std::numeric_limits::max(); + return Status::Invalid( + fmt::format("batch_row_id {} is out of range, last batch row count is {}", + batch_row_id, row_mapping_.size())); } return row_mapping_[batch_row_id]; } diff --git a/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp b/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp index c92cfc196..c9ddaab8e 100644 --- a/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp +++ b/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp @@ -475,8 +475,6 @@ TEST_F(ParquetFileBatchReaderTest, TestNextBatchSimple) { auto parquet_batch_reader = PrepareParquetFileBatchReader(file_name, schema_, /*predicate=*/nullptr, /*selection_bitmap=*/std::nullopt, batch_size); - ASSERT_EQ(parquet_batch_reader->GetPreviousBatchGlobalRowId(0).value(), - std::numeric_limits::max()); ASSERT_OK_AND_ASSIGN(auto result_array, paimon::test::ReadResultCollector::CollectResult( parquet_batch_reader.get())); parquet_batch_reader->Close(); @@ -838,21 +836,24 @@ TEST_F(ParquetFileBatchReaderTest, TestReadNoField) { auto parquet_batch_reader = PrepareParquetFileBatchReader(file_name, read_schema, /*predicate=*/nullptr, /*selection_bitmap=*/std::nullopt, /*batch_size=*/2); + uint64_t global_row_id = 0; // read 2 rows - ASSERT_EQ(parquet_batch_reader->GetPreviousBatchGlobalRowId(0).value(), - std::numeric_limits::max()); + ASSERT_NOK(parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); ASSERT_OK_AND_ASSIGN(auto batch1, parquet_batch_reader->NextBatch()); - ASSERT_EQ(parquet_batch_reader->GetPreviousBatchGlobalRowId(0).value(), 0); + ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); + ASSERT_EQ(global_row_id, 0); // read 2 rows ASSERT_OK_AND_ASSIGN(auto batch2, parquet_batch_reader->NextBatch()); - ASSERT_EQ(parquet_batch_reader->GetPreviousBatchGlobalRowId(0).value(), 2); + ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); + ASSERT_EQ(global_row_id, 2); // read 2 rows ASSERT_OK_AND_ASSIGN(auto batch3, parquet_batch_reader->NextBatch()); - ASSERT_EQ(parquet_batch_reader->GetPreviousBatchGlobalRowId(0).value(), 4); + ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); + ASSERT_EQ(global_row_id, 4); // read rows with eof ASSERT_OK_AND_ASSIGN(auto batch4, parquet_batch_reader->NextBatch()); - ASSERT_EQ(parquet_batch_reader->GetPreviousBatchGlobalRowId(0).value(), 4); ASSERT_TRUE(BatchReader::IsEofBatch(batch4)); + ASSERT_NOK(parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); parquet_batch_reader->Close(); arrow::FieldVector fields; @@ -1060,14 +1061,17 @@ TEST_F(ParquetFileBatchReaderTest, TestRowMapping) { auto parquet_batch_reader = PrepareParquetFileBatchReader( file_path_, arrow_schema, /*predicate=*/predicate, std::nullopt, /*batch_size=*/2); - ASSERT_EQ(parquet_batch_reader->GetPreviousBatchGlobalRowId(0).value(), - std::numeric_limits::max()); - + uint64_t global_row_id = 0; + ASSERT_NOK(parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); ASSERT_OK_AND_ASSIGN(auto batch1, CollectOneBatch(parquet_batch_reader.get())); auto expected_batch1 = src_array->Slice(1, 2); ASSERT_TRUE(batch1->Equals(expected_batch1)) << batch1->ToString(); - ASSERT_EQ(parquet_batch_reader->GetPreviousBatchGlobalRowId(0).value(), 1); - ASSERT_EQ(parquet_batch_reader->GetPreviousBatchGlobalRowId(1).value(), 2); + ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); + ASSERT_EQ(global_row_id, 1); + ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(1)); + ASSERT_EQ(global_row_id, 2); + // out of bound return invalid + ASSERT_NOK(parquet_batch_reader->GetPreviousBatchGlobalRowId(2)); // Not adjacent pages ASSERT_OK_AND_ASSIGN(auto batch2, CollectOneBatch(parquet_batch_reader.get())); @@ -1078,22 +1082,23 @@ TEST_F(ParquetFileBatchReaderTest, TestRowMapping) { ])") .ValueOrDie()); ASSERT_TRUE(batch2->Equals(expected_batch2)) << batch2->ToString(); - ASSERT_EQ(parquet_batch_reader->GetPreviousBatchGlobalRowId(0).value(), 3); - ASSERT_EQ(parquet_batch_reader->GetPreviousBatchGlobalRowId(1).value(), 5); + ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); + ASSERT_EQ(global_row_id, 3); + ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(1)); + ASSERT_EQ(global_row_id, 5); // Only one record read ASSERT_OK_AND_ASSIGN(auto batch3, CollectOneBatch(parquet_batch_reader.get())); auto expected_batch3 = src_array->Slice(6, 1); ASSERT_TRUE(batch3->Equals(expected_batch3)) << batch3->ToString(); - ASSERT_EQ(parquet_batch_reader->GetPreviousBatchGlobalRowId(0).value(), 6); - // out of bound, return max value - ASSERT_EQ(parquet_batch_reader->GetPreviousBatchGlobalRowId(1).value(), - std::numeric_limits::max()); + ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); + ASSERT_EQ(global_row_id, 6); + ASSERT_NOK(parquet_batch_reader->GetPreviousBatchGlobalRowId(1)); ASSERT_OK_AND_ASSIGN(auto eof_batch, CollectOneBatch(parquet_batch_reader.get())); ASSERT_EQ(nullptr, eof_batch); - // previous batch is eof, return last none-eof batch's row id - ASSERT_EQ(parquet_batch_reader->GetPreviousBatchGlobalRowId(0).value(), 6); + // previous batch is eof, return invalid. + ASSERT_NOK(parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); } } // namespace paimon::parquet::test From 6bd98d8cee06c80b8e09f6933aa53e3978d879eb Mon Sep 17 00:00:00 2001 From: zhouhongfeng Date: Fri, 26 Jun 2026 14:09:12 +0800 Subject: [PATCH 05/27] fix: lance and blob return NotImplemented --- src/paimon/format/blob/blob_file_batch_reader.h | 2 +- src/paimon/format/lance/lance_file_batch_reader.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/paimon/format/blob/blob_file_batch_reader.h b/src/paimon/format/blob/blob_file_batch_reader.h index 15d6c8075..c4619aff5 100644 --- a/src/paimon/format/blob/blob_file_batch_reader.h +++ b/src/paimon/format/blob/blob_file_batch_reader.h @@ -99,7 +99,7 @@ class BlobFileBatchReader : public FileBatchReader { Result GetPreviousBatchGlobalRowId(uint64_t batch_row_id) const override { if (all_blob_lengths_.size() != target_blob_lengths_.size()) { - return Status::Invalid( + return Status::NotImplemented( "Cannot call GetPreviousBatchGlobalRowId in BlobFileBatchReader because, after " "bitmap pushdown, rows in the array returned by NextBatch are no longer " "contiguous."); diff --git a/src/paimon/format/lance/lance_file_batch_reader.h b/src/paimon/format/lance/lance_file_batch_reader.h index ba8e16cf2..cfd899c76 100644 --- a/src/paimon/format/lance/lance_file_batch_reader.h +++ b/src/paimon/format/lance/lance_file_batch_reader.h @@ -44,7 +44,7 @@ class LanceFileBatchReader : public FileBatchReader { Result GetPreviousBatchGlobalRowId(uint64_t batch_row_id) const override { if (!read_row_ids_.empty() && read_row_ids_.size() != num_rows_) { // TODO(xinyu.lxy): support function - return Status::Invalid( + return Status::NotImplemented( "Cannot call GetPreviousBatchGlobalRowId in LanceFileBatchReader because, after " "bitmap pushdown, rows in the array returned by NextBatch are no longer " "contiguous."); From a6945676a4b6b4cc7d5ad7280900c693fd77b095 Mon Sep 17 00:00:00 2001 From: zhouhongfeng Date: Fri, 26 Jun 2026 14:42:36 +0800 Subject: [PATCH 06/27] fix: add inclusive extend for fully matched rowgroup in SetReadSchema --- .../parquet/parquet_file_batch_reader.cpp | 4 +- .../parquet_file_batch_reader_test.cpp | 43 ++++++++++++++++++- 2 files changed, 44 insertions(+), 3 deletions(-) diff --git a/src/paimon/format/parquet/parquet_file_batch_reader.cpp b/src/paimon/format/parquet/parquet_file_batch_reader.cpp index e6617ecab..123eef80b 100644 --- a/src/paimon/format/parquet/parquet_file_batch_reader.cpp +++ b/src/paimon/format/parquet/parquet_file_batch_reader.cpp @@ -217,7 +217,9 @@ Status ParquetFileBatchReader::SetReadSchema( target_row_groups.emplace_back( /*rg_index=*/rg_id, /*is_partially_matched=*/false, - /*ranges=*/RowRanges(Range(0, reader_->GetAllRowGroupRanges()[rg_id].second))); + /*ranges=*/ + RowRanges(Range(0, reader_->GetAllRowGroupRanges()[rg_id].second - + reader_->GetAllRowGroupRanges()[rg_id].first - 1))); } } PAIMON_ASSIGN_OR_RAISE(all_row_ranges_, GetAllTargetRowRanges(target_row_groups)); diff --git a/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp b/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp index c9ddaab8e..627586ffc 100644 --- a/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp +++ b/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp @@ -1041,10 +1041,10 @@ TEST_F(ParquetFileBatchReaderTest, TestAddMetadataPerFieldMetadata) { ASSERT_TRUE(data->Equals(*result_array->chunk(0))) << result_array->ToString(); } -TEST_F(ParquetFileBatchReaderTest, TestRowMapping) { +TEST_F(ParquetFileBatchReaderTest, TestRowMappingSimple) { arrow::FieldVector fields = {arrow::field("f0", arrow::int32())}; auto src_array = MakeSequentialIntData(12); - // data in file rowGroup0:[0, 1, 2, 3, 4, 5] | rowGroup1:[6, 7, 8, 9, 10, 11] + // data in file rowGroup0:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] // one row per page auto arrow_schema = arrow::schema(fields); WriteArray(file_path_, src_array, arrow_schema, /*write_batch_size=*/1, @@ -1101,4 +1101,43 @@ TEST_F(ParquetFileBatchReaderTest, TestRowMapping) { ASSERT_NOK(parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); } +TEST_F(ParquetFileBatchReaderTest, TestRowMappingFullyAndPartially) { + arrow::FieldVector fields = {arrow::field("f0", arrow::int32())}; + auto src_array = MakeSequentialIntData(12); + // data in file RowGroup0:[0, 1, 2] | RowGroup1:[3, 4, 5] | RowGroup2:[6, 7, 8] | RowGroup3:[9, + // 10, 11] one row per page + auto arrow_schema = arrow::schema(fields); + WriteArray(file_path_, src_array, arrow_schema, /*write_batch_size=*/1, + /*enable_dictionary=*/true, /*max_row_group_length=*/3, /*max_page_size=*/1); + + // 3<=f0<=5 || f0==6 || f0==8 + // RowGroup 1 is fully matched, RowGroup 2 is partially matched, RowGroup 0 and RowGroup 3 are + // not matched. + ASSERT_OK_AND_ASSIGN( + auto predicate, + PredicateBuilder::Or({PredicateBuilder::Between(/*field_index=*/0, /*field_name=*/"f0", + FieldType::INT, Literal(3), Literal(5)), + PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", + FieldType::INT, Literal(6)), + PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", + FieldType::INT, Literal(8))})); + + auto parquet_batch_reader = PrepareParquetFileBatchReader( + file_path_, arrow_schema, /*predicate=*/predicate, std::nullopt, /*batch_size=*/3); + + uint64_t global_row_id = 0; + ASSERT_NOK(parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); + ASSERT_OK_AND_ASSIGN(auto batch1, CollectOneBatch(parquet_batch_reader.get())); + ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); + ASSERT_EQ(global_row_id, 3); + ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(2)); + ASSERT_EQ(global_row_id, 5); + + ASSERT_OK_AND_ASSIGN(auto batch2, CollectOneBatch(parquet_batch_reader.get())); + ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); + ASSERT_EQ(global_row_id, 6); + ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(1)); + ASSERT_EQ(global_row_id, 8); +} + } // namespace paimon::parquet::test From eb48e4239d55526f4497347f297dd7942aadf8d2 Mon Sep 17 00:00:00 2001 From: zhouhongfeng Date: Fri, 26 Jun 2026 14:58:31 +0800 Subject: [PATCH 07/27] fix: calling SetReadSchema many time do not clear row_mapping --- .../parquet/parquet_file_batch_reader.cpp | 3 +- .../parquet/parquet_file_batch_reader.h | 3 +- .../parquet_file_batch_reader_test.cpp | 50 +++++++++++++++++++ 3 files changed, 53 insertions(+), 3 deletions(-) diff --git a/src/paimon/format/parquet/parquet_file_batch_reader.cpp b/src/paimon/format/parquet/parquet_file_batch_reader.cpp index 123eef80b..73d066b6a 100644 --- a/src/paimon/format/parquet/parquet_file_batch_reader.cpp +++ b/src/paimon/format/parquet/parquet_file_batch_reader.cpp @@ -534,7 +534,8 @@ Result> ParquetFileBatchReader::ComputeNestedColumnIndices( } Result ParquetFileBatchReader::GetAllTargetRowRanges( - const std::vector& target_row_groups) const { + const std::vector& target_row_groups) { + row_mapping_.clear(); auto all_row_group_ranges = reader_->GetAllRowGroupRanges(); RowRanges all_ranges; for (const auto& target_row_group : target_row_groups) { diff --git a/src/paimon/format/parquet/parquet_file_batch_reader.h b/src/paimon/format/parquet/parquet_file_batch_reader.h index 4d2743eef..b05345037 100644 --- a/src/paimon/format/parquet/parquet_file_batch_reader.h +++ b/src/paimon/format/parquet/parquet_file_batch_reader.h @@ -184,8 +184,7 @@ class ParquetFileBatchReader : public PrefetchFileBatchReader { const std::shared_ptr& read_schema, const std::shared_ptr& file_schema); - Result GetAllTargetRowRanges( - const std::vector& target_row_groups) const; + Result GetAllTargetRowRanges(const std::vector& target_row_groups); // precondition: predicate supposed not be empty Result> FilterRowGroupsByPredicate( diff --git a/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp b/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp index 627586ffc..23e516dd4 100644 --- a/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp +++ b/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp @@ -1140,4 +1140,54 @@ TEST_F(ParquetFileBatchReaderTest, TestRowMappingFullyAndPartially) { ASSERT_EQ(global_row_id, 8); } +TEST_F(ParquetFileBatchReaderTest, TestRowMappingSetReadSchemaTwice) { + arrow::FieldVector fields = {arrow::field("f0", arrow::int32())}; + auto src_array = MakeSequentialIntData(12); + // data in file RowGroup0:[0, 1, 2] | RowGroup1:[3, 4, 5] | RowGroup2:[6, 7, 8] | RowGroup3:[9, + // 10, 11] one row per page + auto arrow_schema = arrow::schema(fields); + WriteArray(file_path_, src_array, arrow_schema, /*write_batch_size=*/1, + /*enable_dictionary=*/true, /*max_row_group_length=*/3, /*max_page_size=*/1); + + // 1<=f0<=3 || 6<=f0<=7 + ASSERT_OK_AND_ASSIGN( + auto predicate, + PredicateBuilder::Or({PredicateBuilder::Between(/*field_index=*/0, /*field_name=*/"f0", + FieldType::INT, Literal(1), Literal(3)), + PredicateBuilder::Between(/*field_index=*/0, /*field_name=*/"f0", + FieldType::INT, Literal(6), Literal(7))})); + + auto parquet_batch_reader = PrepareParquetFileBatchReader( + file_path_, arrow_schema, /*predicate=*/predicate, std::nullopt, /*batch_size=*/3); + + uint64_t global_row_id = 0; + ASSERT_NOK(parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); + ASSERT_OK_AND_ASSIGN(auto batch1, CollectOneBatch(parquet_batch_reader.get())); + ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); + ASSERT_EQ(global_row_id, 1); + ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(1)); + ASSERT_EQ(global_row_id, 2); + + ASSERT_OK_AND_ASSIGN(auto batch2, CollectOneBatch(parquet_batch_reader.get())); + ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); + ASSERT_EQ(global_row_id, 3); + ASSERT_NOK(parquet_batch_reader->GetPreviousBatchGlobalRowId(1)); + + ASSERT_OK_AND_ASSIGN( + predicate, + PredicateBuilder::Or({PredicateBuilder::Between(/*field_index=*/0, /*field_name=*/"f0", + FieldType::INT, Literal(3), Literal(5))})); + + std::unique_ptr c_schema = std::make_unique(); + auto arrow_status = arrow::ExportSchema(*arrow_schema, c_schema.get()); + ASSERT_OK( + parquet_batch_reader->SetReadSchema(c_schema.get(), /*predicate=*/predicate, std::nullopt)); + ASSERT_NOK(parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); + ASSERT_OK_AND_ASSIGN(auto batch3, CollectOneBatch(parquet_batch_reader.get())); + ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); + ASSERT_EQ(global_row_id, 3); + ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(2)); + ASSERT_EQ(global_row_id, 5); +} + } // namespace paimon::parquet::test From 7b0079453e87b968ff364262cdea34776bcba699 Mon Sep 17 00:00:00 2001 From: zhouhongfeng Date: Fri, 26 Jun 2026 16:01:27 +0800 Subject: [PATCH 08/27] test: add test for PrefetchFileBatchReaderImpl --- .../prefetch_file_batch_reader_impl_test.cpp | 52 +++++++++++++++++++ .../parquet_file_batch_reader_test.cpp | 49 ++++++++--------- .../testing/utils/read_result_collector.h | 47 +++++++++++++++++ 3 files changed, 121 insertions(+), 27 deletions(-) diff --git a/src/paimon/common/reader/prefetch_file_batch_reader_impl_test.cpp b/src/paimon/common/reader/prefetch_file_batch_reader_impl_test.cpp index 120cd67e6..e3cda6eea 100644 --- a/src/paimon/common/reader/prefetch_file_batch_reader_impl_test.cpp +++ b/src/paimon/common/reader/prefetch_file_batch_reader_impl_test.cpp @@ -914,4 +914,56 @@ TEST_F(PrefetchFileBatchReaderImplTest, TestPrefetchWithBitmap) { ASSERT_TRUE(result_chunk_array->Equals(expected_chunk_array)); } +TEST_P(PrefetchFileBatchReaderImplTest, TestRowMapping) { + auto [file_format, cache_mode] = GetParam(); + auto data_array = PrepareArray(90); + PrepareTestData(file_format, data_array, /*stripe_row_count=*/30, /*row_index_stride=*/10); + auto schema = arrow::schema(fields_); + ASSERT_OK_AND_ASSIGN( + auto predicate, + PredicateBuilder::Or({ + PredicateBuilder::Between(/*field_index=*/1, /*field_name=*/"f1", FieldType::BIGINT, + Literal(20l), Literal(29l)), + PredicateBuilder::Between(/*field_index=*/1, /*field_name=*/"f1", FieldType::BIGINT, + Literal(70l), Literal(79l)), + })); + + auto reader = + PreparePrefetchReader(file_format, schema.get(), predicate, + /*selection_bitmap=*/std::nullopt, + /*batch_size=*/10, /*prefetch_max_parallel_num=*/3, cache_mode); + uint64_t global_row_id = 0; + ASSERT_NOK(reader->GetPreviousBatchGlobalRowId(0)); + ASSERT_OK_AND_ASSIGN(auto batch, + paimon::test::ReadResultCollector::CollectResultOneBatch(reader.get())); + ASSERT_OK_AND_ASSIGN(global_row_id, reader->GetPreviousBatchGlobalRowId(0)); + ASSERT_EQ(global_row_id, 20); + ASSERT_OK_AND_ASSIGN(global_row_id, reader->GetPreviousBatchGlobalRowId(5)); + ASSERT_EQ(global_row_id, 25); + + ASSERT_OK_AND_ASSIGN(batch, + paimon::test::ReadResultCollector::CollectResultOneBatch(reader.get())); + ASSERT_OK_AND_ASSIGN(global_row_id, reader->GetPreviousBatchGlobalRowId(0)); + ASSERT_EQ(global_row_id, 70); + ASSERT_OK_AND_ASSIGN(global_row_id, reader->GetPreviousBatchGlobalRowId(5)); + ASSERT_EQ(global_row_id, 75); + + // Set read schema again + std::unique_ptr c_schema = std::make_unique(); + ASSERT_TRUE(arrow::ExportSchema(*schema, c_schema.get()).ok()); + predicate = PredicateBuilder::Between(/*field_index=*/1, /*field_name=*/"f1", FieldType::BIGINT, + Literal(30l), Literal(49l)); + ASSERT_OK(reader->SetReadSchema(c_schema.get(), predicate, std::nullopt)); + + ASSERT_NOK(reader->GetPreviousBatchGlobalRowId(0)); + ASSERT_OK_AND_ASSIGN(batch, + paimon::test::ReadResultCollector::CollectResultOneBatch(reader.get())); + ASSERT_OK_AND_ASSIGN(global_row_id, reader->GetPreviousBatchGlobalRowId(0)); + ASSERT_EQ(global_row_id, 30); + ASSERT_OK_AND_ASSIGN(batch, + paimon::test::ReadResultCollector::CollectResultOneBatch(reader.get())); + ASSERT_OK_AND_ASSIGN(global_row_id, reader->GetPreviousBatchGlobalRowId(5)); + ASSERT_EQ(global_row_id, 45); +} + } // namespace paimon::test diff --git a/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp b/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp index 23e516dd4..4e705674f 100644 --- a/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp +++ b/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp @@ -243,20 +243,6 @@ static std::shared_ptr MakeSequentialIntData(int32_t num_row return arrow::StructArray::Make({val_array}, {field}).ValueOrDie(); } -static Result> CollectOneBatch(ParquetFileBatchReader* reader) { - PAIMON_ASSIGN_OR_RAISE(BatchReader::ReadBatch batch, reader->NextBatch()); - if (BatchReader::IsEofBatch(batch)) { - return std::shared_ptr(nullptr); - } - PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr result_array, - arrow::ImportArray(batch.first.get(), batch.second.get())); - auto struct_array = std::dynamic_pointer_cast(result_array); - if (!struct_array) { - return Status::Invalid("CollectOneBatch expected StructArray"); - } - return struct_array; -} - TEST_F(ParquetFileBatchReaderTest, TestParquetMetadataCacheReusesSerializedFooter) { WriteArray(file_path_, struct_array_, schema_, /*write_batch_size=*/struct_array_->length(), /*enable_dictionary=*/false, @@ -1063,9 +1049,10 @@ TEST_F(ParquetFileBatchReaderTest, TestRowMappingSimple) { uint64_t global_row_id = 0; ASSERT_NOK(parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); - ASSERT_OK_AND_ASSIGN(auto batch1, CollectOneBatch(parquet_batch_reader.get())); + ASSERT_OK_AND_ASSIGN(auto batch1, paimon::test::ReadResultCollector::CollectResultOneBatch( + parquet_batch_reader.get())); auto expected_batch1 = src_array->Slice(1, 2); - ASSERT_TRUE(batch1->Equals(expected_batch1)) << batch1->ToString(); + ASSERT_TRUE(batch1->chunk(0)->Equals(expected_batch1)) << batch1->ToString(); ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); ASSERT_EQ(global_row_id, 1); ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(1)); @@ -1074,28 +1061,31 @@ TEST_F(ParquetFileBatchReaderTest, TestRowMappingSimple) { ASSERT_NOK(parquet_batch_reader->GetPreviousBatchGlobalRowId(2)); // Not adjacent pages - ASSERT_OK_AND_ASSIGN(auto batch2, CollectOneBatch(parquet_batch_reader.get())); + ASSERT_OK_AND_ASSIGN(auto batch2, paimon::test::ReadResultCollector::CollectResultOneBatch( + parquet_batch_reader.get())); auto expected_batch2 = std::dynamic_pointer_cast( arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), R"([ [3], [5] ])") .ValueOrDie()); - ASSERT_TRUE(batch2->Equals(expected_batch2)) << batch2->ToString(); + ASSERT_TRUE(batch2->chunk(0)->Equals(expected_batch2)) << batch2->ToString(); ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); ASSERT_EQ(global_row_id, 3); ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(1)); ASSERT_EQ(global_row_id, 5); // Only one record read - ASSERT_OK_AND_ASSIGN(auto batch3, CollectOneBatch(parquet_batch_reader.get())); + ASSERT_OK_AND_ASSIGN(auto batch3, paimon::test::ReadResultCollector::CollectResultOneBatch( + parquet_batch_reader.get())); auto expected_batch3 = src_array->Slice(6, 1); - ASSERT_TRUE(batch3->Equals(expected_batch3)) << batch3->ToString(); + ASSERT_TRUE(batch3->chunk(0)->Equals(expected_batch3)) << batch3->ToString(); ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); ASSERT_EQ(global_row_id, 6); ASSERT_NOK(parquet_batch_reader->GetPreviousBatchGlobalRowId(1)); - ASSERT_OK_AND_ASSIGN(auto eof_batch, CollectOneBatch(parquet_batch_reader.get())); + ASSERT_OK_AND_ASSIGN(auto eof_batch, paimon::test::ReadResultCollector::CollectResultOneBatch( + parquet_batch_reader.get())); ASSERT_EQ(nullptr, eof_batch); // previous batch is eof, return invalid. ASSERT_NOK(parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); @@ -1127,13 +1117,15 @@ TEST_F(ParquetFileBatchReaderTest, TestRowMappingFullyAndPartially) { uint64_t global_row_id = 0; ASSERT_NOK(parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); - ASSERT_OK_AND_ASSIGN(auto batch1, CollectOneBatch(parquet_batch_reader.get())); + ASSERT_OK_AND_ASSIGN(auto batch1, paimon::test::ReadResultCollector::CollectResultOneBatch( + parquet_batch_reader.get())); ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); ASSERT_EQ(global_row_id, 3); ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(2)); ASSERT_EQ(global_row_id, 5); - ASSERT_OK_AND_ASSIGN(auto batch2, CollectOneBatch(parquet_batch_reader.get())); + ASSERT_OK_AND_ASSIGN(auto batch2, paimon::test::ReadResultCollector::CollectResultOneBatch( + parquet_batch_reader.get())); ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); ASSERT_EQ(global_row_id, 6); ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(1)); @@ -1162,13 +1154,15 @@ TEST_F(ParquetFileBatchReaderTest, TestRowMappingSetReadSchemaTwice) { uint64_t global_row_id = 0; ASSERT_NOK(parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); - ASSERT_OK_AND_ASSIGN(auto batch1, CollectOneBatch(parquet_batch_reader.get())); + ASSERT_OK_AND_ASSIGN(auto batch1, paimon::test::ReadResultCollector::CollectResultOneBatch( + parquet_batch_reader.get())); ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); ASSERT_EQ(global_row_id, 1); ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(1)); ASSERT_EQ(global_row_id, 2); - ASSERT_OK_AND_ASSIGN(auto batch2, CollectOneBatch(parquet_batch_reader.get())); + ASSERT_OK_AND_ASSIGN(auto batch2, paimon::test::ReadResultCollector::CollectResultOneBatch( + parquet_batch_reader.get())); ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); ASSERT_EQ(global_row_id, 3); ASSERT_NOK(parquet_batch_reader->GetPreviousBatchGlobalRowId(1)); @@ -1179,11 +1173,12 @@ TEST_F(ParquetFileBatchReaderTest, TestRowMappingSetReadSchemaTwice) { FieldType::INT, Literal(3), Literal(5))})); std::unique_ptr c_schema = std::make_unique(); - auto arrow_status = arrow::ExportSchema(*arrow_schema, c_schema.get()); + ASSERT_TRUE(arrow::ExportSchema(*arrow_schema, c_schema.get()).ok()); ASSERT_OK( parquet_batch_reader->SetReadSchema(c_schema.get(), /*predicate=*/predicate, std::nullopt)); ASSERT_NOK(parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); - ASSERT_OK_AND_ASSIGN(auto batch3, CollectOneBatch(parquet_batch_reader.get())); + ASSERT_OK_AND_ASSIGN(auto batch3, paimon::test::ReadResultCollector::CollectResultOneBatch( + parquet_batch_reader.get())); ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); ASSERT_EQ(global_row_id, 3); ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(2)); diff --git a/src/paimon/testing/utils/read_result_collector.h b/src/paimon/testing/utils/read_result_collector.h index 5106f3b26..ce16a66dc 100644 --- a/src/paimon/testing/utils/read_result_collector.h +++ b/src/paimon/testing/utils/read_result_collector.h @@ -131,6 +131,53 @@ class ReadResultCollector { return chunk_array; } + static Result> CollectResultOneBatch( + BatchReader* batch_reader) { + return CollectResultOneBatch(batch_reader, /*max simulated data processing time*/ 0); + } + + static Result> CollectResultOneBatch( + BatchReader* batch_reader, int64_t max_data_processing_time_in_us) { + int64_t seed = DateTimeUtils::GetCurrentUTCTimeUs(); + std::srand(seed); + auto batch_result = batch_reader->NextBatch(); + BatchReader::ReadBatch batch; + if (!batch_result.ok()) { + if (batch_result.status().ToString().find("should use NextBatchWithBitmap") != + std::string::npos) { + PAIMON_ASSIGN_OR_RAISE(BatchReader::ReadBatchWithBitmap batch_with_bitmap, + batch_reader->NextBatchWithBitmap()); + if (BatchReader::IsEofBatch(batch_with_bitmap)) { + return std::shared_ptr(); + } + assert(!batch_with_bitmap.second.IsEmpty()); + PAIMON_ASSIGN_OR_RAISE( + batch, ReaderUtils::ApplyBitmapToReadBatch(std::move(batch_with_bitmap), + arrow::default_memory_pool())); + } else { + return batch_result.status(); + } + } else { + batch = std::move(batch_result).value(); + if (BatchReader::IsEofBatch(batch)) { + return std::shared_ptr(); + } + } + auto& [c_array, c_schema] = batch; + assert(c_array->length > 0); + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(auto result_array, + arrow::ImportArray(c_array.get(), c_schema.get())); + PAIMON_ASSIGN_OR_RAISE( + auto converted_array, + DictArrayConverter::ConvertDictArray(result_array, arrow::default_memory_pool())); + if (max_data_processing_time_in_us > 0) { + usleep(std::rand() % max_data_processing_time_in_us); + } + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(auto chunk_array, + arrow::ChunkedArray::Make({converted_array})); + return chunk_array; + } + static Result> GetArray(BatchReader::ReadBatch&& batch) { if (BatchReader::IsEofBatch(batch)) { return std::shared_ptr(); From 0d9a174cabca67a271828ff35e0bedc2dc72212b Mon Sep 17 00:00:00 2001 From: zhouhongfeng Date: Fri, 26 Jun 2026 16:13:42 +0800 Subject: [PATCH 09/27] style: replace auto in assigning macro with explicit type --- .../prefetch_file_batch_reader_impl_test.cpp | 2 +- .../parquet_file_batch_reader_test.cpp | 45 +++++++++++-------- .../testing/utils/read_result_collector.h | 4 +- 3 files changed, 30 insertions(+), 21 deletions(-) diff --git a/src/paimon/common/reader/prefetch_file_batch_reader_impl_test.cpp b/src/paimon/common/reader/prefetch_file_batch_reader_impl_test.cpp index e3cda6eea..350854900 100644 --- a/src/paimon/common/reader/prefetch_file_batch_reader_impl_test.cpp +++ b/src/paimon/common/reader/prefetch_file_batch_reader_impl_test.cpp @@ -934,7 +934,7 @@ TEST_P(PrefetchFileBatchReaderImplTest, TestRowMapping) { /*batch_size=*/10, /*prefetch_max_parallel_num=*/3, cache_mode); uint64_t global_row_id = 0; ASSERT_NOK(reader->GetPreviousBatchGlobalRowId(0)); - ASSERT_OK_AND_ASSIGN(auto batch, + ASSERT_OK_AND_ASSIGN(std::shared_ptr batch, paimon::test::ReadResultCollector::CollectResultOneBatch(reader.get())); ASSERT_OK_AND_ASSIGN(global_row_id, reader->GetPreviousBatchGlobalRowId(0)); ASSERT_EQ(global_row_id, 20); diff --git a/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp b/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp index 4e705674f..530915545 100644 --- a/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp +++ b/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp @@ -1049,8 +1049,9 @@ TEST_F(ParquetFileBatchReaderTest, TestRowMappingSimple) { uint64_t global_row_id = 0; ASSERT_NOK(parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); - ASSERT_OK_AND_ASSIGN(auto batch1, paimon::test::ReadResultCollector::CollectResultOneBatch( - parquet_batch_reader.get())); + ASSERT_OK_AND_ASSIGN( + std::shared_ptr batch1, + paimon::test::ReadResultCollector::CollectResultOneBatch(parquet_batch_reader.get())); auto expected_batch1 = src_array->Slice(1, 2); ASSERT_TRUE(batch1->chunk(0)->Equals(expected_batch1)) << batch1->ToString(); ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); @@ -1061,8 +1062,9 @@ TEST_F(ParquetFileBatchReaderTest, TestRowMappingSimple) { ASSERT_NOK(parquet_batch_reader->GetPreviousBatchGlobalRowId(2)); // Not adjacent pages - ASSERT_OK_AND_ASSIGN(auto batch2, paimon::test::ReadResultCollector::CollectResultOneBatch( - parquet_batch_reader.get())); + ASSERT_OK_AND_ASSIGN( + std::shared_ptr batch2, + paimon::test::ReadResultCollector::CollectResultOneBatch(parquet_batch_reader.get())); auto expected_batch2 = std::dynamic_pointer_cast( arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), R"([ [3], @@ -1076,16 +1078,18 @@ TEST_F(ParquetFileBatchReaderTest, TestRowMappingSimple) { ASSERT_EQ(global_row_id, 5); // Only one record read - ASSERT_OK_AND_ASSIGN(auto batch3, paimon::test::ReadResultCollector::CollectResultOneBatch( - parquet_batch_reader.get())); + ASSERT_OK_AND_ASSIGN( + std::shared_ptr batch3, + paimon::test::ReadResultCollector::CollectResultOneBatch(parquet_batch_reader.get())); auto expected_batch3 = src_array->Slice(6, 1); ASSERT_TRUE(batch3->chunk(0)->Equals(expected_batch3)) << batch3->ToString(); ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); ASSERT_EQ(global_row_id, 6); ASSERT_NOK(parquet_batch_reader->GetPreviousBatchGlobalRowId(1)); - ASSERT_OK_AND_ASSIGN(auto eof_batch, paimon::test::ReadResultCollector::CollectResultOneBatch( - parquet_batch_reader.get())); + ASSERT_OK_AND_ASSIGN( + std::shared_ptr eof_batch, + paimon::test::ReadResultCollector::CollectResultOneBatch(parquet_batch_reader.get())); ASSERT_EQ(nullptr, eof_batch); // previous batch is eof, return invalid. ASSERT_NOK(parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); @@ -1117,15 +1121,17 @@ TEST_F(ParquetFileBatchReaderTest, TestRowMappingFullyAndPartially) { uint64_t global_row_id = 0; ASSERT_NOK(parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); - ASSERT_OK_AND_ASSIGN(auto batch1, paimon::test::ReadResultCollector::CollectResultOneBatch( - parquet_batch_reader.get())); + ASSERT_OK_AND_ASSIGN( + std::shared_ptr batch1, + paimon::test::ReadResultCollector::CollectResultOneBatch(parquet_batch_reader.get())); ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); ASSERT_EQ(global_row_id, 3); ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(2)); ASSERT_EQ(global_row_id, 5); - ASSERT_OK_AND_ASSIGN(auto batch2, paimon::test::ReadResultCollector::CollectResultOneBatch( - parquet_batch_reader.get())); + ASSERT_OK_AND_ASSIGN( + std::shared_ptr batch2, + paimon::test::ReadResultCollector::CollectResultOneBatch(parquet_batch_reader.get())); ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); ASSERT_EQ(global_row_id, 6); ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(1)); @@ -1154,15 +1160,17 @@ TEST_F(ParquetFileBatchReaderTest, TestRowMappingSetReadSchemaTwice) { uint64_t global_row_id = 0; ASSERT_NOK(parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); - ASSERT_OK_AND_ASSIGN(auto batch1, paimon::test::ReadResultCollector::CollectResultOneBatch( - parquet_batch_reader.get())); + ASSERT_OK_AND_ASSIGN( + std::shared_ptr batch1, + paimon::test::ReadResultCollector::CollectResultOneBatch(parquet_batch_reader.get())); ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); ASSERT_EQ(global_row_id, 1); ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(1)); ASSERT_EQ(global_row_id, 2); - ASSERT_OK_AND_ASSIGN(auto batch2, paimon::test::ReadResultCollector::CollectResultOneBatch( - parquet_batch_reader.get())); + ASSERT_OK_AND_ASSIGN( + std::shared_ptr batch2, + paimon::test::ReadResultCollector::CollectResultOneBatch(parquet_batch_reader.get())); ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); ASSERT_EQ(global_row_id, 3); ASSERT_NOK(parquet_batch_reader->GetPreviousBatchGlobalRowId(1)); @@ -1177,8 +1185,9 @@ TEST_F(ParquetFileBatchReaderTest, TestRowMappingSetReadSchemaTwice) { ASSERT_OK( parquet_batch_reader->SetReadSchema(c_schema.get(), /*predicate=*/predicate, std::nullopt)); ASSERT_NOK(parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); - ASSERT_OK_AND_ASSIGN(auto batch3, paimon::test::ReadResultCollector::CollectResultOneBatch( - parquet_batch_reader.get())); + ASSERT_OK_AND_ASSIGN( + std::shared_ptr batch3, + paimon::test::ReadResultCollector::CollectResultOneBatch(parquet_batch_reader.get())); ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); ASSERT_EQ(global_row_id, 3); ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(2)); diff --git a/src/paimon/testing/utils/read_result_collector.h b/src/paimon/testing/utils/read_result_collector.h index ce16a66dc..f2ab0d1fc 100644 --- a/src/paimon/testing/utils/read_result_collector.h +++ b/src/paimon/testing/utils/read_result_collector.h @@ -165,7 +165,7 @@ class ReadResultCollector { } auto& [c_array, c_schema] = batch; assert(c_array->length > 0); - PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(auto result_array, + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr result_array, arrow::ImportArray(c_array.get(), c_schema.get())); PAIMON_ASSIGN_OR_RAISE( auto converted_array, @@ -173,7 +173,7 @@ class ReadResultCollector { if (max_data_processing_time_in_us > 0) { usleep(std::rand() % max_data_processing_time_in_us); } - PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(auto chunk_array, + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr chunk_array, arrow::ChunkedArray::Make({converted_array})); return chunk_array; } From 1336922035230a416652a8fdd81ef936449f8c94 Mon Sep 17 00:00:00 2001 From: zhouhongfeng Date: Mon, 29 Jun 2026 10:12:41 +0800 Subject: [PATCH 10/27] style: rename interfaces and parameters --- .../map_shared_shredding_file_reader.cpp | 4 +- .../map_shared_shredding_file_reader.h | 2 +- .../bitmap/apply_bitmap_index_batch_reader.h | 6 +- .../reader/delegating_prefetch_reader.h | 4 +- .../prefetch_file_batch_reader_impl.cpp | 4 +- .../reader/prefetch_file_batch_reader_impl.h | 2 +- .../prefetch_file_batch_reader_impl_test.cpp | 66 ++++++------- .../apply_deletion_vector_batch_reader.h | 8 +- .../complete_row_tracking_fields_reader.cpp | 6 +- .../io/complete_row_tracking_fields_reader.h | 4 +- src/paimon/core/io/field_mapping_reader.h | 4 +- .../io/key_value_data_file_record_reader.cpp | 2 +- .../format/avro/avro_file_batch_reader.h | 2 +- .../avro/avro_file_batch_reader_test.cpp | 20 ++-- .../format/blob/blob_file_batch_reader.h | 4 +- .../blob/blob_file_batch_reader_test.cpp | 12 +-- .../format/lance/lance_file_batch_reader.h | 4 +- .../lance/lance_format_reader_writer_test.cpp | 12 +-- src/paimon/format/orc/orc_file_batch_reader.h | 2 +- .../format/orc/orc_file_batch_reader_test.cpp | 14 +-- .../parquet/parquet_file_batch_reader.h | 2 +- .../parquet_file_batch_reader_test.cpp | 96 +++++++++---------- .../testing/mock/mock_file_batch_reader.h | 2 +- 23 files changed, 141 insertions(+), 141 deletions(-) diff --git a/src/paimon/common/data/shredding/map_shared_shredding_file_reader.cpp b/src/paimon/common/data/shredding/map_shared_shredding_file_reader.cpp index 435f21f53..71deb1114 100644 --- a/src/paimon/common/data/shredding/map_shared_shredding_file_reader.cpp +++ b/src/paimon/common/data/shredding/map_shared_shredding_file_reader.cpp @@ -391,9 +391,9 @@ void MapSharedShreddingFileReader::Close() { reader_->Close(); } -Result MapSharedShreddingFileReader::GetPreviousBatchGlobalRowId( +Result MapSharedShreddingFileReader::GetPreviousBatchFileRowId( uint64_t batch_row_id) const { - return reader_->GetPreviousBatchGlobalRowId(batch_row_id); + return reader_->GetPreviousBatchFileRowId(batch_row_id); } Result MapSharedShreddingFileReader::GetNumberOfRows() const { diff --git a/src/paimon/common/data/shredding/map_shared_shredding_file_reader.h b/src/paimon/common/data/shredding/map_shared_shredding_file_reader.h index 74359cf36..46f053517 100644 --- a/src/paimon/common/data/shredding/map_shared_shredding_file_reader.h +++ b/src/paimon/common/data/shredding/map_shared_shredding_file_reader.h @@ -60,7 +60,7 @@ class MapSharedShreddingFileReader : public FileBatchReader { void Close() override; - Result GetPreviousBatchGlobalRowId(uint64_t batch_row_id) const override; + Result GetPreviousBatchFileRowId(uint64_t batch_row_id) const override; Result GetNumberOfRows() const override; diff --git a/src/paimon/common/file_index/bitmap/apply_bitmap_index_batch_reader.h b/src/paimon/common/file_index/bitmap/apply_bitmap_index_batch_reader.h index 8213c4f2a..e8428eaf3 100644 --- a/src/paimon/common/file_index/bitmap/apply_bitmap_index_batch_reader.h +++ b/src/paimon/common/file_index/bitmap/apply_bitmap_index_batch_reader.h @@ -80,8 +80,8 @@ class ApplyBitmapIndexBatchReader : public FileBatchReader { return Status::Invalid("ApplyBitmapIndexBatchReader does not support SetReadSchema"); } - Result GetPreviousBatchGlobalRowId(uint64_t batch_row_id) const override { - return reader_->GetPreviousBatchGlobalRowId(batch_row_id); + Result GetPreviousBatchFileRowId(uint64_t batch_row_id) const override { + return reader_->GetPreviousBatchFileRowId(batch_row_id); } Result GetNumberOfRows() const override { @@ -96,7 +96,7 @@ class ApplyBitmapIndexBatchReader : public FileBatchReader { Result Filter(int32_t batch_size) const { RoaringBitmap32 is_valid; for (int32_t i = 0; i < batch_size; ++i) { - PAIMON_ASSIGN_OR_RAISE(uint64_t global_row_id, reader_->GetPreviousBatchGlobalRowId(i)); + PAIMON_ASSIGN_OR_RAISE(uint64_t global_row_id, reader_->GetPreviousBatchFileRowId(i)); if (bitmap_.Contains(global_row_id)) { is_valid.Add(i); } diff --git a/src/paimon/common/reader/delegating_prefetch_reader.h b/src/paimon/common/reader/delegating_prefetch_reader.h index 64d57a155..0a22f8263 100644 --- a/src/paimon/common/reader/delegating_prefetch_reader.h +++ b/src/paimon/common/reader/delegating_prefetch_reader.h @@ -54,8 +54,8 @@ class DelegatingPrefetchReader : public FileBatchReader { return prefetch_reader_->SetReadSchema(read_schema, predicate, selection_bitmap); } - Result GetPreviousBatchGlobalRowId(uint64_t batch_row_id) const override { - return GetReader()->GetPreviousBatchGlobalRowId(batch_row_id); + Result GetPreviousBatchFileRowId(uint64_t batch_row_id) const override { + return GetReader()->GetPreviousBatchFileRowId(batch_row_id); } Result GetNumberOfRows() const override { diff --git a/src/paimon/common/reader/prefetch_file_batch_reader_impl.cpp b/src/paimon/common/reader/prefetch_file_batch_reader_impl.cpp index d22b1d318..dc8a9ad5f 100644 --- a/src/paimon/common/reader/prefetch_file_batch_reader_impl.cpp +++ b/src/paimon/common/reader/prefetch_file_batch_reader_impl.cpp @@ -432,7 +432,7 @@ Status PrefetchFileBatchReaderImpl::HandleReadResult( global_row_ids.reserve(c_array->length); for (int64_t i = 0; i < c_array->length; ++i) { PAIMON_ASSIGN_OR_RAISE(uint64_t global_row_id, - readers_[reader_idx]->GetPreviousBatchGlobalRowId(i)); + readers_[reader_idx]->GetPreviousBatchFileRowId(i)); global_row_ids.push_back(global_row_id); } if (global_row_ids.empty()) { @@ -598,7 +598,7 @@ Result> PrefetchFileBatchReaderImpl::GetFileSchem return readers_[0]->GetFileSchema(); } -Result PrefetchFileBatchReaderImpl::GetPreviousBatchGlobalRowId( +Result PrefetchFileBatchReaderImpl::GetPreviousBatchFileRowId( uint64_t batch_row_id) const { if (current_batch_global_row_ids_.size() == 0) { return Status::Invalid( diff --git a/src/paimon/common/reader/prefetch_file_batch_reader_impl.h b/src/paimon/common/reader/prefetch_file_batch_reader_impl.h index 908e06b85..19f936c8f 100644 --- a/src/paimon/common/reader/prefetch_file_batch_reader_impl.h +++ b/src/paimon/common/reader/prefetch_file_batch_reader_impl.h @@ -76,7 +76,7 @@ class PrefetchFileBatchReaderImpl : public PrefetchFileBatchReader { const std::optional& selection_bitmap) override; Status SeekToRow(uint64_t row_number) override; - Result GetPreviousBatchGlobalRowId(uint64_t batch_row_id) const override; + Result GetPreviousBatchFileRowId(uint64_t batch_row_id) const override; Result GetNumberOfRows() const override; uint64_t GetNextRowToRead() const override; void Close() override; diff --git a/src/paimon/common/reader/prefetch_file_batch_reader_impl_test.cpp b/src/paimon/common/reader/prefetch_file_batch_reader_impl_test.cpp index 350854900..c5d60f322 100644 --- a/src/paimon/common/reader/prefetch_file_batch_reader_impl_test.cpp +++ b/src/paimon/common/reader/prefetch_file_batch_reader_impl_test.cpp @@ -284,7 +284,7 @@ TEST_F(PrefetchFileBatchReaderImplTest, TestSimple) { /*enable_adaptive_prefetch_strategy=*/false, executor_, /*initialize_read_ranges=*/true, /*prefetch_cache_mode=*/PrefetchCacheMode::ALWAYS, CacheConfig(), GetDefaultPool())); - ASSERT_NOK(reader->GetPreviousBatchGlobalRowId(0)); + ASSERT_NOK(reader->GetPreviousBatchFileRowId(0)); ASSERT_OK_AND_ASSIGN(auto result_array, ReadResultCollector::CollectResult( reader.get(), /*max simulated data processing time*/ 100)); @@ -605,11 +605,11 @@ TEST_F(PrefetchFileBatchReaderImplTest, TestReadWithLargeBatchSize) { prefetch_max_parallel_num * 2, /*enable_adaptive_prefetch_strategy=*/false, executor_, /*initialize_read_ranges=*/true, /*prefetch_cache_mode=*/PrefetchCacheMode::ALWAYS, CacheConfig(), GetDefaultPool())); - ASSERT_NOK(reader->GetPreviousBatchGlobalRowId(0)); + ASSERT_NOK(reader->GetPreviousBatchFileRowId(0)); ASSERT_OK_AND_ASSIGN(auto result_array, ReadResultCollector::CollectResult( reader.get(), /*max simulated data processing time*/ 100)); - ASSERT_NOK(reader->GetPreviousBatchGlobalRowId(0)); + ASSERT_NOK(reader->GetPreviousBatchFileRowId(0)); auto expected_array = std::make_shared(data_array); ASSERT_TRUE(result_array->Equals(expected_array)); } @@ -633,13 +633,13 @@ TEST_F(PrefetchFileBatchReaderImplTest, TestPartialReaderSuccessRead) { } arrow::ArrayVector result_array_vector; - ASSERT_NOK(reader->GetPreviousBatchGlobalRowId(0)); + ASSERT_NOK(reader->GetPreviousBatchFileRowId(0)); ASSERT_OK_AND_ASSIGN(auto batch_with_bitmap, reader->NextBatchWithBitmap()); auto& [batch, bitmap] = batch_with_bitmap; ASSERT_EQ(batch.first->length, bitmap.Cardinality()); - uint64_t global_row_id = 0; - ASSERT_OK_AND_ASSIGN(global_row_id, reader->GetPreviousBatchGlobalRowId(0)); - ASSERT_EQ(global_row_id, 0); + uint64_t file_row_id = 0; + ASSERT_OK_AND_ASSIGN(file_row_id, reader->GetPreviousBatchFileRowId(0)); + ASSERT_EQ(file_row_id, 0); ASSERT_OK_AND_ASSIGN(auto array, ReadResultCollector::GetArray(std::move(batch))); result_array_vector.push_back(array); ASSERT_OK(prefetch_reader->GetReadStatus()); @@ -680,9 +680,9 @@ TEST_F(PrefetchFileBatchReaderImplTest, TestAllReaderFailedWithIOError) { ->SetNextBatchStatus(Status::IOError("mock error")); } - ASSERT_NOK(reader->GetPreviousBatchGlobalRowId(0)); + ASSERT_NOK(reader->GetPreviousBatchFileRowId(0)); auto batch_result = reader->NextBatchWithBitmap(); - ASSERT_NOK(reader->GetPreviousBatchGlobalRowId(0)); + ASSERT_NOK(reader->GetPreviousBatchFileRowId(0)); ASSERT_FALSE(batch_result.ok()); ASSERT_TRUE(batch_result.status().IsIOError()); ASSERT_FALSE(prefetch_reader->is_shutdown_); @@ -691,7 +691,7 @@ TEST_F(PrefetchFileBatchReaderImplTest, TestAllReaderFailedWithIOError) { // call NextBatch again, will still return error status auto batch_result2 = reader->NextBatchWithBitmap(); - ASSERT_NOK(reader->GetPreviousBatchGlobalRowId(0)); + ASSERT_NOK(reader->GetPreviousBatchFileRowId(0)); ASSERT_FALSE(batch_result2.ok()); ASSERT_TRUE(batch_result2.status().IsIOError()); } @@ -708,11 +708,11 @@ TEST_F(PrefetchFileBatchReaderImplTest, TestPrefetchWithEmptyData) { prefetch_max_parallel_num * 2, /*enable_adaptive_prefetch_strategy=*/false, executor_, /*initialize_read_ranges=*/true, /*prefetch_cache_mode=*/PrefetchCacheMode::ALWAYS, CacheConfig(), GetDefaultPool())); - ASSERT_NOK(reader->GetPreviousBatchGlobalRowId(0)); + ASSERT_NOK(reader->GetPreviousBatchFileRowId(0)); ASSERT_OK_AND_ASSIGN(auto result_array, ReadResultCollector::CollectResult( reader.get(), /*max simulated data processing time*/ 100)); - ASSERT_NOK(reader->GetPreviousBatchGlobalRowId(0)); + ASSERT_NOK(reader->GetPreviousBatchFileRowId(0)); ASSERT_FALSE(result_array); } @@ -728,7 +728,7 @@ TEST_F(PrefetchFileBatchReaderImplTest, TestCallNextBatchAfterReadingEof) { prefetch_max_parallel_num * 2, /*enable_adaptive_prefetch_strategy=*/false, executor_, /*initialize_read_ranges=*/true, /*prefetch_cache_mode=*/PrefetchCacheMode::ALWAYS, CacheConfig(), GetDefaultPool())); - ASSERT_NOK(reader->GetPreviousBatchGlobalRowId(0)); + ASSERT_NOK(reader->GetPreviousBatchFileRowId(0)); ASSERT_OK_AND_ASSIGN(auto result_array, ReadResultCollector::CollectResult( reader.get(), /*max simulated data processing time*/ 100)); @@ -737,7 +737,7 @@ TEST_F(PrefetchFileBatchReaderImplTest, TestCallNextBatchAfterReadingEof) { // continue to call NextBatch() after reading eof ASSERT_OK_AND_ASSIGN(auto batch_with_bitmap, reader->NextBatchWithBitmap()); - ASSERT_NOK(reader->GetPreviousBatchGlobalRowId(0)); + ASSERT_NOK(reader->GetPreviousBatchFileRowId(0)); ASSERT_TRUE(BatchReader::IsEofBatch(batch_with_bitmap)); } @@ -834,11 +834,11 @@ TEST_P(PrefetchFileBatchReaderImplTest, TestPrefetchWithPredicatePushdownWithCom PreparePrefetchReader(file_format, schema.get(), predicate, /*selection_bitmap=*/std::nullopt, /*batch_size=*/10, /*prefetch_max_parallel_num=*/3, cache_mode); - ASSERT_NOK(reader->GetPreviousBatchGlobalRowId(0)); + ASSERT_NOK(reader->GetPreviousBatchFileRowId(0)); ASSERT_OK_AND_ASSIGN(auto result_array, ReadResultCollector::CollectResult( reader.get(), /*max simulated data processing time*/ 100)); - ASSERT_NOK(reader->GetPreviousBatchGlobalRowId(0)); + ASSERT_NOK(reader->GetPreviousBatchFileRowId(0)); arrow::ArrayVector expected_array_vector; expected_array_vector.push_back(data_array->Slice(0, 30)); @@ -870,11 +870,11 @@ TEST_P(PrefetchFileBatchReaderImplTest, /*selection_bitmap=*/std::nullopt, /*batch_size=*/10, /*prefetch_max_parallel_num=*/3, cache_mode); ASSERT_OK(reader->RefreshReadRanges()); - ASSERT_NOK(reader->GetPreviousBatchGlobalRowId(0)); + ASSERT_NOK(reader->GetPreviousBatchFileRowId(0)); ASSERT_OK_AND_ASSIGN(auto result_array, ReadResultCollector::CollectResult( reader.get(), /*max simulated data processing time*/ 100)); - ASSERT_NOK(reader->GetPreviousBatchGlobalRowId(0)); + ASSERT_NOK(reader->GetPreviousBatchFileRowId(0)); arrow::ArrayVector expected_array_vector; expected_array_vector.push_back(data_array->Slice(0, 20)); @@ -932,21 +932,21 @@ TEST_P(PrefetchFileBatchReaderImplTest, TestRowMapping) { PreparePrefetchReader(file_format, schema.get(), predicate, /*selection_bitmap=*/std::nullopt, /*batch_size=*/10, /*prefetch_max_parallel_num=*/3, cache_mode); - uint64_t global_row_id = 0; - ASSERT_NOK(reader->GetPreviousBatchGlobalRowId(0)); + uint64_t file_row_id = 0; + ASSERT_NOK(reader->GetPreviousBatchFileRowId(0)); ASSERT_OK_AND_ASSIGN(std::shared_ptr batch, paimon::test::ReadResultCollector::CollectResultOneBatch(reader.get())); - ASSERT_OK_AND_ASSIGN(global_row_id, reader->GetPreviousBatchGlobalRowId(0)); - ASSERT_EQ(global_row_id, 20); - ASSERT_OK_AND_ASSIGN(global_row_id, reader->GetPreviousBatchGlobalRowId(5)); - ASSERT_EQ(global_row_id, 25); + ASSERT_OK_AND_ASSIGN(file_row_id, reader->GetPreviousBatchFileRowId(0)); + ASSERT_EQ(file_row_id, 20); + ASSERT_OK_AND_ASSIGN(file_row_id, reader->GetPreviousBatchFileRowId(5)); + ASSERT_EQ(file_row_id, 25); ASSERT_OK_AND_ASSIGN(batch, paimon::test::ReadResultCollector::CollectResultOneBatch(reader.get())); - ASSERT_OK_AND_ASSIGN(global_row_id, reader->GetPreviousBatchGlobalRowId(0)); - ASSERT_EQ(global_row_id, 70); - ASSERT_OK_AND_ASSIGN(global_row_id, reader->GetPreviousBatchGlobalRowId(5)); - ASSERT_EQ(global_row_id, 75); + ASSERT_OK_AND_ASSIGN(file_row_id, reader->GetPreviousBatchFileRowId(0)); + ASSERT_EQ(file_row_id, 70); + ASSERT_OK_AND_ASSIGN(file_row_id, reader->GetPreviousBatchFileRowId(5)); + ASSERT_EQ(file_row_id, 75); // Set read schema again std::unique_ptr c_schema = std::make_unique(); @@ -955,15 +955,15 @@ TEST_P(PrefetchFileBatchReaderImplTest, TestRowMapping) { Literal(30l), Literal(49l)); ASSERT_OK(reader->SetReadSchema(c_schema.get(), predicate, std::nullopt)); - ASSERT_NOK(reader->GetPreviousBatchGlobalRowId(0)); + ASSERT_NOK(reader->GetPreviousBatchFileRowId(0)); ASSERT_OK_AND_ASSIGN(batch, paimon::test::ReadResultCollector::CollectResultOneBatch(reader.get())); - ASSERT_OK_AND_ASSIGN(global_row_id, reader->GetPreviousBatchGlobalRowId(0)); - ASSERT_EQ(global_row_id, 30); + ASSERT_OK_AND_ASSIGN(file_row_id, reader->GetPreviousBatchFileRowId(0)); + ASSERT_EQ(file_row_id, 30); ASSERT_OK_AND_ASSIGN(batch, paimon::test::ReadResultCollector::CollectResultOneBatch(reader.get())); - ASSERT_OK_AND_ASSIGN(global_row_id, reader->GetPreviousBatchGlobalRowId(5)); - ASSERT_EQ(global_row_id, 45); + ASSERT_OK_AND_ASSIGN(file_row_id, reader->GetPreviousBatchFileRowId(5)); + ASSERT_EQ(file_row_id, 45); } } // namespace paimon::test diff --git a/src/paimon/core/deletionvectors/apply_deletion_vector_batch_reader.h b/src/paimon/core/deletionvectors/apply_deletion_vector_batch_reader.h index cb2867075..48fec8b31 100644 --- a/src/paimon/core/deletionvectors/apply_deletion_vector_batch_reader.h +++ b/src/paimon/core/deletionvectors/apply_deletion_vector_batch_reader.h @@ -82,8 +82,8 @@ class ApplyDeletionVectorBatchReader : public FileBatchReader { return Status::Invalid("ApplyDeletionVectorBatchReader does not support SetReadSchema"); } - Result GetPreviousBatchGlobalRowId(uint64_t batch_row_id) const override { - return reader_->GetPreviousBatchGlobalRowId(batch_row_id); + Result GetPreviousBatchFileRowId(uint64_t batch_row_id) const override { + return reader_->GetPreviousBatchFileRowId(batch_row_id); } Result GetNumberOfRows() const override { @@ -98,8 +98,8 @@ class ApplyDeletionVectorBatchReader : public FileBatchReader { Result Filter(int32_t batch_size) const { RoaringBitmap32 is_valid; for (int32_t i = 0; i < batch_size; ++i) { - PAIMON_ASSIGN_OR_RAISE(uint64_t global_row_id, reader_->GetPreviousBatchGlobalRowId(i)); - PAIMON_ASSIGN_OR_RAISE(bool is_deleted, deletion_vector_->IsDeleted(global_row_id)); + PAIMON_ASSIGN_OR_RAISE(uint64_t file_row_id, reader_->GetPreviousBatchFileRowId(i)); + PAIMON_ASSIGN_OR_RAISE(bool is_deleted, deletion_vector_->IsDeleted(file_row_id)); if (!is_deleted) { is_valid.Add(i); } diff --git a/src/paimon/core/io/complete_row_tracking_fields_reader.cpp b/src/paimon/core/io/complete_row_tracking_fields_reader.cpp index 54e55b380..29b610b16 100644 --- a/src/paimon/core/io/complete_row_tracking_fields_reader.cpp +++ b/src/paimon/core/io/complete_row_tracking_fields_reader.cpp @@ -91,9 +91,9 @@ CompleteRowTrackingFieldsBatchReader::NextBatchWithBitmap() { return Status::Invalid( "unexpected: read _ROW_ID special field, but first row id is null in meta"); } - PAIMON_ASSIGN_OR_RAISE(uint64_t global_row_id, - reader_->GetPreviousBatchGlobalRowId(idx_in_array)); - return first_row_id_.value() + global_row_id; + PAIMON_ASSIGN_OR_RAISE(uint64_t file_row_id, + reader_->GetPreviousBatchFileRowId(idx_in_array)); + return first_row_id_.value() + file_row_id; }; PAIMON_RETURN_NOT_OK(ConvertRowTrackingField(src_struct_array->length(), /*init_value=*/0, row_id_convert_func, &row_id_array)); diff --git a/src/paimon/core/io/complete_row_tracking_fields_reader.h b/src/paimon/core/io/complete_row_tracking_fields_reader.h index b3a42cb69..2aa535d79 100644 --- a/src/paimon/core/io/complete_row_tracking_fields_reader.h +++ b/src/paimon/core/io/complete_row_tracking_fields_reader.h @@ -60,8 +60,8 @@ class CompleteRowTrackingFieldsBatchReader : public FileBatchReader { reader_->Close(); } - Result GetPreviousBatchGlobalRowId(uint64_t batch_row_id) const override { - return reader_->GetPreviousBatchGlobalRowId(batch_row_id); + Result GetPreviousBatchFileRowId(uint64_t batch_row_id) const override { + return reader_->GetPreviousBatchFileRowId(batch_row_id); } Result GetNumberOfRows() const override { diff --git a/src/paimon/core/io/field_mapping_reader.h b/src/paimon/core/io/field_mapping_reader.h index 71d715b89..9efd4a07c 100644 --- a/src/paimon/core/io/field_mapping_reader.h +++ b/src/paimon/core/io/field_mapping_reader.h @@ -77,8 +77,8 @@ class FieldMappingReader : public FileBatchReader { return Status::Invalid("FieldMappingReader does not support SetReadSchema"); } - Result GetPreviousBatchGlobalRowId(uint64_t batch_row_id) const override { - return reader_->GetPreviousBatchGlobalRowId(batch_row_id); + Result GetPreviousBatchFileRowId(uint64_t batch_row_id) const override { + return reader_->GetPreviousBatchFileRowId(batch_row_id); } Result GetNumberOfRows() const override { diff --git a/src/paimon/core/io/key_value_data_file_record_reader.cpp b/src/paimon/core/io/key_value_data_file_record_reader.cpp index 9ccc71486..8d12ec746 100644 --- a/src/paimon/core/io/key_value_data_file_record_reader.cpp +++ b/src/paimon/core/io/key_value_data_file_record_reader.cpp @@ -82,7 +82,7 @@ Result KeyValueDataFileRecordReader::Iterator::Next() { Result> KeyValueDataFileRecordReader::Iterator::NextWithFilePos() { PAIMON_ASSIGN_OR_RAISE(KeyValue kv, Next()); PAIMON_ASSIGN_OR_RAISE(uint64_t global_row_id, - reader_->reader_->GetPreviousBatchGlobalRowId(cursor_ - 1)); + reader_->reader_->GetPreviousBatchFileRowId(cursor_ - 1)); return std::make_pair(static_cast(global_row_id), std::move(kv)); } diff --git a/src/paimon/format/avro/avro_file_batch_reader.h b/src/paimon/format/avro/avro_file_batch_reader.h index 463264f39..a90d82c5a 100644 --- a/src/paimon/format/avro/avro_file_batch_reader.h +++ b/src/paimon/format/avro/avro_file_batch_reader.h @@ -45,7 +45,7 @@ class AvroFileBatchReader : public FileBatchReader { Status SetReadSchema(::ArrowSchema* read_schema, const std::shared_ptr& predicate, const std::optional& selection_bitmap) override; - Result GetPreviousBatchGlobalRowId(uint64_t batch_row_id) const override { + Result GetPreviousBatchFileRowId(uint64_t batch_row_id) const override { return previous_first_row_ + batch_row_id; } diff --git a/src/paimon/format/avro/avro_file_batch_reader_test.cpp b/src/paimon/format/avro/avro_file_batch_reader_test.cpp index c3970bf33..2e1e00c42 100644 --- a/src/paimon/format/avro/avro_file_batch_reader_test.cpp +++ b/src/paimon/format/avro/avro_file_batch_reader_test.cpp @@ -327,7 +327,7 @@ TEST_F(AvroFileBatchReaderTest, TestSetReadSchemaRejectNestedSubFieldProjection) "does not support nested sub-field projection"); } -TEST_F(AvroFileBatchReaderTest, TestGetPreviousBatchGlobalRowId) { +TEST_F(AvroFileBatchReaderTest, TestGetPreviousBatchFileRowId) { std::string path = paimon::test::GetDataDir() + "/avro/append_simple.db/" "append_simple/bucket-0/" @@ -352,25 +352,25 @@ TEST_F(AvroFileBatchReaderTest, TestGetPreviousBatchGlobalRowId) { ASSERT_OK_AND_ASSIGN(auto num_rows, reader->GetNumberOfRows()); ASSERT_EQ(4, num_rows); - ASSERT_EQ(std::numeric_limits::max(), reader->GetPreviousBatchGlobalRowId(0).value()); + ASSERT_EQ(std::numeric_limits::max(), reader->GetPreviousBatchFileRowId(0).value()); ASSERT_OK_AND_ASSIGN(auto batch1, reader->NextBatch()); ArrowArrayRelease(batch1.first.get()); ArrowSchemaRelease(batch1.second.get()); - ASSERT_EQ(0, reader->GetPreviousBatchGlobalRowId(0).value()); + ASSERT_EQ(0, reader->GetPreviousBatchFileRowId(0).value()); ASSERT_OK_AND_ASSIGN(auto batch2, reader->NextBatch()); - ASSERT_EQ(1, reader->GetPreviousBatchGlobalRowId(0).value()); + ASSERT_EQ(1, reader->GetPreviousBatchFileRowId(0).value()); ArrowArrayRelease(batch2.first.get()); ArrowSchemaRelease(batch2.second.get()); ASSERT_OK_AND_ASSIGN(auto batch3, reader->NextBatch()); - ASSERT_EQ(2, reader->GetPreviousBatchGlobalRowId(0).value()); + ASSERT_EQ(2, reader->GetPreviousBatchFileRowId(0).value()); ArrowArrayRelease(batch3.first.get()); ArrowSchemaRelease(batch3.second.get()); ASSERT_OK_AND_ASSIGN(auto batch4, reader->NextBatch()); - ASSERT_EQ(3, reader->GetPreviousBatchGlobalRowId(0).value()); + ASSERT_EQ(3, reader->GetPreviousBatchFileRowId(0).value()); ArrowArrayRelease(batch4.first.get()); ArrowSchemaRelease(batch4.second.get()); ASSERT_OK_AND_ASSIGN(auto batch5, reader->NextBatch()); - ASSERT_EQ(4, reader->GetPreviousBatchGlobalRowId(0).value()); + ASSERT_EQ(4, reader->GetPreviousBatchFileRowId(0).value()); ASSERT_TRUE(BatchReader::IsEofBatch(batch5)); } @@ -396,7 +396,7 @@ TEST_F(AvroFileBatchReaderTest, TestSetReadSchemaResetsReaderToFirstRow) { ASSERT_OK_AND_ASSIGN(auto reader, reader_builder->Build(in)); ASSERT_OK_AND_ASSIGN(auto first_batch, reader->NextBatch()); - ASSERT_EQ(0, reader->GetPreviousBatchGlobalRowId(0).value()); + ASSERT_EQ(0, reader->GetPreviousBatchFileRowId(0).value()); auto first_array = arrow::ImportArray(first_batch.first.get(), first_batch.second.get()).ValueOrDie(); ASSERT_TRUE(first_array->Equals(src_array->Slice(0, 2))) << first_array->ToString(); @@ -406,10 +406,10 @@ TEST_F(AvroFileBatchReaderTest, TestSetReadSchemaResetsReaderToFirstRow) { ASSERT_TRUE(arrow::ExportSchema(*read_schema, c_schema.get()).ok()); ASSERT_OK(reader->SetReadSchema(c_schema.get(), /*predicate=*/nullptr, /*selection_bitmap=*/std::nullopt)); - ASSERT_EQ(std::numeric_limits::max(), reader->GetPreviousBatchGlobalRowId(0).value()); + ASSERT_EQ(std::numeric_limits::max(), reader->GetPreviousBatchFileRowId(0).value()); ASSERT_OK_AND_ASSIGN(auto projected_batch, reader->NextBatch()); - ASSERT_EQ(0, reader->GetPreviousBatchGlobalRowId(0).value()); + ASSERT_EQ(0, reader->GetPreviousBatchFileRowId(0).value()); auto projected_array = arrow::ImportArray(projected_batch.first.get(), projected_batch.second.get()).ValueOrDie(); auto expected_projected_array = arrow::ipc::internal::json::ArrayFromJSON( diff --git a/src/paimon/format/blob/blob_file_batch_reader.h b/src/paimon/format/blob/blob_file_batch_reader.h index c4619aff5..06afe6acc 100644 --- a/src/paimon/format/blob/blob_file_batch_reader.h +++ b/src/paimon/format/blob/blob_file_batch_reader.h @@ -97,10 +97,10 @@ class BlobFileBatchReader : public FileBatchReader { Result NextBatch() override; - Result GetPreviousBatchGlobalRowId(uint64_t batch_row_id) const override { + Result GetPreviousBatchFileRowId(uint64_t batch_row_id) const override { if (all_blob_lengths_.size() != target_blob_lengths_.size()) { return Status::NotImplemented( - "Cannot call GetPreviousBatchGlobalRowId in BlobFileBatchReader because, after " + "Cannot call GetPreviousBatchFileRowId in BlobFileBatchReader because, after " "bitmap pushdown, rows in the array returned by NextBatch are no longer " "contiguous."); } diff --git a/src/paimon/format/blob/blob_file_batch_reader_test.cpp b/src/paimon/format/blob/blob_file_batch_reader_test.cpp index afee869b5..0cfe9351f 100644 --- a/src/paimon/format/blob/blob_file_batch_reader_test.cpp +++ b/src/paimon/format/blob/blob_file_batch_reader_test.cpp @@ -169,21 +169,21 @@ TEST_F(BlobFileBatchReaderTest, TestRowNumbers) { ASSERT_OK(reader->SetReadSchema(&c_schema, nullptr, std::nullopt)); ASSERT_OK_AND_ASSIGN(auto number_of_rows, reader->GetNumberOfRows()); ASSERT_EQ(3, number_of_rows); - ASSERT_EQ(std::numeric_limits::max(), reader->GetPreviousBatchGlobalRowId(0).value()); + ASSERT_EQ(std::numeric_limits::max(), reader->GetPreviousBatchFileRowId(0).value()); ASSERT_OK_AND_ASSIGN(auto batch1, reader->NextBatch()); ArrowArrayRelease(batch1.first.get()); ArrowSchemaRelease(batch1.second.get()); - ASSERT_EQ(0, reader->GetPreviousBatchGlobalRowId(0).value()); + ASSERT_EQ(0, reader->GetPreviousBatchFileRowId(0).value()); ASSERT_OK_AND_ASSIGN(auto batch2, reader->NextBatch()); - ASSERT_EQ(1, reader->GetPreviousBatchGlobalRowId(0).value()); + ASSERT_EQ(1, reader->GetPreviousBatchFileRowId(0).value()); ArrowArrayRelease(batch2.first.get()); ArrowSchemaRelease(batch2.second.get()); ASSERT_OK_AND_ASSIGN(auto batch3, reader->NextBatch()); - ASSERT_EQ(2, reader->GetPreviousBatchGlobalRowId(0).value()); + ASSERT_EQ(2, reader->GetPreviousBatchFileRowId(0).value()); ArrowArrayRelease(batch3.first.get()); ArrowSchemaRelease(batch3.second.get()); ASSERT_OK_AND_ASSIGN(auto batch4, reader->NextBatch()); - ASSERT_EQ(3, reader->GetPreviousBatchGlobalRowId(0).value()); + ASSERT_EQ(3, reader->GetPreviousBatchFileRowId(0).value()); ASSERT_TRUE(BatchReader::IsEofBatch(batch4)); } @@ -254,7 +254,7 @@ TEST_P(BlobFileBatchReaderTest, EmptyFile) { ASSERT_OK(reader->SetReadSchema(&c_schema, nullptr, std::nullopt)); ASSERT_OK_AND_ASSIGN(auto number_of_rows, reader->GetNumberOfRows()); ASSERT_EQ(0, number_of_rows); - ASSERT_EQ(std::numeric_limits::max(), reader->GetPreviousBatchGlobalRowId(0).value()); + ASSERT_EQ(std::numeric_limits::max(), reader->GetPreviousBatchFileRowId(0).value()); ASSERT_OK_AND_ASSIGN(auto batch, reader->NextBatch()); ASSERT_TRUE(BatchReader::IsEofBatch(batch)); } diff --git a/src/paimon/format/lance/lance_file_batch_reader.h b/src/paimon/format/lance/lance_file_batch_reader.h index cfd899c76..c7ce9a4f4 100644 --- a/src/paimon/format/lance/lance_file_batch_reader.h +++ b/src/paimon/format/lance/lance_file_batch_reader.h @@ -41,11 +41,11 @@ class LanceFileBatchReader : public FileBatchReader { Result NextBatch() override; - Result GetPreviousBatchGlobalRowId(uint64_t batch_row_id) const override { + Result GetPreviousBatchFileRowId(uint64_t batch_row_id) const override { if (!read_row_ids_.empty() && read_row_ids_.size() != num_rows_) { // TODO(xinyu.lxy): support function return Status::NotImplemented( - "Cannot call GetPreviousBatchGlobalRowId in LanceFileBatchReader because, after " + "Cannot call GetPreviousBatchFileRowId in LanceFileBatchReader because, after " "bitmap pushdown, rows in the array returned by NextBatch are no longer " "contiguous."); } diff --git a/src/paimon/format/lance/lance_format_reader_writer_test.cpp b/src/paimon/format/lance/lance_format_reader_writer_test.cpp index 920cd4b0e..061d950f9 100644 --- a/src/paimon/format/lance/lance_format_reader_writer_test.cpp +++ b/src/paimon/format/lance/lance_format_reader_writer_test.cpp @@ -478,26 +478,26 @@ TEST_F(LanceFileReaderWriterTest, TestPreviousBatchFirstRowNumber) { ASSERT_OK_AND_ASSIGN( std::unique_ptr reader, LanceFileBatchReader::Create(file_path, /*batch_size=*/4, /*batch_readahead=*/2)); - ASSERT_EQ(std::numeric_limits::max(), reader->GetPreviousBatchGlobalRowId(0).value()); + ASSERT_EQ(std::numeric_limits::max(), reader->GetPreviousBatchFileRowId(0).value()); // first batch row 0-3 ASSERT_OK_AND_ASSIGN(auto read_batch, reader->NextBatch()); ASSERT_OK_AND_ASSIGN(auto read_array, paimon::test::ReadResultCollector::GetArray(std::move(read_batch))); ASSERT_TRUE(read_array->Equals(array->Slice(0, 4))); - ASSERT_EQ(0, reader->GetPreviousBatchGlobalRowId(0).value()); + ASSERT_EQ(0, reader->GetPreviousBatchFileRowId(0).value()); // second batch 4-5 ASSERT_OK_AND_ASSIGN(read_batch, reader->NextBatch()); ASSERT_OK_AND_ASSIGN(read_array, paimon::test::ReadResultCollector::GetArray(std::move(read_batch))); ASSERT_TRUE(read_array->Equals(array->Slice(4, 2))); - ASSERT_EQ(4, reader->GetPreviousBatchGlobalRowId(0).value()); + ASSERT_EQ(4, reader->GetPreviousBatchFileRowId(0).value()); // eof ASSERT_OK_AND_ASSIGN(read_batch, reader->NextBatch()); ASSERT_TRUE(BatchReader::IsEofBatch(read_batch)); - ASSERT_EQ(6, reader->GetPreviousBatchGlobalRowId(0).value()); + ASSERT_EQ(6, reader->GetPreviousBatchFileRowId(0).value()); // test with bitmap pushdown ArrowSchema c_read_schema; @@ -505,8 +505,8 @@ TEST_F(LanceFileReaderWriterTest, TestPreviousBatchFirstRowNumber) { ASSERT_OK(reader->SetReadSchema(&c_read_schema, /*predicate=*/nullptr, /*selection_bitmap=*/RoaringBitmap32::From({0, 3}))); ASSERT_NOK_WITH_MSG( - reader->GetPreviousBatchGlobalRowId(0), - "Cannot call GetPreviousBatchGlobalRowId in LanceFileBatchReader because, after bitmap " + reader->GetPreviousBatchFileRowId(0), + "Cannot call GetPreviousBatchFileRowId in LanceFileBatchReader because, after bitmap " "pushdown, rows in the array returned by NextBatch are no longer contiguous."); } } // namespace paimon::lance::test diff --git a/src/paimon/format/orc/orc_file_batch_reader.h b/src/paimon/format/orc/orc_file_batch_reader.h index 3cd870db3..0f60c3779 100644 --- a/src/paimon/format/orc/orc_file_batch_reader.h +++ b/src/paimon/format/orc/orc_file_batch_reader.h @@ -62,7 +62,7 @@ class OrcFileBatchReader : public PrefetchFileBatchReader { // OrcFileBatchReader. Therefore, we need to hold BatchReader when using output ArrowArray. Result NextBatch() override; - Result GetPreviousBatchGlobalRowId(uint64_t batch_row_id) const override { + Result GetPreviousBatchFileRowId(uint64_t batch_row_id) const override { return reader_->GetRowNumber() + batch_row_id; } diff --git a/src/paimon/format/orc/orc_file_batch_reader_test.cpp b/src/paimon/format/orc/orc_file_batch_reader_test.cpp index b0f56cf3f..87490e0a2 100644 --- a/src/paimon/format/orc/orc_file_batch_reader_test.cpp +++ b/src/paimon/format/orc/orc_file_batch_reader_test.cpp @@ -493,10 +493,10 @@ TEST_P(OrcFileBatchReaderTest, TestNextBatchSimple) { for (auto batch_size : {1, 2, 3, 5, 8, 10}) { auto orc_batch_reader = PrepareOrcFileBatchReader(file_name, &read_schema, batch_size, natural_read_size); - ASSERT_EQ(orc_batch_reader->GetPreviousBatchGlobalRowId(0).value(), -1); + ASSERT_EQ(orc_batch_reader->GetPreviousBatchFileRowId(0).value(), -1); ASSERT_OK_AND_ASSIGN(auto result_array, paimon::test::ReadResultCollector::CollectResult( orc_batch_reader.get())); - ASSERT_EQ(orc_batch_reader->GetPreviousBatchGlobalRowId(0).value(), 8); + ASSERT_EQ(orc_batch_reader->GetPreviousBatchFileRowId(0).value(), 8); orc_batch_reader->Close(); auto expected_array = std::make_shared(struct_array_); ASSERT_TRUE(result_array->Equals(expected_array)); @@ -767,18 +767,18 @@ TEST_F(OrcFileBatchReaderTest, TestReadNoField) { auto orc_batch_reader = PrepareOrcFileBatchReader(file_name, &read_schema, /*batch_size=*/3, /*natural_read_size=*/10); // read 3 rows - ASSERT_EQ(orc_batch_reader->GetPreviousBatchGlobalRowId(0).value(), -1); + ASSERT_EQ(orc_batch_reader->GetPreviousBatchFileRowId(0).value(), -1); ASSERT_OK_AND_ASSIGN(auto batch1, orc_batch_reader->NextBatch()); - ASSERT_EQ(orc_batch_reader->GetPreviousBatchGlobalRowId(0).value(), 0); + ASSERT_EQ(orc_batch_reader->GetPreviousBatchFileRowId(0).value(), 0); // read 3 rows ASSERT_OK_AND_ASSIGN(auto batch2, orc_batch_reader->NextBatch()); - ASSERT_EQ(orc_batch_reader->GetPreviousBatchGlobalRowId(0).value(), 3); + ASSERT_EQ(orc_batch_reader->GetPreviousBatchFileRowId(0).value(), 3); // read 2 rows ASSERT_OK_AND_ASSIGN(auto batch3, orc_batch_reader->NextBatch()); - ASSERT_EQ(orc_batch_reader->GetPreviousBatchGlobalRowId(0).value(), 6); + ASSERT_EQ(orc_batch_reader->GetPreviousBatchFileRowId(0).value(), 6); // read rows with eof ASSERT_OK_AND_ASSIGN(auto batch4, orc_batch_reader->NextBatch()); - ASSERT_EQ(orc_batch_reader->GetPreviousBatchGlobalRowId(0).value(), 8); + ASSERT_EQ(orc_batch_reader->GetPreviousBatchFileRowId(0).value(), 8); ASSERT_TRUE(BatchReader::IsEofBatch(batch4)); orc_batch_reader->Close(); diff --git a/src/paimon/format/parquet/parquet_file_batch_reader.h b/src/paimon/format/parquet/parquet_file_batch_reader.h index b05345037..06237cec7 100644 --- a/src/paimon/format/parquet/parquet_file_batch_reader.h +++ b/src/paimon/format/parquet/parquet_file_batch_reader.h @@ -96,7 +96,7 @@ class ParquetFileBatchReader : public PrefetchFileBatchReader { Result>> GenReadRanges( bool* need_prefetch) const override; - Result GetPreviousBatchGlobalRowId(uint64_t batch_row_id) const override { + Result GetPreviousBatchFileRowId(uint64_t batch_row_id) const override { if (row_mapping_.size() == 0) { return Status::Invalid( "Last batch is not read or last batch is empty, cannot get previous batch global " diff --git a/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp b/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp index 530915545..c7255c209 100644 --- a/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp +++ b/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp @@ -822,24 +822,24 @@ TEST_F(ParquetFileBatchReaderTest, TestReadNoField) { auto parquet_batch_reader = PrepareParquetFileBatchReader(file_name, read_schema, /*predicate=*/nullptr, /*selection_bitmap=*/std::nullopt, /*batch_size=*/2); - uint64_t global_row_id = 0; + uint64_t file_row_id = 0; // read 2 rows - ASSERT_NOK(parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); + ASSERT_NOK(parquet_batch_reader->GetPreviousBatchFileRowId(0)); ASSERT_OK_AND_ASSIGN(auto batch1, parquet_batch_reader->NextBatch()); - ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); - ASSERT_EQ(global_row_id, 0); + ASSERT_OK_AND_ASSIGN(file_row_id, parquet_batch_reader->GetPreviousBatchFileRowId(0)); + ASSERT_EQ(file_row_id, 0); // read 2 rows ASSERT_OK_AND_ASSIGN(auto batch2, parquet_batch_reader->NextBatch()); - ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); - ASSERT_EQ(global_row_id, 2); + ASSERT_OK_AND_ASSIGN(file_row_id, parquet_batch_reader->GetPreviousBatchFileRowId(0)); + ASSERT_EQ(file_row_id, 2); // read 2 rows ASSERT_OK_AND_ASSIGN(auto batch3, parquet_batch_reader->NextBatch()); - ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); - ASSERT_EQ(global_row_id, 4); + ASSERT_OK_AND_ASSIGN(file_row_id, parquet_batch_reader->GetPreviousBatchFileRowId(0)); + ASSERT_EQ(file_row_id, 4); // read rows with eof ASSERT_OK_AND_ASSIGN(auto batch4, parquet_batch_reader->NextBatch()); ASSERT_TRUE(BatchReader::IsEofBatch(batch4)); - ASSERT_NOK(parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); + ASSERT_NOK(parquet_batch_reader->GetPreviousBatchFileRowId(0)); parquet_batch_reader->Close(); arrow::FieldVector fields; @@ -1047,19 +1047,19 @@ TEST_F(ParquetFileBatchReaderTest, TestRowMappingSimple) { auto parquet_batch_reader = PrepareParquetFileBatchReader( file_path_, arrow_schema, /*predicate=*/predicate, std::nullopt, /*batch_size=*/2); - uint64_t global_row_id = 0; - ASSERT_NOK(parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); + uint64_t file_row_id = 0; + ASSERT_NOK(parquet_batch_reader->GetPreviousBatchFileRowId(0)); ASSERT_OK_AND_ASSIGN( std::shared_ptr batch1, paimon::test::ReadResultCollector::CollectResultOneBatch(parquet_batch_reader.get())); auto expected_batch1 = src_array->Slice(1, 2); ASSERT_TRUE(batch1->chunk(0)->Equals(expected_batch1)) << batch1->ToString(); - ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); - ASSERT_EQ(global_row_id, 1); - ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(1)); - ASSERT_EQ(global_row_id, 2); + ASSERT_OK_AND_ASSIGN(file_row_id, parquet_batch_reader->GetPreviousBatchFileRowId(0)); + ASSERT_EQ(file_row_id, 1); + ASSERT_OK_AND_ASSIGN(file_row_id, parquet_batch_reader->GetPreviousBatchFileRowId(1)); + ASSERT_EQ(file_row_id, 2); // out of bound return invalid - ASSERT_NOK(parquet_batch_reader->GetPreviousBatchGlobalRowId(2)); + ASSERT_NOK(parquet_batch_reader->GetPreviousBatchFileRowId(2)); // Not adjacent pages ASSERT_OK_AND_ASSIGN( @@ -1072,10 +1072,10 @@ TEST_F(ParquetFileBatchReaderTest, TestRowMappingSimple) { ])") .ValueOrDie()); ASSERT_TRUE(batch2->chunk(0)->Equals(expected_batch2)) << batch2->ToString(); - ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); - ASSERT_EQ(global_row_id, 3); - ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(1)); - ASSERT_EQ(global_row_id, 5); + ASSERT_OK_AND_ASSIGN(file_row_id, parquet_batch_reader->GetPreviousBatchFileRowId(0)); + ASSERT_EQ(file_row_id, 3); + ASSERT_OK_AND_ASSIGN(file_row_id, parquet_batch_reader->GetPreviousBatchFileRowId(1)); + ASSERT_EQ(file_row_id, 5); // Only one record read ASSERT_OK_AND_ASSIGN( @@ -1083,16 +1083,16 @@ TEST_F(ParquetFileBatchReaderTest, TestRowMappingSimple) { paimon::test::ReadResultCollector::CollectResultOneBatch(parquet_batch_reader.get())); auto expected_batch3 = src_array->Slice(6, 1); ASSERT_TRUE(batch3->chunk(0)->Equals(expected_batch3)) << batch3->ToString(); - ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); - ASSERT_EQ(global_row_id, 6); - ASSERT_NOK(parquet_batch_reader->GetPreviousBatchGlobalRowId(1)); + ASSERT_OK_AND_ASSIGN(file_row_id, parquet_batch_reader->GetPreviousBatchFileRowId(0)); + ASSERT_EQ(file_row_id, 6); + ASSERT_NOK(parquet_batch_reader->GetPreviousBatchFileRowId(1)); ASSERT_OK_AND_ASSIGN( std::shared_ptr eof_batch, paimon::test::ReadResultCollector::CollectResultOneBatch(parquet_batch_reader.get())); ASSERT_EQ(nullptr, eof_batch); // previous batch is eof, return invalid. - ASSERT_NOK(parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); + ASSERT_NOK(parquet_batch_reader->GetPreviousBatchFileRowId(0)); } TEST_F(ParquetFileBatchReaderTest, TestRowMappingFullyAndPartially) { @@ -1119,23 +1119,23 @@ TEST_F(ParquetFileBatchReaderTest, TestRowMappingFullyAndPartially) { auto parquet_batch_reader = PrepareParquetFileBatchReader( file_path_, arrow_schema, /*predicate=*/predicate, std::nullopt, /*batch_size=*/3); - uint64_t global_row_id = 0; - ASSERT_NOK(parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); + uint64_t file_row_id = 0; + ASSERT_NOK(parquet_batch_reader->GetPreviousBatchFileRowId(0)); ASSERT_OK_AND_ASSIGN( std::shared_ptr batch1, paimon::test::ReadResultCollector::CollectResultOneBatch(parquet_batch_reader.get())); - ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); - ASSERT_EQ(global_row_id, 3); - ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(2)); - ASSERT_EQ(global_row_id, 5); + ASSERT_OK_AND_ASSIGN(file_row_id, parquet_batch_reader->GetPreviousBatchFileRowId(0)); + ASSERT_EQ(file_row_id, 3); + ASSERT_OK_AND_ASSIGN(file_row_id, parquet_batch_reader->GetPreviousBatchFileRowId(2)); + ASSERT_EQ(file_row_id, 5); ASSERT_OK_AND_ASSIGN( std::shared_ptr batch2, paimon::test::ReadResultCollector::CollectResultOneBatch(parquet_batch_reader.get())); - ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); - ASSERT_EQ(global_row_id, 6); - ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(1)); - ASSERT_EQ(global_row_id, 8); + ASSERT_OK_AND_ASSIGN(file_row_id, parquet_batch_reader->GetPreviousBatchFileRowId(0)); + ASSERT_EQ(file_row_id, 6); + ASSERT_OK_AND_ASSIGN(file_row_id, parquet_batch_reader->GetPreviousBatchFileRowId(1)); + ASSERT_EQ(file_row_id, 8); } TEST_F(ParquetFileBatchReaderTest, TestRowMappingSetReadSchemaTwice) { @@ -1158,22 +1158,22 @@ TEST_F(ParquetFileBatchReaderTest, TestRowMappingSetReadSchemaTwice) { auto parquet_batch_reader = PrepareParquetFileBatchReader( file_path_, arrow_schema, /*predicate=*/predicate, std::nullopt, /*batch_size=*/3); - uint64_t global_row_id = 0; - ASSERT_NOK(parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); + uint64_t file_row_id = 0; + ASSERT_NOK(parquet_batch_reader->GetPreviousBatchFileRowId(0)); ASSERT_OK_AND_ASSIGN( std::shared_ptr batch1, paimon::test::ReadResultCollector::CollectResultOneBatch(parquet_batch_reader.get())); - ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); - ASSERT_EQ(global_row_id, 1); - ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(1)); - ASSERT_EQ(global_row_id, 2); + ASSERT_OK_AND_ASSIGN(file_row_id, parquet_batch_reader->GetPreviousBatchFileRowId(0)); + ASSERT_EQ(file_row_id, 1); + ASSERT_OK_AND_ASSIGN(file_row_id, parquet_batch_reader->GetPreviousBatchFileRowId(1)); + ASSERT_EQ(file_row_id, 2); ASSERT_OK_AND_ASSIGN( std::shared_ptr batch2, paimon::test::ReadResultCollector::CollectResultOneBatch(parquet_batch_reader.get())); - ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); - ASSERT_EQ(global_row_id, 3); - ASSERT_NOK(parquet_batch_reader->GetPreviousBatchGlobalRowId(1)); + ASSERT_OK_AND_ASSIGN(file_row_id, parquet_batch_reader->GetPreviousBatchFileRowId(0)); + ASSERT_EQ(file_row_id, 3); + ASSERT_NOK(parquet_batch_reader->GetPreviousBatchFileRowId(1)); ASSERT_OK_AND_ASSIGN( predicate, @@ -1184,14 +1184,14 @@ TEST_F(ParquetFileBatchReaderTest, TestRowMappingSetReadSchemaTwice) { ASSERT_TRUE(arrow::ExportSchema(*arrow_schema, c_schema.get()).ok()); ASSERT_OK( parquet_batch_reader->SetReadSchema(c_schema.get(), /*predicate=*/predicate, std::nullopt)); - ASSERT_NOK(parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); + ASSERT_NOK(parquet_batch_reader->GetPreviousBatchFileRowId(0)); ASSERT_OK_AND_ASSIGN( std::shared_ptr batch3, paimon::test::ReadResultCollector::CollectResultOneBatch(parquet_batch_reader.get())); - ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(0)); - ASSERT_EQ(global_row_id, 3); - ASSERT_OK_AND_ASSIGN(global_row_id, parquet_batch_reader->GetPreviousBatchGlobalRowId(2)); - ASSERT_EQ(global_row_id, 5); + ASSERT_OK_AND_ASSIGN(file_row_id, parquet_batch_reader->GetPreviousBatchFileRowId(0)); + ASSERT_EQ(file_row_id, 3); + ASSERT_OK_AND_ASSIGN(file_row_id, parquet_batch_reader->GetPreviousBatchFileRowId(2)); + ASSERT_EQ(file_row_id, 5); } } // namespace paimon::parquet::test diff --git a/src/paimon/testing/mock/mock_file_batch_reader.h b/src/paimon/testing/mock/mock_file_batch_reader.h index be6a0e9ef..a28d38dd1 100644 --- a/src/paimon/testing/mock/mock_file_batch_reader.h +++ b/src/paimon/testing/mock/mock_file_batch_reader.h @@ -156,7 +156,7 @@ class MockFileBatchReader : public PrefetchFileBatchReader { return metrics; } - Result GetPreviousBatchGlobalRowId(uint64_t batch_row_id) const override { + Result GetPreviousBatchFileRowId(uint64_t batch_row_id) const override { return previous_batch_first_row_num_ + batch_row_id; } From 8f82f44d0b852f35b73064e9af9b31cc5f6b2f25 Mon Sep 17 00:00:00 2001 From: zhouhongfeng Date: Mon, 29 Jun 2026 10:23:06 +0800 Subject: [PATCH 11/27] fix: use a more efficient way to apply bitmap --- .../file_index/bitmap/apply_bitmap_index_batch_reader.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/paimon/common/file_index/bitmap/apply_bitmap_index_batch_reader.h b/src/paimon/common/file_index/bitmap/apply_bitmap_index_batch_reader.h index e8428eaf3..dd851c78b 100644 --- a/src/paimon/common/file_index/bitmap/apply_bitmap_index_batch_reader.h +++ b/src/paimon/common/file_index/bitmap/apply_bitmap_index_batch_reader.h @@ -96,12 +96,10 @@ class ApplyBitmapIndexBatchReader : public FileBatchReader { Result Filter(int32_t batch_size) const { RoaringBitmap32 is_valid; for (int32_t i = 0; i < batch_size; ++i) { - PAIMON_ASSIGN_OR_RAISE(uint64_t global_row_id, reader_->GetPreviousBatchFileRowId(i)); - if (bitmap_.Contains(global_row_id)) { - is_valid.Add(i); - } + PAIMON_ASSIGN_OR_RAISE(uint64_t file_row_id, reader_->GetPreviousBatchFileRowId(i)); + is_valid.Add(i); } - return is_valid; + return RoaringBitmap32::And(bitmap_, is_valid); } private: From d3b73e176f8b0a2705ba5af78976b9bd67e705f9 Mon Sep 17 00:00:00 2001 From: zhouhongfeng Date: Mon, 29 Jun 2026 10:34:08 +0800 Subject: [PATCH 12/27] update headers --- include/paimon/reader/file_batch_reader.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/paimon/reader/file_batch_reader.h b/include/paimon/reader/file_batch_reader.h index 586c8cbc7..ddb4e999d 100644 --- a/include/paimon/reader/file_batch_reader.h +++ b/include/paimon/reader/file_batch_reader.h @@ -47,7 +47,7 @@ class PAIMON_EXPORT FileBatchReader : public BatchReader { using BatchReader::NextBatchWithBitmap; /// Get the global row number of the row in the previously read batch. - virtual Result GetPreviousBatchGlobalRowId(uint64_t batch_row_id) const = 0; + virtual Result GetPreviousBatchFileRowId(uint64_t batch_row_id) const = 0; /// Get the number of rows in the file. virtual Result GetNumberOfRows() const = 0; From fcc1ac87d4320661fba158230681990117d4a8d7 Mon Sep 17 00:00:00 2001 From: zhouhongfeng Date: Mon, 29 Jun 2026 10:55:15 +0800 Subject: [PATCH 13/27] fix: use iterator to apply bitmap --- .../bitmap/apply_bitmap_index_batch_reader.h | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/src/paimon/common/file_index/bitmap/apply_bitmap_index_batch_reader.h b/src/paimon/common/file_index/bitmap/apply_bitmap_index_batch_reader.h index dd851c78b..a6de3af3e 100644 --- a/src/paimon/common/file_index/bitmap/apply_bitmap_index_batch_reader.h +++ b/src/paimon/common/file_index/bitmap/apply_bitmap_index_batch_reader.h @@ -94,12 +94,24 @@ class ApplyBitmapIndexBatchReader : public FileBatchReader { private: Result Filter(int32_t batch_size) const { - RoaringBitmap32 is_valid; + RoaringBitmap32 result; + auto bitmap_iter = bitmap_.Begin(); + auto bitmap_end = bitmap_.End(); + for (int32_t i = 0; i < batch_size; ++i) { PAIMON_ASSIGN_OR_RAISE(uint64_t file_row_id, reader_->GetPreviousBatchFileRowId(i)); - is_valid.Add(i); + while (bitmap_iter != bitmap_end && + static_cast(*bitmap_iter) < file_row_id) { + ++bitmap_iter; + } + if (bitmap_iter == bitmap_end) { + break; + } + if (static_cast(*bitmap_iter) == file_row_id) { + result.Add(i); + } } - return RoaringBitmap32::And(bitmap_, is_valid); + return result; } private: From 0fb2decbca49118c897f879281169e335c8f47b6 Mon Sep 17 00:00:00 2001 From: zhouhongfeng Date: Mon, 29 Jun 2026 11:30:02 +0800 Subject: [PATCH 14/27] test: add assertion --- .../common/reader/prefetch_file_batch_reader_impl_test.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/paimon/common/reader/prefetch_file_batch_reader_impl_test.cpp b/src/paimon/common/reader/prefetch_file_batch_reader_impl_test.cpp index c5d60f322..a30e90bfd 100644 --- a/src/paimon/common/reader/prefetch_file_batch_reader_impl_test.cpp +++ b/src/paimon/common/reader/prefetch_file_batch_reader_impl_test.cpp @@ -732,6 +732,7 @@ TEST_F(PrefetchFileBatchReaderImplTest, TestCallNextBatchAfterReadingEof) { ASSERT_OK_AND_ASSIGN(auto result_array, ReadResultCollector::CollectResult( reader.get(), /*max simulated data processing time*/ 100)); + ASSERT_NOK(reader->GetPreviousBatchFileRowId(0)); auto expected_array = std::make_shared(data_array); ASSERT_TRUE(result_array->Equals(expected_array)); From a3e37bdd8904a50a95a48c59db8dc91d05483459 Mon Sep 17 00:00:00 2001 From: zhouhongfeng Date: Mon, 29 Jun 2026 11:41:42 +0800 Subject: [PATCH 15/27] test: use '.value()' directly to validate the result. --- .../bitmap/apply_bitmap_index_batch_reader.h | 5 +- .../prefetch_file_batch_reader_impl_test.cpp | 29 +++++----- .../parquet_file_batch_reader_test.cpp | 55 ++++++------------- 3 files changed, 32 insertions(+), 57 deletions(-) diff --git a/src/paimon/common/file_index/bitmap/apply_bitmap_index_batch_reader.h b/src/paimon/common/file_index/bitmap/apply_bitmap_index_batch_reader.h index a6de3af3e..8f770478f 100644 --- a/src/paimon/common/file_index/bitmap/apply_bitmap_index_batch_reader.h +++ b/src/paimon/common/file_index/bitmap/apply_bitmap_index_batch_reader.h @@ -100,12 +100,11 @@ class ApplyBitmapIndexBatchReader : public FileBatchReader { for (int32_t i = 0; i < batch_size; ++i) { PAIMON_ASSIGN_OR_RAISE(uint64_t file_row_id, reader_->GetPreviousBatchFileRowId(i)); - while (bitmap_iter != bitmap_end && - static_cast(*bitmap_iter) < file_row_id) { + while (bitmap_iter != bitmap_end && static_cast(*bitmap_iter) < file_row_id) { ++bitmap_iter; } if (bitmap_iter == bitmap_end) { - break; + break; } if (static_cast(*bitmap_iter) == file_row_id) { result.Add(i); diff --git a/src/paimon/common/reader/prefetch_file_batch_reader_impl_test.cpp b/src/paimon/common/reader/prefetch_file_batch_reader_impl_test.cpp index a30e90bfd..ceeeb9c57 100644 --- a/src/paimon/common/reader/prefetch_file_batch_reader_impl_test.cpp +++ b/src/paimon/common/reader/prefetch_file_batch_reader_impl_test.cpp @@ -637,9 +637,7 @@ TEST_F(PrefetchFileBatchReaderImplTest, TestPartialReaderSuccessRead) { ASSERT_OK_AND_ASSIGN(auto batch_with_bitmap, reader->NextBatchWithBitmap()); auto& [batch, bitmap] = batch_with_bitmap; ASSERT_EQ(batch.first->length, bitmap.Cardinality()); - uint64_t file_row_id = 0; - ASSERT_OK_AND_ASSIGN(file_row_id, reader->GetPreviousBatchFileRowId(0)); - ASSERT_EQ(file_row_id, 0); + ASSERT_EQ(reader->GetPreviousBatchFileRowId(0).value(), 0); ASSERT_OK_AND_ASSIGN(auto array, ReadResultCollector::GetArray(std::move(batch))); result_array_vector.push_back(array); ASSERT_OK(prefetch_reader->GetReadStatus()); @@ -933,21 +931,18 @@ TEST_P(PrefetchFileBatchReaderImplTest, TestRowMapping) { PreparePrefetchReader(file_format, schema.get(), predicate, /*selection_bitmap=*/std::nullopt, /*batch_size=*/10, /*prefetch_max_parallel_num=*/3, cache_mode); - uint64_t file_row_id = 0; ASSERT_NOK(reader->GetPreviousBatchFileRowId(0)); ASSERT_OK_AND_ASSIGN(std::shared_ptr batch, paimon::test::ReadResultCollector::CollectResultOneBatch(reader.get())); - ASSERT_OK_AND_ASSIGN(file_row_id, reader->GetPreviousBatchFileRowId(0)); - ASSERT_EQ(file_row_id, 20); - ASSERT_OK_AND_ASSIGN(file_row_id, reader->GetPreviousBatchFileRowId(5)); - ASSERT_EQ(file_row_id, 25); + for (uint64_t i = 0; i < 10; i++) { + ASSERT_EQ(reader->GetPreviousBatchFileRowId(i).value(), 20 + i); + } ASSERT_OK_AND_ASSIGN(batch, paimon::test::ReadResultCollector::CollectResultOneBatch(reader.get())); - ASSERT_OK_AND_ASSIGN(file_row_id, reader->GetPreviousBatchFileRowId(0)); - ASSERT_EQ(file_row_id, 70); - ASSERT_OK_AND_ASSIGN(file_row_id, reader->GetPreviousBatchFileRowId(5)); - ASSERT_EQ(file_row_id, 75); + for (uint64_t i = 0; i < 10; i++) { + ASSERT_EQ(reader->GetPreviousBatchFileRowId(i).value(), 70 + i); + } // Set read schema again std::unique_ptr c_schema = std::make_unique(); @@ -959,12 +954,14 @@ TEST_P(PrefetchFileBatchReaderImplTest, TestRowMapping) { ASSERT_NOK(reader->GetPreviousBatchFileRowId(0)); ASSERT_OK_AND_ASSIGN(batch, paimon::test::ReadResultCollector::CollectResultOneBatch(reader.get())); - ASSERT_OK_AND_ASSIGN(file_row_id, reader->GetPreviousBatchFileRowId(0)); - ASSERT_EQ(file_row_id, 30); + for (uint64_t i = 0; i < 10; i++) { + ASSERT_EQ(reader->GetPreviousBatchFileRowId(i).value(), 30 + i); + } ASSERT_OK_AND_ASSIGN(batch, paimon::test::ReadResultCollector::CollectResultOneBatch(reader.get())); - ASSERT_OK_AND_ASSIGN(file_row_id, reader->GetPreviousBatchFileRowId(5)); - ASSERT_EQ(file_row_id, 45); + for (uint64_t i = 0; i < 10; i++) { + ASSERT_EQ(reader->GetPreviousBatchFileRowId(i).value(), 40 + i); + } } } // namespace paimon::test diff --git a/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp b/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp index c7255c209..3e2f51f58 100644 --- a/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp +++ b/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp @@ -822,20 +822,16 @@ TEST_F(ParquetFileBatchReaderTest, TestReadNoField) { auto parquet_batch_reader = PrepareParquetFileBatchReader(file_name, read_schema, /*predicate=*/nullptr, /*selection_bitmap=*/std::nullopt, /*batch_size=*/2); - uint64_t file_row_id = 0; // read 2 rows ASSERT_NOK(parquet_batch_reader->GetPreviousBatchFileRowId(0)); ASSERT_OK_AND_ASSIGN(auto batch1, parquet_batch_reader->NextBatch()); - ASSERT_OK_AND_ASSIGN(file_row_id, parquet_batch_reader->GetPreviousBatchFileRowId(0)); - ASSERT_EQ(file_row_id, 0); + ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFileRowId(0).value(), 0); // read 2 rows ASSERT_OK_AND_ASSIGN(auto batch2, parquet_batch_reader->NextBatch()); - ASSERT_OK_AND_ASSIGN(file_row_id, parquet_batch_reader->GetPreviousBatchFileRowId(0)); - ASSERT_EQ(file_row_id, 2); + ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFileRowId(0).value(), 2); // read 2 rows ASSERT_OK_AND_ASSIGN(auto batch3, parquet_batch_reader->NextBatch()); - ASSERT_OK_AND_ASSIGN(file_row_id, parquet_batch_reader->GetPreviousBatchFileRowId(0)); - ASSERT_EQ(file_row_id, 4); + ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFileRowId(0).value(), 4); // read rows with eof ASSERT_OK_AND_ASSIGN(auto batch4, parquet_batch_reader->NextBatch()); ASSERT_TRUE(BatchReader::IsEofBatch(batch4)); @@ -1047,17 +1043,14 @@ TEST_F(ParquetFileBatchReaderTest, TestRowMappingSimple) { auto parquet_batch_reader = PrepareParquetFileBatchReader( file_path_, arrow_schema, /*predicate=*/predicate, std::nullopt, /*batch_size=*/2); - uint64_t file_row_id = 0; ASSERT_NOK(parquet_batch_reader->GetPreviousBatchFileRowId(0)); ASSERT_OK_AND_ASSIGN( std::shared_ptr batch1, paimon::test::ReadResultCollector::CollectResultOneBatch(parquet_batch_reader.get())); auto expected_batch1 = src_array->Slice(1, 2); ASSERT_TRUE(batch1->chunk(0)->Equals(expected_batch1)) << batch1->ToString(); - ASSERT_OK_AND_ASSIGN(file_row_id, parquet_batch_reader->GetPreviousBatchFileRowId(0)); - ASSERT_EQ(file_row_id, 1); - ASSERT_OK_AND_ASSIGN(file_row_id, parquet_batch_reader->GetPreviousBatchFileRowId(1)); - ASSERT_EQ(file_row_id, 2); + ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFileRowId(0).value(), 1); + ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFileRowId(1).value(), 2); // out of bound return invalid ASSERT_NOK(parquet_batch_reader->GetPreviousBatchFileRowId(2)); @@ -1072,10 +1065,8 @@ TEST_F(ParquetFileBatchReaderTest, TestRowMappingSimple) { ])") .ValueOrDie()); ASSERT_TRUE(batch2->chunk(0)->Equals(expected_batch2)) << batch2->ToString(); - ASSERT_OK_AND_ASSIGN(file_row_id, parquet_batch_reader->GetPreviousBatchFileRowId(0)); - ASSERT_EQ(file_row_id, 3); - ASSERT_OK_AND_ASSIGN(file_row_id, parquet_batch_reader->GetPreviousBatchFileRowId(1)); - ASSERT_EQ(file_row_id, 5); + ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFileRowId(0).value(), 3); + ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFileRowId(1).value(), 5); // Only one record read ASSERT_OK_AND_ASSIGN( @@ -1083,8 +1074,7 @@ TEST_F(ParquetFileBatchReaderTest, TestRowMappingSimple) { paimon::test::ReadResultCollector::CollectResultOneBatch(parquet_batch_reader.get())); auto expected_batch3 = src_array->Slice(6, 1); ASSERT_TRUE(batch3->chunk(0)->Equals(expected_batch3)) << batch3->ToString(); - ASSERT_OK_AND_ASSIGN(file_row_id, parquet_batch_reader->GetPreviousBatchFileRowId(0)); - ASSERT_EQ(file_row_id, 6); + ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFileRowId(0).value(), 6); ASSERT_NOK(parquet_batch_reader->GetPreviousBatchFileRowId(1)); ASSERT_OK_AND_ASSIGN( @@ -1119,23 +1109,18 @@ TEST_F(ParquetFileBatchReaderTest, TestRowMappingFullyAndPartially) { auto parquet_batch_reader = PrepareParquetFileBatchReader( file_path_, arrow_schema, /*predicate=*/predicate, std::nullopt, /*batch_size=*/3); - uint64_t file_row_id = 0; ASSERT_NOK(parquet_batch_reader->GetPreviousBatchFileRowId(0)); ASSERT_OK_AND_ASSIGN( std::shared_ptr batch1, paimon::test::ReadResultCollector::CollectResultOneBatch(parquet_batch_reader.get())); - ASSERT_OK_AND_ASSIGN(file_row_id, parquet_batch_reader->GetPreviousBatchFileRowId(0)); - ASSERT_EQ(file_row_id, 3); - ASSERT_OK_AND_ASSIGN(file_row_id, parquet_batch_reader->GetPreviousBatchFileRowId(2)); - ASSERT_EQ(file_row_id, 5); + ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFileRowId(0).value(), 3); + ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFileRowId(2).value(), 5); ASSERT_OK_AND_ASSIGN( std::shared_ptr batch2, paimon::test::ReadResultCollector::CollectResultOneBatch(parquet_batch_reader.get())); - ASSERT_OK_AND_ASSIGN(file_row_id, parquet_batch_reader->GetPreviousBatchFileRowId(0)); - ASSERT_EQ(file_row_id, 6); - ASSERT_OK_AND_ASSIGN(file_row_id, parquet_batch_reader->GetPreviousBatchFileRowId(1)); - ASSERT_EQ(file_row_id, 8); + ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFileRowId(0).value(), 6); + ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFileRowId(1).value(), 8); } TEST_F(ParquetFileBatchReaderTest, TestRowMappingSetReadSchemaTwice) { @@ -1158,21 +1143,17 @@ TEST_F(ParquetFileBatchReaderTest, TestRowMappingSetReadSchemaTwice) { auto parquet_batch_reader = PrepareParquetFileBatchReader( file_path_, arrow_schema, /*predicate=*/predicate, std::nullopt, /*batch_size=*/3); - uint64_t file_row_id = 0; ASSERT_NOK(parquet_batch_reader->GetPreviousBatchFileRowId(0)); ASSERT_OK_AND_ASSIGN( std::shared_ptr batch1, paimon::test::ReadResultCollector::CollectResultOneBatch(parquet_batch_reader.get())); - ASSERT_OK_AND_ASSIGN(file_row_id, parquet_batch_reader->GetPreviousBatchFileRowId(0)); - ASSERT_EQ(file_row_id, 1); - ASSERT_OK_AND_ASSIGN(file_row_id, parquet_batch_reader->GetPreviousBatchFileRowId(1)); - ASSERT_EQ(file_row_id, 2); + ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFileRowId(0).value(), 1); + ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFileRowId(1).value(), 2); ASSERT_OK_AND_ASSIGN( std::shared_ptr batch2, paimon::test::ReadResultCollector::CollectResultOneBatch(parquet_batch_reader.get())); - ASSERT_OK_AND_ASSIGN(file_row_id, parquet_batch_reader->GetPreviousBatchFileRowId(0)); - ASSERT_EQ(file_row_id, 3); + ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFileRowId(0).value(), 3); ASSERT_NOK(parquet_batch_reader->GetPreviousBatchFileRowId(1)); ASSERT_OK_AND_ASSIGN( @@ -1188,10 +1169,8 @@ TEST_F(ParquetFileBatchReaderTest, TestRowMappingSetReadSchemaTwice) { ASSERT_OK_AND_ASSIGN( std::shared_ptr batch3, paimon::test::ReadResultCollector::CollectResultOneBatch(parquet_batch_reader.get())); - ASSERT_OK_AND_ASSIGN(file_row_id, parquet_batch_reader->GetPreviousBatchFileRowId(0)); - ASSERT_EQ(file_row_id, 3); - ASSERT_OK_AND_ASSIGN(file_row_id, parquet_batch_reader->GetPreviousBatchFileRowId(2)); - ASSERT_EQ(file_row_id, 5); + ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFileRowId(0).value(), 3); + ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFileRowId(2).value(), 5); } } // namespace paimon::parquet::test From 8820e7ceffc3aa9258e5a2a2305ffc8d986945ca Mon Sep 17 00:00:00 2001 From: zhouhongfeng Date: Mon, 29 Jun 2026 11:55:31 +0800 Subject: [PATCH 16/27] update comments --- src/paimon/testing/utils/read_result_collector.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/paimon/testing/utils/read_result_collector.h b/src/paimon/testing/utils/read_result_collector.h index f2ab0d1fc..10f7f8dae 100644 --- a/src/paimon/testing/utils/read_result_collector.h +++ b/src/paimon/testing/utils/read_result_collector.h @@ -133,7 +133,7 @@ class ReadResultCollector { static Result> CollectResultOneBatch( BatchReader* batch_reader) { - return CollectResultOneBatch(batch_reader, /*max simulated data processing time*/ 0); + return CollectResultOneBatch(batch_reader, /*max_simulated_data_processing_time*/ 0); } static Result> CollectResultOneBatch( From 376a3124f0bf3cd0b7bdd7564a5670012620a5bd Mon Sep 17 00:00:00 2001 From: zhouhongfeng Date: Mon, 29 Jun 2026 14:02:46 +0800 Subject: [PATCH 17/27] style: change method name --- src/paimon/format/parquet/parquet_file_batch_reader.cpp | 7 ++++--- src/paimon/format/parquet/parquet_file_batch_reader.h | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/paimon/format/parquet/parquet_file_batch_reader.cpp b/src/paimon/format/parquet/parquet_file_batch_reader.cpp index 73d066b6a..3fc379006 100644 --- a/src/paimon/format/parquet/parquet_file_batch_reader.cpp +++ b/src/paimon/format/parquet/parquet_file_batch_reader.cpp @@ -222,7 +222,7 @@ Status ParquetFileBatchReader::SetReadSchema( reader_->GetAllRowGroupRanges()[rg_id].first - 1))); } } - PAIMON_ASSIGN_OR_RAISE(all_row_ranges_, GetAllTargetRowRanges(target_row_groups)); + PAIMON_RETURN_NOT_OK(UpdateAllTargetRowranges(target_row_groups)); PAIMON_RETURN_NOT_OK(reader_->PrepareForReadingLazy(target_row_groups, column_indices)); } PAIMON_PARQUET_CATCH_AND_RETURN_STATUS("ParquetFileBatchReader::SetReadSchema") @@ -533,7 +533,7 @@ Result> ParquetFileBatchReader::ComputeNestedColumnIndices( return indices; } -Result ParquetFileBatchReader::GetAllTargetRowRanges( +Status ParquetFileBatchReader::UpdateAllTargetRowranges( const std::vector& target_row_groups) { row_mapping_.clear(); auto all_row_group_ranges = reader_->GetAllRowGroupRanges(); @@ -545,7 +545,8 @@ Result ParquetFileBatchReader::GetAllTargetRowRanges( range.to + all_row_group_ranges[target_row_group.row_group_index].first)); } } - return all_ranges; + all_row_ranges_ = std::move(all_ranges); + return Status::OK(); } Status ParquetFileBatchReader::GenerateRowMapping(int64_t batch_length) { diff --git a/src/paimon/format/parquet/parquet_file_batch_reader.h b/src/paimon/format/parquet/parquet_file_batch_reader.h index 06237cec7..e103694c0 100644 --- a/src/paimon/format/parquet/parquet_file_batch_reader.h +++ b/src/paimon/format/parquet/parquet_file_batch_reader.h @@ -184,7 +184,7 @@ class ParquetFileBatchReader : public PrefetchFileBatchReader { const std::shared_ptr& read_schema, const std::shared_ptr& file_schema); - Result GetAllTargetRowRanges(const std::vector& target_row_groups); + Status UpdateAllTargetRowranges(const std::vector& target_row_groups); // precondition: predicate supposed not be empty Result> FilterRowGroupsByPredicate( From f1c02db59d4cece571814d9396d1adc752c05899 Mon Sep 17 00:00:00 2001 From: zhouhongfeng Date: Mon, 29 Jun 2026 14:17:02 +0800 Subject: [PATCH 18/27] fix: small fixes --- .../common/reader/prefetch_file_batch_reader_impl.cpp | 1 - src/paimon/format/blob/blob_file_batch_reader.h | 3 +++ src/paimon/format/lance/lance_file_batch_reader.h | 3 +++ src/paimon/format/parquet/row_ranges.cpp | 6 ------ src/paimon/format/parquet/row_ranges.h | 4 +--- src/paimon/testing/mock/mock_file_batch_reader.h | 5 ++++- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/paimon/common/reader/prefetch_file_batch_reader_impl.cpp b/src/paimon/common/reader/prefetch_file_batch_reader_impl.cpp index dc8a9ad5f..5940a4d48 100644 --- a/src/paimon/common/reader/prefetch_file_batch_reader_impl.cpp +++ b/src/paimon/common/reader/prefetch_file_batch_reader_impl.cpp @@ -19,7 +19,6 @@ #include #include #include -#include #include #include "arrow/array/array_base.h" diff --git a/src/paimon/format/blob/blob_file_batch_reader.h b/src/paimon/format/blob/blob_file_batch_reader.h index 06afe6acc..998939c8a 100644 --- a/src/paimon/format/blob/blob_file_batch_reader.h +++ b/src/paimon/format/blob/blob_file_batch_reader.h @@ -104,6 +104,9 @@ class BlobFileBatchReader : public FileBatchReader { "bitmap pushdown, rows in the array returned by NextBatch are no longer " "contiguous."); } + if (previous_batch_first_row_number_ == std::numeric_limits::max()) { + return Status::Invalid("No batch has been read yet."); + } return previous_batch_first_row_number_ + batch_row_id; } diff --git a/src/paimon/format/lance/lance_file_batch_reader.h b/src/paimon/format/lance/lance_file_batch_reader.h index c7ce9a4f4..5cfbdb981 100644 --- a/src/paimon/format/lance/lance_file_batch_reader.h +++ b/src/paimon/format/lance/lance_file_batch_reader.h @@ -49,6 +49,9 @@ class LanceFileBatchReader : public FileBatchReader { "bitmap pushdown, rows in the array returned by NextBatch are no longer " "contiguous."); } + if (previous_batch_first_row_num_ == std::numeric_limits::max()) { + return Status::Invalid("No batch has been read yet"); + } return previous_batch_first_row_num_ + batch_row_id; } diff --git a/src/paimon/format/parquet/row_ranges.cpp b/src/paimon/format/parquet/row_ranges.cpp index 6e20780b8..1b03715be 100644 --- a/src/paimon/format/parquet/row_ranges.cpp +++ b/src/paimon/format/parquet/row_ranges.cpp @@ -104,12 +104,6 @@ void RowRanges::Add(const Range& range) { ranges_.insert(it, merged); } -void RowRanges::Union(const RowRanges& other) { - for (const auto& range : other.ranges_) { - Add(range); - } -} - std::optional RowRanges::MapFilteredIndexToOriginalRow(int64_t filtered_index) const { int64_t accumulated = 0; for (const auto& range : ranges_) { diff --git a/src/paimon/format/parquet/row_ranges.h b/src/paimon/format/parquet/row_ranges.h index f6b41178f..6174ac4eb 100644 --- a/src/paimon/format/parquet/row_ranges.h +++ b/src/paimon/format/parquet/row_ranges.h @@ -92,8 +92,6 @@ class RowRanges { /// Adds a range to the end of the list, maintaining sorted disjoint ranges. void Add(const Range& range); - void Union(const RowRanges& other); - /// Maps a filtered-result index to the original row index within the row group. /// For example, if RowRanges = {[10,19], [50,59]}, then: /// MapFilteredIndexToOriginalRow(0) = 10 (first row of first range) @@ -111,7 +109,7 @@ class RowRanges { struct TargetRowGroup { int32_t row_group_index{-1}; bool is_partially_matched{false}; - // page-filtered row ranges, only valid if is_partially_matched is true. + RowRanges row_ranges; // Whether this row group has been excluded by ApplyReadRanges. // When true, this row group is logically skipped during iteration diff --git a/src/paimon/testing/mock/mock_file_batch_reader.h b/src/paimon/testing/mock/mock_file_batch_reader.h index a28d38dd1..439d5c296 100644 --- a/src/paimon/testing/mock/mock_file_batch_reader.h +++ b/src/paimon/testing/mock/mock_file_batch_reader.h @@ -157,6 +157,9 @@ class MockFileBatchReader : public PrefetchFileBatchReader { } Result GetPreviousBatchFileRowId(uint64_t batch_row_id) const override { + if (previous_batch_first_row_num_ == std::numeric_limits::max()) { + return Status::Invalid("No batch has been read yet"); + } return previous_batch_first_row_num_ + batch_row_id; } @@ -191,7 +194,7 @@ class MockFileBatchReader : public PrefetchFileBatchReader { int32_t batch_size_ = 0; int32_t current_pos_ = 0; int32_t read_end_pos_ = 0; - int32_t previous_batch_first_row_num_ = -1; + uint64_t previous_batch_first_row_num_ = std::numeric_limits::max(); Status next_batch_status_; bool enable_randomize_batch_size_ = true; std::vector> read_ranges_; From ad66b22c0b20aeea1332be32693f9bd5a335508b Mon Sep 17 00:00:00 2001 From: zhouhongfeng Date: Mon, 29 Jun 2026 14:33:07 +0800 Subject: [PATCH 19/27] fix: blob test --- src/paimon/format/blob/blob_file_batch_reader_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/paimon/format/blob/blob_file_batch_reader_test.cpp b/src/paimon/format/blob/blob_file_batch_reader_test.cpp index 0cfe9351f..8329f0557 100644 --- a/src/paimon/format/blob/blob_file_batch_reader_test.cpp +++ b/src/paimon/format/blob/blob_file_batch_reader_test.cpp @@ -254,7 +254,7 @@ TEST_P(BlobFileBatchReaderTest, EmptyFile) { ASSERT_OK(reader->SetReadSchema(&c_schema, nullptr, std::nullopt)); ASSERT_OK_AND_ASSIGN(auto number_of_rows, reader->GetNumberOfRows()); ASSERT_EQ(0, number_of_rows); - ASSERT_EQ(std::numeric_limits::max(), reader->GetPreviousBatchFileRowId(0).value()); + ASSERT_NOK(reader->GetPreviousBatchFileRowId(0)); ASSERT_OK_AND_ASSIGN(auto batch, reader->NextBatch()); ASSERT_TRUE(BatchReader::IsEofBatch(batch)); } From 25ef3d096f0dd66c98826e9499032922fcec52a0 Mon Sep 17 00:00:00 2001 From: zhouhongfeng Date: Mon, 29 Jun 2026 15:06:32 +0800 Subject: [PATCH 20/27] fix: blob and lance tests --- src/paimon/format/blob/blob_file_batch_reader_test.cpp | 4 ++-- src/paimon/format/lance/lance_format_reader_writer_test.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/paimon/format/blob/blob_file_batch_reader_test.cpp b/src/paimon/format/blob/blob_file_batch_reader_test.cpp index 8329f0557..c372fe2a2 100644 --- a/src/paimon/format/blob/blob_file_batch_reader_test.cpp +++ b/src/paimon/format/blob/blob_file_batch_reader_test.cpp @@ -169,7 +169,7 @@ TEST_F(BlobFileBatchReaderTest, TestRowNumbers) { ASSERT_OK(reader->SetReadSchema(&c_schema, nullptr, std::nullopt)); ASSERT_OK_AND_ASSIGN(auto number_of_rows, reader->GetNumberOfRows()); ASSERT_EQ(3, number_of_rows); - ASSERT_EQ(std::numeric_limits::max(), reader->GetPreviousBatchFileRowId(0).value()); + ASSERT_NOK(reader->GetPreviousBatchFileRowId(0)); ASSERT_OK_AND_ASSIGN(auto batch1, reader->NextBatch()); ArrowArrayRelease(batch1.first.get()); ArrowSchemaRelease(batch1.second.get()); @@ -183,7 +183,7 @@ TEST_F(BlobFileBatchReaderTest, TestRowNumbers) { ArrowArrayRelease(batch3.first.get()); ArrowSchemaRelease(batch3.second.get()); ASSERT_OK_AND_ASSIGN(auto batch4, reader->NextBatch()); - ASSERT_EQ(3, reader->GetPreviousBatchFileRowId(0).value()); + ASSERT_NOK(reader->GetPreviousBatchFileRowId(0)); ASSERT_TRUE(BatchReader::IsEofBatch(batch4)); } diff --git a/src/paimon/format/lance/lance_format_reader_writer_test.cpp b/src/paimon/format/lance/lance_format_reader_writer_test.cpp index 061d950f9..71a073413 100644 --- a/src/paimon/format/lance/lance_format_reader_writer_test.cpp +++ b/src/paimon/format/lance/lance_format_reader_writer_test.cpp @@ -478,7 +478,7 @@ TEST_F(LanceFileReaderWriterTest, TestPreviousBatchFirstRowNumber) { ASSERT_OK_AND_ASSIGN( std::unique_ptr reader, LanceFileBatchReader::Create(file_path, /*batch_size=*/4, /*batch_readahead=*/2)); - ASSERT_EQ(std::numeric_limits::max(), reader->GetPreviousBatchFileRowId(0).value()); + ASSERT_NOK(reader->GetPreviousBatchFileRowId(0)); // first batch row 0-3 ASSERT_OK_AND_ASSIGN(auto read_batch, reader->NextBatch()); From 8482e909672cb7c6f7ff53905b2cde9c51641b13 Mon Sep 17 00:00:00 2001 From: zhouhongfeng Date: Mon, 29 Jun 2026 15:36:25 +0800 Subject: [PATCH 21/27] fix: blob --- src/paimon/format/blob/blob_file_batch_reader_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/paimon/format/blob/blob_file_batch_reader_test.cpp b/src/paimon/format/blob/blob_file_batch_reader_test.cpp index c372fe2a2..c9d5dbd95 100644 --- a/src/paimon/format/blob/blob_file_batch_reader_test.cpp +++ b/src/paimon/format/blob/blob_file_batch_reader_test.cpp @@ -183,7 +183,7 @@ TEST_F(BlobFileBatchReaderTest, TestRowNumbers) { ArrowArrayRelease(batch3.first.get()); ArrowSchemaRelease(batch3.second.get()); ASSERT_OK_AND_ASSIGN(auto batch4, reader->NextBatch()); - ASSERT_NOK(reader->GetPreviousBatchFileRowId(0)); + ASSERT_EQ(3, reader->GetPreviousBatchFileRowId(0).value()); ASSERT_TRUE(BatchReader::IsEofBatch(batch4)); } From fa1b068bd03bb4fe22cf38fc179e1ceff636e9b4 Mon Sep 17 00:00:00 2001 From: zhouhongfeng Date: Tue, 30 Jun 2026 10:20:48 +0800 Subject: [PATCH 22/27] refractor: merge 'CollectResult' and 'CollectResultOneBatch' into a single function --- .../prefetch_file_batch_reader_impl_test.cpp | 8 +-- .../parquet_file_batch_reader_test.cpp | 18 +++--- .../testing/utils/read_result_collector.h | 59 ++----------------- 3 files changed, 19 insertions(+), 66 deletions(-) diff --git a/src/paimon/common/reader/prefetch_file_batch_reader_impl_test.cpp b/src/paimon/common/reader/prefetch_file_batch_reader_impl_test.cpp index ceeeb9c57..4952557c0 100644 --- a/src/paimon/common/reader/prefetch_file_batch_reader_impl_test.cpp +++ b/src/paimon/common/reader/prefetch_file_batch_reader_impl_test.cpp @@ -933,13 +933,13 @@ TEST_P(PrefetchFileBatchReaderImplTest, TestRowMapping) { /*batch_size=*/10, /*prefetch_max_parallel_num=*/3, cache_mode); ASSERT_NOK(reader->GetPreviousBatchFileRowId(0)); ASSERT_OK_AND_ASSIGN(std::shared_ptr batch, - paimon::test::ReadResultCollector::CollectResultOneBatch(reader.get())); + paimon::test::ReadResultCollector::CollectResult(reader.get(), 0, 1)); for (uint64_t i = 0; i < 10; i++) { ASSERT_EQ(reader->GetPreviousBatchFileRowId(i).value(), 20 + i); } ASSERT_OK_AND_ASSIGN(batch, - paimon::test::ReadResultCollector::CollectResultOneBatch(reader.get())); + paimon::test::ReadResultCollector::CollectResult(reader.get(), 0, 1)); for (uint64_t i = 0; i < 10; i++) { ASSERT_EQ(reader->GetPreviousBatchFileRowId(i).value(), 70 + i); } @@ -953,12 +953,12 @@ TEST_P(PrefetchFileBatchReaderImplTest, TestRowMapping) { ASSERT_NOK(reader->GetPreviousBatchFileRowId(0)); ASSERT_OK_AND_ASSIGN(batch, - paimon::test::ReadResultCollector::CollectResultOneBatch(reader.get())); + paimon::test::ReadResultCollector::CollectResult(reader.get(), 0, 1)); for (uint64_t i = 0; i < 10; i++) { ASSERT_EQ(reader->GetPreviousBatchFileRowId(i).value(), 30 + i); } ASSERT_OK_AND_ASSIGN(batch, - paimon::test::ReadResultCollector::CollectResultOneBatch(reader.get())); + paimon::test::ReadResultCollector::CollectResult(reader.get(), 0, 1)); for (uint64_t i = 0; i < 10; i++) { ASSERT_EQ(reader->GetPreviousBatchFileRowId(i).value(), 40 + i); } diff --git a/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp b/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp index 3e2f51f58..9f4fe9be3 100644 --- a/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp +++ b/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp @@ -1046,7 +1046,7 @@ TEST_F(ParquetFileBatchReaderTest, TestRowMappingSimple) { ASSERT_NOK(parquet_batch_reader->GetPreviousBatchFileRowId(0)); ASSERT_OK_AND_ASSIGN( std::shared_ptr batch1, - paimon::test::ReadResultCollector::CollectResultOneBatch(parquet_batch_reader.get())); + paimon::test::ReadResultCollector::CollectResult(parquet_batch_reader.get(), 0, 1)); auto expected_batch1 = src_array->Slice(1, 2); ASSERT_TRUE(batch1->chunk(0)->Equals(expected_batch1)) << batch1->ToString(); ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFileRowId(0).value(), 1); @@ -1057,7 +1057,7 @@ TEST_F(ParquetFileBatchReaderTest, TestRowMappingSimple) { // Not adjacent pages ASSERT_OK_AND_ASSIGN( std::shared_ptr batch2, - paimon::test::ReadResultCollector::CollectResultOneBatch(parquet_batch_reader.get())); + paimon::test::ReadResultCollector::CollectResult(parquet_batch_reader.get(), 0, 1)); auto expected_batch2 = std::dynamic_pointer_cast( arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), R"([ [3], @@ -1071,7 +1071,7 @@ TEST_F(ParquetFileBatchReaderTest, TestRowMappingSimple) { // Only one record read ASSERT_OK_AND_ASSIGN( std::shared_ptr batch3, - paimon::test::ReadResultCollector::CollectResultOneBatch(parquet_batch_reader.get())); + paimon::test::ReadResultCollector::CollectResult(parquet_batch_reader.get(), 0, 1)); auto expected_batch3 = src_array->Slice(6, 1); ASSERT_TRUE(batch3->chunk(0)->Equals(expected_batch3)) << batch3->ToString(); ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFileRowId(0).value(), 6); @@ -1079,7 +1079,7 @@ TEST_F(ParquetFileBatchReaderTest, TestRowMappingSimple) { ASSERT_OK_AND_ASSIGN( std::shared_ptr eof_batch, - paimon::test::ReadResultCollector::CollectResultOneBatch(parquet_batch_reader.get())); + paimon::test::ReadResultCollector::CollectResult(parquet_batch_reader.get(), 0, 1)); ASSERT_EQ(nullptr, eof_batch); // previous batch is eof, return invalid. ASSERT_NOK(parquet_batch_reader->GetPreviousBatchFileRowId(0)); @@ -1112,13 +1112,13 @@ TEST_F(ParquetFileBatchReaderTest, TestRowMappingFullyAndPartially) { ASSERT_NOK(parquet_batch_reader->GetPreviousBatchFileRowId(0)); ASSERT_OK_AND_ASSIGN( std::shared_ptr batch1, - paimon::test::ReadResultCollector::CollectResultOneBatch(parquet_batch_reader.get())); + paimon::test::ReadResultCollector::CollectResult(parquet_batch_reader.get(), 0, 1)); ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFileRowId(0).value(), 3); ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFileRowId(2).value(), 5); ASSERT_OK_AND_ASSIGN( std::shared_ptr batch2, - paimon::test::ReadResultCollector::CollectResultOneBatch(parquet_batch_reader.get())); + paimon::test::ReadResultCollector::CollectResult(parquet_batch_reader.get(), 0, 1)); ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFileRowId(0).value(), 6); ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFileRowId(1).value(), 8); } @@ -1146,13 +1146,13 @@ TEST_F(ParquetFileBatchReaderTest, TestRowMappingSetReadSchemaTwice) { ASSERT_NOK(parquet_batch_reader->GetPreviousBatchFileRowId(0)); ASSERT_OK_AND_ASSIGN( std::shared_ptr batch1, - paimon::test::ReadResultCollector::CollectResultOneBatch(parquet_batch_reader.get())); + paimon::test::ReadResultCollector::CollectResult(parquet_batch_reader.get(), 0, 1)); ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFileRowId(0).value(), 1); ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFileRowId(1).value(), 2); ASSERT_OK_AND_ASSIGN( std::shared_ptr batch2, - paimon::test::ReadResultCollector::CollectResultOneBatch(parquet_batch_reader.get())); + paimon::test::ReadResultCollector::CollectResult(parquet_batch_reader.get(), 0, 1)); ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFileRowId(0).value(), 3); ASSERT_NOK(parquet_batch_reader->GetPreviousBatchFileRowId(1)); @@ -1168,7 +1168,7 @@ TEST_F(ParquetFileBatchReaderTest, TestRowMappingSetReadSchemaTwice) { ASSERT_NOK(parquet_batch_reader->GetPreviousBatchFileRowId(0)); ASSERT_OK_AND_ASSIGN( std::shared_ptr batch3, - paimon::test::ReadResultCollector::CollectResultOneBatch(parquet_batch_reader.get())); + paimon::test::ReadResultCollector::CollectResult(parquet_batch_reader.get(), 0, 1)); ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFileRowId(0).value(), 3); ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFileRowId(2).value(), 5); } diff --git a/src/paimon/testing/utils/read_result_collector.h b/src/paimon/testing/utils/read_result_collector.h index 10f7f8dae..55a86717f 100644 --- a/src/paimon/testing/utils/read_result_collector.h +++ b/src/paimon/testing/utils/read_result_collector.h @@ -18,6 +18,7 @@ #include +#include #include #include #include @@ -68,17 +69,15 @@ class ReadResultCollector { return results; } - static Result> CollectResult(BatchReader* batch_reader) { - return CollectResult(batch_reader, /*max simulated data processing time*/ 0); - } - // will convert dictionary array to string array for comparing results static Result> CollectResult( - BatchReader* batch_reader, int64_t max_data_processing_time_in_us) { + BatchReader* batch_reader, int64_t max_data_processing_time_in_us = 0, + int64_t max_read_batch = std::numeric_limits::max()) { arrow::ArrayVector result_array_vector; int64_t seed = DateTimeUtils::GetCurrentUTCTimeUs(); std::srand(seed); - while (true) { + int64_t read_batch_count = 0; + while (read_batch_count < max_read_batch) { // Prioritize calling NextBatch. If it fails (paimon inner reader e.g., // PrefetchBatchReader, ApplyBitmapIndexBatchReader...), call NextBatchWithBitmap. auto batch_result = batch_reader->NextBatch(); @@ -109,6 +108,7 @@ class ReadResultCollector { PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(auto result_array, arrow::ImportArray(c_array.get(), c_schema.get())); result_array_vector.push_back(result_array); + ++read_batch_count; if (max_data_processing_time_in_us > 0) { usleep(std::rand() % max_data_processing_time_in_us); } @@ -131,53 +131,6 @@ class ReadResultCollector { return chunk_array; } - static Result> CollectResultOneBatch( - BatchReader* batch_reader) { - return CollectResultOneBatch(batch_reader, /*max_simulated_data_processing_time*/ 0); - } - - static Result> CollectResultOneBatch( - BatchReader* batch_reader, int64_t max_data_processing_time_in_us) { - int64_t seed = DateTimeUtils::GetCurrentUTCTimeUs(); - std::srand(seed); - auto batch_result = batch_reader->NextBatch(); - BatchReader::ReadBatch batch; - if (!batch_result.ok()) { - if (batch_result.status().ToString().find("should use NextBatchWithBitmap") != - std::string::npos) { - PAIMON_ASSIGN_OR_RAISE(BatchReader::ReadBatchWithBitmap batch_with_bitmap, - batch_reader->NextBatchWithBitmap()); - if (BatchReader::IsEofBatch(batch_with_bitmap)) { - return std::shared_ptr(); - } - assert(!batch_with_bitmap.second.IsEmpty()); - PAIMON_ASSIGN_OR_RAISE( - batch, ReaderUtils::ApplyBitmapToReadBatch(std::move(batch_with_bitmap), - arrow::default_memory_pool())); - } else { - return batch_result.status(); - } - } else { - batch = std::move(batch_result).value(); - if (BatchReader::IsEofBatch(batch)) { - return std::shared_ptr(); - } - } - auto& [c_array, c_schema] = batch; - assert(c_array->length > 0); - PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr result_array, - arrow::ImportArray(c_array.get(), c_schema.get())); - PAIMON_ASSIGN_OR_RAISE( - auto converted_array, - DictArrayConverter::ConvertDictArray(result_array, arrow::default_memory_pool())); - if (max_data_processing_time_in_us > 0) { - usleep(std::rand() % max_data_processing_time_in_us); - } - PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr chunk_array, - arrow::ChunkedArray::Make({converted_array})); - return chunk_array; - } - static Result> GetArray(BatchReader::ReadBatch&& batch) { if (BatchReader::IsEofBatch(batch)) { return std::shared_ptr(); From 8dc8576e8abbfd9fdf253841e19c6c9751dd99d1 Mon Sep 17 00:00:00 2001 From: zhouhongfeng Date: Tue, 30 Jun 2026 10:28:51 +0800 Subject: [PATCH 23/27] refractor: extract common parts of CollectResult and CollectResultOneBatch --- .../prefetch_file_batch_reader_impl_test.cpp | 8 +- .../parquet_file_batch_reader_test.cpp | 18 ++-- .../testing/utils/read_result_collector.h | 102 ++++++++++++------ 3 files changed, 81 insertions(+), 47 deletions(-) diff --git a/src/paimon/common/reader/prefetch_file_batch_reader_impl_test.cpp b/src/paimon/common/reader/prefetch_file_batch_reader_impl_test.cpp index 4952557c0..ceeeb9c57 100644 --- a/src/paimon/common/reader/prefetch_file_batch_reader_impl_test.cpp +++ b/src/paimon/common/reader/prefetch_file_batch_reader_impl_test.cpp @@ -933,13 +933,13 @@ TEST_P(PrefetchFileBatchReaderImplTest, TestRowMapping) { /*batch_size=*/10, /*prefetch_max_parallel_num=*/3, cache_mode); ASSERT_NOK(reader->GetPreviousBatchFileRowId(0)); ASSERT_OK_AND_ASSIGN(std::shared_ptr batch, - paimon::test::ReadResultCollector::CollectResult(reader.get(), 0, 1)); + paimon::test::ReadResultCollector::CollectResultOneBatch(reader.get())); for (uint64_t i = 0; i < 10; i++) { ASSERT_EQ(reader->GetPreviousBatchFileRowId(i).value(), 20 + i); } ASSERT_OK_AND_ASSIGN(batch, - paimon::test::ReadResultCollector::CollectResult(reader.get(), 0, 1)); + paimon::test::ReadResultCollector::CollectResultOneBatch(reader.get())); for (uint64_t i = 0; i < 10; i++) { ASSERT_EQ(reader->GetPreviousBatchFileRowId(i).value(), 70 + i); } @@ -953,12 +953,12 @@ TEST_P(PrefetchFileBatchReaderImplTest, TestRowMapping) { ASSERT_NOK(reader->GetPreviousBatchFileRowId(0)); ASSERT_OK_AND_ASSIGN(batch, - paimon::test::ReadResultCollector::CollectResult(reader.get(), 0, 1)); + paimon::test::ReadResultCollector::CollectResultOneBatch(reader.get())); for (uint64_t i = 0; i < 10; i++) { ASSERT_EQ(reader->GetPreviousBatchFileRowId(i).value(), 30 + i); } ASSERT_OK_AND_ASSIGN(batch, - paimon::test::ReadResultCollector::CollectResult(reader.get(), 0, 1)); + paimon::test::ReadResultCollector::CollectResultOneBatch(reader.get())); for (uint64_t i = 0; i < 10; i++) { ASSERT_EQ(reader->GetPreviousBatchFileRowId(i).value(), 40 + i); } diff --git a/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp b/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp index 9f4fe9be3..3e2f51f58 100644 --- a/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp +++ b/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp @@ -1046,7 +1046,7 @@ TEST_F(ParquetFileBatchReaderTest, TestRowMappingSimple) { ASSERT_NOK(parquet_batch_reader->GetPreviousBatchFileRowId(0)); ASSERT_OK_AND_ASSIGN( std::shared_ptr batch1, - paimon::test::ReadResultCollector::CollectResult(parquet_batch_reader.get(), 0, 1)); + paimon::test::ReadResultCollector::CollectResultOneBatch(parquet_batch_reader.get())); auto expected_batch1 = src_array->Slice(1, 2); ASSERT_TRUE(batch1->chunk(0)->Equals(expected_batch1)) << batch1->ToString(); ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFileRowId(0).value(), 1); @@ -1057,7 +1057,7 @@ TEST_F(ParquetFileBatchReaderTest, TestRowMappingSimple) { // Not adjacent pages ASSERT_OK_AND_ASSIGN( std::shared_ptr batch2, - paimon::test::ReadResultCollector::CollectResult(parquet_batch_reader.get(), 0, 1)); + paimon::test::ReadResultCollector::CollectResultOneBatch(parquet_batch_reader.get())); auto expected_batch2 = std::dynamic_pointer_cast( arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), R"([ [3], @@ -1071,7 +1071,7 @@ TEST_F(ParquetFileBatchReaderTest, TestRowMappingSimple) { // Only one record read ASSERT_OK_AND_ASSIGN( std::shared_ptr batch3, - paimon::test::ReadResultCollector::CollectResult(parquet_batch_reader.get(), 0, 1)); + paimon::test::ReadResultCollector::CollectResultOneBatch(parquet_batch_reader.get())); auto expected_batch3 = src_array->Slice(6, 1); ASSERT_TRUE(batch3->chunk(0)->Equals(expected_batch3)) << batch3->ToString(); ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFileRowId(0).value(), 6); @@ -1079,7 +1079,7 @@ TEST_F(ParquetFileBatchReaderTest, TestRowMappingSimple) { ASSERT_OK_AND_ASSIGN( std::shared_ptr eof_batch, - paimon::test::ReadResultCollector::CollectResult(parquet_batch_reader.get(), 0, 1)); + paimon::test::ReadResultCollector::CollectResultOneBatch(parquet_batch_reader.get())); ASSERT_EQ(nullptr, eof_batch); // previous batch is eof, return invalid. ASSERT_NOK(parquet_batch_reader->GetPreviousBatchFileRowId(0)); @@ -1112,13 +1112,13 @@ TEST_F(ParquetFileBatchReaderTest, TestRowMappingFullyAndPartially) { ASSERT_NOK(parquet_batch_reader->GetPreviousBatchFileRowId(0)); ASSERT_OK_AND_ASSIGN( std::shared_ptr batch1, - paimon::test::ReadResultCollector::CollectResult(parquet_batch_reader.get(), 0, 1)); + paimon::test::ReadResultCollector::CollectResultOneBatch(parquet_batch_reader.get())); ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFileRowId(0).value(), 3); ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFileRowId(2).value(), 5); ASSERT_OK_AND_ASSIGN( std::shared_ptr batch2, - paimon::test::ReadResultCollector::CollectResult(parquet_batch_reader.get(), 0, 1)); + paimon::test::ReadResultCollector::CollectResultOneBatch(parquet_batch_reader.get())); ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFileRowId(0).value(), 6); ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFileRowId(1).value(), 8); } @@ -1146,13 +1146,13 @@ TEST_F(ParquetFileBatchReaderTest, TestRowMappingSetReadSchemaTwice) { ASSERT_NOK(parquet_batch_reader->GetPreviousBatchFileRowId(0)); ASSERT_OK_AND_ASSIGN( std::shared_ptr batch1, - paimon::test::ReadResultCollector::CollectResult(parquet_batch_reader.get(), 0, 1)); + paimon::test::ReadResultCollector::CollectResultOneBatch(parquet_batch_reader.get())); ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFileRowId(0).value(), 1); ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFileRowId(1).value(), 2); ASSERT_OK_AND_ASSIGN( std::shared_ptr batch2, - paimon::test::ReadResultCollector::CollectResult(parquet_batch_reader.get(), 0, 1)); + paimon::test::ReadResultCollector::CollectResultOneBatch(parquet_batch_reader.get())); ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFileRowId(0).value(), 3); ASSERT_NOK(parquet_batch_reader->GetPreviousBatchFileRowId(1)); @@ -1168,7 +1168,7 @@ TEST_F(ParquetFileBatchReaderTest, TestRowMappingSetReadSchemaTwice) { ASSERT_NOK(parquet_batch_reader->GetPreviousBatchFileRowId(0)); ASSERT_OK_AND_ASSIGN( std::shared_ptr batch3, - paimon::test::ReadResultCollector::CollectResult(parquet_batch_reader.get(), 0, 1)); + paimon::test::ReadResultCollector::CollectResultOneBatch(parquet_batch_reader.get())); ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFileRowId(0).value(), 3); ASSERT_EQ(parquet_batch_reader->GetPreviousBatchFileRowId(2).value(), 5); } diff --git a/src/paimon/testing/utils/read_result_collector.h b/src/paimon/testing/utils/read_result_collector.h index 55a86717f..55f0358ea 100644 --- a/src/paimon/testing/utils/read_result_collector.h +++ b/src/paimon/testing/utils/read_result_collector.h @@ -18,7 +18,6 @@ #include -#include #include #include #include @@ -71,44 +70,16 @@ class ReadResultCollector { // will convert dictionary array to string array for comparing results static Result> CollectResult( - BatchReader* batch_reader, int64_t max_data_processing_time_in_us = 0, - int64_t max_read_batch = std::numeric_limits::max()) { + BatchReader* batch_reader, int64_t max_data_processing_time_in_us) { arrow::ArrayVector result_array_vector; int64_t seed = DateTimeUtils::GetCurrentUTCTimeUs(); std::srand(seed); - int64_t read_batch_count = 0; - while (read_batch_count < max_read_batch) { - // Prioritize calling NextBatch. If it fails (paimon inner reader e.g., - // PrefetchBatchReader, ApplyBitmapIndexBatchReader...), call NextBatchWithBitmap. - auto batch_result = batch_reader->NextBatch(); - BatchReader::ReadBatch batch; - if (!batch_result.ok()) { - if (batch_result.status().ToString().find("should use NextBatchWithBitmap") != - std::string::npos) { - PAIMON_ASSIGN_OR_RAISE(BatchReader::ReadBatchWithBitmap batch_with_bitmap, - batch_reader->NextBatchWithBitmap()); - if (BatchReader::IsEofBatch(batch_with_bitmap)) { - break; - } - assert(!batch_with_bitmap.second.IsEmpty()); - PAIMON_ASSIGN_OR_RAISE( - batch, ReaderUtils::ApplyBitmapToReadBatch(std::move(batch_with_bitmap), - arrow::default_memory_pool())); - } else { - return batch_result.status(); - } - } else { - batch = std::move(batch_result).value(); - if (BatchReader::IsEofBatch(batch)) { - break; - } + while (true) { + PAIMON_ASSIGN_OR_RAISE(auto result_array, ReadOneBatch(batch_reader)); + if (result_array == nullptr) { + break; } - auto& [c_array, c_schema] = batch; - assert(c_array->length > 0); - PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(auto result_array, - arrow::ImportArray(c_array.get(), c_schema.get())); result_array_vector.push_back(result_array); - ++read_batch_count; if (max_data_processing_time_in_us > 0) { usleep(std::rand() % max_data_processing_time_in_us); } @@ -131,6 +102,35 @@ class ReadResultCollector { return chunk_array; } + static Result> CollectResult(BatchReader* batch_reader) { + return CollectResult(batch_reader, /*max simulated data processing time*/ 0); + } + + static Result> CollectResultOneBatch( + BatchReader* batch_reader) { + return CollectResultOneBatch(batch_reader, /*max_simulated_data_processing_time*/ 0); + } + + static Result> CollectResultOneBatch( + BatchReader* batch_reader, int64_t max_data_processing_time_in_us) { + int64_t seed = DateTimeUtils::GetCurrentUTCTimeUs(); + std::srand(seed); + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr result_array, + ReadOneBatch(batch_reader)); + if (result_array == nullptr) { + return std::shared_ptr(); + } + PAIMON_ASSIGN_OR_RAISE( + auto converted_array, + DictArrayConverter::ConvertDictArray(result_array, arrow::default_memory_pool())); + if (max_data_processing_time_in_us > 0) { + usleep(std::rand() % max_data_processing_time_in_us); + } + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr chunk_array, + arrow::ChunkedArray::Make({converted_array})); + return chunk_array; + } + static Result> GetArray(BatchReader::ReadBatch&& batch) { if (BatchReader::IsEofBatch(batch)) { return std::shared_ptr(); @@ -165,5 +165,39 @@ class ReadResultCollector { arrow::compute::Take(arrow::Datum(array), arrow::Datum(sorted_indices))); return sorted_batch.chunked_array(); } + + private: + static Result> ReadOneBatch(BatchReader* batch_reader) { + // Prioritize calling NextBatch. If it fails (paimon inner reader e.g., + // PrefetchBatchReader, ApplyBitmapIndexBatchReader...), call NextBatchWithBitmap. + auto batch_result = batch_reader->NextBatch(); + BatchReader::ReadBatch batch; + if (!batch_result.ok()) { + if (batch_result.status().ToString().find("should use NextBatchWithBitmap") != + std::string::npos) { + PAIMON_ASSIGN_OR_RAISE(BatchReader::ReadBatchWithBitmap batch_with_bitmap, + batch_reader->NextBatchWithBitmap()); + if (BatchReader::IsEofBatch(batch_with_bitmap)) { + return std::shared_ptr(); + } + assert(!batch_with_bitmap.second.IsEmpty()); + PAIMON_ASSIGN_OR_RAISE( + batch, ReaderUtils::ApplyBitmapToReadBatch(std::move(batch_with_bitmap), + arrow::default_memory_pool())); + } else { + return batch_result.status(); + } + } else { + batch = std::move(batch_result).value(); + if (BatchReader::IsEofBatch(batch)) { + return std::shared_ptr(); + } + } + auto& [c_array, c_schema] = batch; + assert(c_array->length > 0); + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(auto result_array, + arrow::ImportArray(c_array.get(), c_schema.get())); + return result_array; + } }; } // namespace paimon::test From db4b42e0d4ad35f181b542b6035028cbafcb947b Mon Sep 17 00:00:00 2001 From: Zhou Hongfeng <87103887+zhf999@users.noreply.github.com> Date: Tue, 30 Jun 2026 10:31:39 +0800 Subject: [PATCH 24/27] Update src/paimon/format/parquet/parquet_file_batch_reader.cpp Co-authored-by: Zhang Jiawei <30893610+zjw1111@users.noreply.github.com> --- src/paimon/format/parquet/parquet_file_batch_reader.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/paimon/format/parquet/parquet_file_batch_reader.cpp b/src/paimon/format/parquet/parquet_file_batch_reader.cpp index 3fc379006..3fe9f340d 100644 --- a/src/paimon/format/parquet/parquet_file_batch_reader.cpp +++ b/src/paimon/format/parquet/parquet_file_batch_reader.cpp @@ -215,9 +215,7 @@ Status ParquetFileBatchReader::SetReadSchema( /*ranges=*/it->second); } else { target_row_groups.emplace_back( - /*rg_index=*/rg_id, - /*is_partially_matched=*/false, - /*ranges=*/ + /*rg_index=*/rg_id, /*is_partially_matched=*/false, /*ranges=*/ RowRanges(Range(0, reader_->GetAllRowGroupRanges()[rg_id].second - reader_->GetAllRowGroupRanges()[rg_id].first - 1))); } From dd85c5745e27902a909549fa2df12fb0cc6fdf0c Mon Sep 17 00:00:00 2001 From: zhouhongfeng Date: Tue, 30 Jun 2026 15:08:23 +0800 Subject: [PATCH 25/27] fix: aligned all file_batch_reader to behavior the same when 'GetPreviousBatchFileRowId' is called --- .../format/avro/avro_file_batch_reader.cpp | 3 +++ .../format/avro/avro_file_batch_reader.h | 13 ++++++++++ .../avro/avro_file_batch_reader_test.cpp | 6 ++--- .../format/blob/blob_file_batch_reader.cpp | 4 ++- .../format/blob/blob_file_batch_reader.h | 15 +++++++++-- .../blob/blob_file_batch_reader_test.cpp | 2 +- .../format/lance/lance_file_batch_reader.cpp | 6 ++--- .../format/lance/lance_file_batch_reader.h | 16 +++++++++--- .../format/orc/orc_file_batch_reader.cpp | 9 ++++++- src/paimon/format/orc/orc_file_batch_reader.h | 17 ++++++++++++- .../format/orc/orc_file_batch_reader_test.cpp | 25 +++++++++++++------ .../parquet/parquet_file_batch_reader.h | 10 +++++--- 12 files changed, 100 insertions(+), 26 deletions(-) diff --git a/src/paimon/format/avro/avro_file_batch_reader.cpp b/src/paimon/format/avro/avro_file_batch_reader.cpp index 0c55e0260..eb27049d0 100644 --- a/src/paimon/format/avro/avro_file_batch_reader.cpp +++ b/src/paimon/format/avro/avro_file_batch_reader.cpp @@ -116,6 +116,7 @@ Result AvroFileBatchReader::NextBatch() { previous_first_row_ = next_row_to_read_; next_row_to_read_ += array_builder_->length(); if (array_builder_->length() == 0) { + previous_batch_row_count_ = 0; return BatchReader::MakeEofBatch(); } PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr array, @@ -123,6 +124,7 @@ Result AvroFileBatchReader::NextBatch() { std::unique_ptr c_array = std::make_unique(); std::unique_ptr c_schema = std::make_unique(); PAIMON_RETURN_NOT_OK_FROM_ARROW(arrow::ExportArray(*array, c_array.get(), c_schema.get())); + previous_batch_row_count_ = c_array->length; return make_pair(std::move(c_array), std::move(c_schema)); } catch (const ::avro::Exception& e) { return Status::Invalid(fmt::format("avro reader next batch failed. {}", e.what())); @@ -168,6 +170,7 @@ Status AvroFileBatchReader::SetReadSchema(::ArrowSchema* read_schema, reader_ = std::move(reader); array_builder_ = std::move(array_builder); previous_first_row_ = std::numeric_limits::max(); + previous_batch_row_count_ = 0; next_row_to_read_ = std::numeric_limits::max(); close_ = false; return Status::OK(); diff --git a/src/paimon/format/avro/avro_file_batch_reader.h b/src/paimon/format/avro/avro_file_batch_reader.h index a90d82c5a..b70cdb784 100644 --- a/src/paimon/format/avro/avro_file_batch_reader.h +++ b/src/paimon/format/avro/avro_file_batch_reader.h @@ -46,6 +46,18 @@ class AvroFileBatchReader : public FileBatchReader { const std::optional& selection_bitmap) override; Result GetPreviousBatchFileRowId(uint64_t batch_row_id) const override { + if (previous_batch_row_count_ == 0) { + if (previous_first_row_ == std::numeric_limits::max()) { + return Status::Invalid("No batch has been read yet."); + } else { + return Status::Invalid("Last batch was EOF."); + } + } + if (batch_row_id >= previous_batch_row_count_) { + return Status::Invalid( + fmt::format("batch_row_id {} is out of range, last batch row count is {}", + batch_row_id, previous_batch_row_count_)); + } return previous_first_row_ + batch_row_id; } @@ -90,6 +102,7 @@ class AvroFileBatchReader : public FileBatchReader { std::optional> read_fields_projection_; uint64_t previous_first_row_ = std::numeric_limits::max(); uint64_t next_row_to_read_ = std::numeric_limits::max(); + uint64_t previous_batch_row_count_ = 0; mutable std::optional total_rows_ = std::nullopt; const int32_t batch_size_; bool close_ = false; diff --git a/src/paimon/format/avro/avro_file_batch_reader_test.cpp b/src/paimon/format/avro/avro_file_batch_reader_test.cpp index 2e1e00c42..54e598c6d 100644 --- a/src/paimon/format/avro/avro_file_batch_reader_test.cpp +++ b/src/paimon/format/avro/avro_file_batch_reader_test.cpp @@ -352,7 +352,7 @@ TEST_F(AvroFileBatchReaderTest, TestGetPreviousBatchFileRowId) { ASSERT_OK_AND_ASSIGN(auto num_rows, reader->GetNumberOfRows()); ASSERT_EQ(4, num_rows); - ASSERT_EQ(std::numeric_limits::max(), reader->GetPreviousBatchFileRowId(0).value()); + ASSERT_NOK(reader->GetPreviousBatchFileRowId(0)); ASSERT_OK_AND_ASSIGN(auto batch1, reader->NextBatch()); ArrowArrayRelease(batch1.first.get()); ArrowSchemaRelease(batch1.second.get()); @@ -370,7 +370,7 @@ TEST_F(AvroFileBatchReaderTest, TestGetPreviousBatchFileRowId) { ArrowArrayRelease(batch4.first.get()); ArrowSchemaRelease(batch4.second.get()); ASSERT_OK_AND_ASSIGN(auto batch5, reader->NextBatch()); - ASSERT_EQ(4, reader->GetPreviousBatchFileRowId(0).value()); + ASSERT_NOK(reader->GetPreviousBatchFileRowId(0)); ASSERT_TRUE(BatchReader::IsEofBatch(batch5)); } @@ -406,7 +406,7 @@ TEST_F(AvroFileBatchReaderTest, TestSetReadSchemaResetsReaderToFirstRow) { ASSERT_TRUE(arrow::ExportSchema(*read_schema, c_schema.get()).ok()); ASSERT_OK(reader->SetReadSchema(c_schema.get(), /*predicate=*/nullptr, /*selection_bitmap=*/std::nullopt)); - ASSERT_EQ(std::numeric_limits::max(), reader->GetPreviousBatchFileRowId(0).value()); + ASSERT_NOK(reader->GetPreviousBatchFileRowId(0)); ASSERT_OK_AND_ASSIGN(auto projected_batch, reader->NextBatch()); ASSERT_EQ(0, reader->GetPreviousBatchFileRowId(0).value()); diff --git a/src/paimon/format/blob/blob_file_batch_reader.cpp b/src/paimon/format/blob/blob_file_batch_reader.cpp index 08fa6149b..d5b5a2fe2 100644 --- a/src/paimon/format/blob/blob_file_batch_reader.cpp +++ b/src/paimon/format/blob/blob_file_batch_reader.cpp @@ -157,7 +157,7 @@ Status BlobFileBatchReader::SetReadSchema(::ArrowSchema* read_schema, target_type_ = arrow::struct_(arrow_schema->fields()); current_pos_ = 0; previous_batch_first_row_number_ = std::numeric_limits::max(); - + previous_batch_row_count_ = 0; return Status::OK(); } @@ -292,6 +292,7 @@ Result BlobFileBatchReader::NextBatch() { } if (current_pos_ >= target_blob_lengths_.size()) { PAIMON_ASSIGN_OR_RAISE(previous_batch_first_row_number_, GetNumberOfRows()); + previous_batch_row_count_ = 0; return BatchReader::MakeEofBatch(); } int32_t left_rows = target_blob_lengths_.size() - current_pos_; @@ -303,6 +304,7 @@ Result BlobFileBatchReader::NextBatch() { PAIMON_RETURN_NOT_OK_FROM_ARROW(arrow::ExportArray(*blob_array, c_array.get(), c_schema.get())); previous_batch_first_row_number_ = target_blob_row_indexes_[current_pos_]; current_pos_ += rows_to_read; + previous_batch_row_count_ = c_array->length; return make_pair(std::move(c_array), std::move(c_schema)); } diff --git a/src/paimon/format/blob/blob_file_batch_reader.h b/src/paimon/format/blob/blob_file_batch_reader.h index 998939c8a..27549c190 100644 --- a/src/paimon/format/blob/blob_file_batch_reader.h +++ b/src/paimon/format/blob/blob_file_batch_reader.h @@ -24,6 +24,7 @@ #include "arrow/memory_pool.h" #include "arrow/type.h" +#include "fmt/format.h" #include "paimon/common/data/blob_defs.h" #include "paimon/fs/file_system.h" #include "paimon/memory/bytes.h" @@ -104,8 +105,17 @@ class BlobFileBatchReader : public FileBatchReader { "bitmap pushdown, rows in the array returned by NextBatch are no longer " "contiguous."); } - if (previous_batch_first_row_number_ == std::numeric_limits::max()) { - return Status::Invalid("No batch has been read yet."); + if (previous_batch_row_count_ == 0) { + if (previous_batch_first_row_number_ == std::numeric_limits::max()) { + return Status::Invalid("No batch has been read yet."); + } else { + return Status::Invalid("Last batch was EOF."); + } + } + if (batch_row_id >= previous_batch_row_count_) { + return Status::Invalid( + fmt::format("batch_row_id {} is out of range, last batch row count is {}", + batch_row_id, previous_batch_row_count_)); } return previous_batch_first_row_number_ + batch_row_id; } @@ -177,6 +187,7 @@ class BlobFileBatchReader : public FileBatchReader { size_t current_pos_ = 0; uint64_t previous_batch_first_row_number_ = std::numeric_limits::max(); + uint64_t previous_batch_row_count_ = 0; bool closed_ = false; }; diff --git a/src/paimon/format/blob/blob_file_batch_reader_test.cpp b/src/paimon/format/blob/blob_file_batch_reader_test.cpp index c9d5dbd95..c372fe2a2 100644 --- a/src/paimon/format/blob/blob_file_batch_reader_test.cpp +++ b/src/paimon/format/blob/blob_file_batch_reader_test.cpp @@ -183,7 +183,7 @@ TEST_F(BlobFileBatchReaderTest, TestRowNumbers) { ArrowArrayRelease(batch3.first.get()); ArrowSchemaRelease(batch3.second.get()); ASSERT_OK_AND_ASSIGN(auto batch4, reader->NextBatch()); - ASSERT_EQ(3, reader->GetPreviousBatchFileRowId(0).value()); + ASSERT_NOK(reader->GetPreviousBatchFileRowId(0)); ASSERT_TRUE(BatchReader::IsEofBatch(batch4)); } diff --git a/src/paimon/format/lance/lance_file_batch_reader.cpp b/src/paimon/format/lance/lance_file_batch_reader.cpp index 1ae1deb84..acbcb835a 100644 --- a/src/paimon/format/lance/lance_file_batch_reader.cpp +++ b/src/paimon/format/lance/lance_file_batch_reader.cpp @@ -96,7 +96,7 @@ Status LanceFileBatchReader::SetReadSchema(::ArrowSchema* read_schema, PAIMON_RETURN_NOT_OK(LanceToPaimonStatus(err_code, error_message_)); stream_reader_ = nullptr; previous_batch_first_row_num_ = std::numeric_limits::max(); - last_batch_row_num_ = 0; + previous_batch_row_count_ = 0; } return Status::OK(); } @@ -119,7 +119,7 @@ Result LanceFileBatchReader::NextBatch() { // first read previous_batch_first_row_num_ = 0; } else { - previous_batch_first_row_num_ += last_batch_row_num_; + previous_batch_first_row_num_ += previous_batch_row_count_; } auto c_array = std::make_unique(); auto c_schema = std::make_unique(); @@ -130,7 +130,7 @@ Result LanceFileBatchReader::NextBatch() { if (is_eof) { return BatchReader::MakeEofBatch(); } - last_batch_row_num_ = c_array->length; + previous_batch_row_count_ = c_array->length; return std::make_pair(std::move(c_array), std::move(c_schema)); } diff --git a/src/paimon/format/lance/lance_file_batch_reader.h b/src/paimon/format/lance/lance_file_batch_reader.h index 5cfbdb981..129fdcc2d 100644 --- a/src/paimon/format/lance/lance_file_batch_reader.h +++ b/src/paimon/format/lance/lance_file_batch_reader.h @@ -22,6 +22,7 @@ #include #include "arrow/c/bridge.h" +#include "fmt/format.h" #include "lance_lib/lance_api.h" #include "paimon/metrics.h" #include "paimon/reader/file_batch_reader.h" @@ -49,8 +50,17 @@ class LanceFileBatchReader : public FileBatchReader { "bitmap pushdown, rows in the array returned by NextBatch are no longer " "contiguous."); } - if (previous_batch_first_row_num_ == std::numeric_limits::max()) { - return Status::Invalid("No batch has been read yet"); + if (previous_batch_row_count_ == 0) { + if (previous_batch_first_row_num_ == std::numeric_limits::max()) { + return Status::Invalid("No batch has been read yet."); + } else { + return Status::Invalid("Last batch was EOF."); + } + } + if (batch_row_id >= previous_batch_row_count_) { + return Status::Invalid( + fmt::format("batch_row_id {} is out of range, last batch row count is {}", + batch_row_id, previous_batch_row_count_)); } return previous_batch_first_row_num_ + batch_row_id; } @@ -84,7 +94,7 @@ class LanceFileBatchReader : public FileBatchReader { uint64_t num_rows_ = 0; // only validate when there is no bitmap pushdown uint64_t previous_batch_first_row_num_ = std::numeric_limits::max(); - uint64_t last_batch_row_num_ = 0; + uint64_t previous_batch_row_count_ = 0; mutable std::string error_message_; LanceFileReader* file_reader_ = nullptr; LanceReaderAdapter* stream_reader_ = nullptr; diff --git a/src/paimon/format/orc/orc_file_batch_reader.cpp b/src/paimon/format/orc/orc_file_batch_reader.cpp index 76ad15688..45c60b574 100644 --- a/src/paimon/format/orc/orc_file_batch_reader.cpp +++ b/src/paimon/format/orc/orc_file_batch_reader.cpp @@ -166,6 +166,7 @@ Status OrcFileBatchReader::SetReadSchema(::ArrowSchema* read_schema, options_, &target_column_ids)); target_column_ids_ = target_column_ids; + previous_batch_row_count_ = 0; PAIMON_RETURN_NOT_OK(reader_->SetReadSchema(target_type, row_reader_options)); return Status::OK(); } @@ -179,7 +180,13 @@ Result>> OrcFileBatchReader::PreBuffer } Result OrcFileBatchReader::NextBatch() { - return reader_->Next(); + PAIMON_ASSIGN_OR_RAISE(BatchReader::ReadBatch batch, reader_->Next()); + if (BatchReader::IsEofBatch(batch)) { + previous_batch_row_count_ = 0; + } else { + previous_batch_row_count_ = batch.first->length; + } + return batch; } std::shared_ptr OrcFileBatchReader::GetReaderMetrics() const { diff --git a/src/paimon/format/orc/orc_file_batch_reader.h b/src/paimon/format/orc/orc_file_batch_reader.h index 0f60c3779..da9c40696 100644 --- a/src/paimon/format/orc/orc_file_batch_reader.h +++ b/src/paimon/format/orc/orc_file_batch_reader.h @@ -63,7 +63,20 @@ class OrcFileBatchReader : public PrefetchFileBatchReader { Result NextBatch() override; Result GetPreviousBatchFileRowId(uint64_t batch_row_id) const override { - return reader_->GetRowNumber() + batch_row_id; + uint64_t previous_first_row = reader_->GetRowNumber(); + if (previous_batch_row_count_ == 0) { + if (previous_first_row == std::numeric_limits::max()) { + return Status::Invalid("No batch has been read yet."); + } else { + return Status::Invalid("Last batch was EOF."); + } + } + if (batch_row_id >= previous_batch_row_count_) { + return Status::Invalid( + fmt::format("batch_row_id {} is out of range, last batch row count is {}", + batch_row_id, previous_batch_row_count_)); + } + return previous_first_row + batch_row_id; } Result GetNumberOfRows() const override { @@ -120,6 +133,8 @@ class OrcFileBatchReader : public PrefetchFileBatchReader { std::shared_ptr metrics_; std::vector target_column_ids_; std::vector> cache_ranges_; + + uint64_t previous_batch_row_count_ = 0; }; } // namespace paimon::orc diff --git a/src/paimon/format/orc/orc_file_batch_reader_test.cpp b/src/paimon/format/orc/orc_file_batch_reader_test.cpp index 87490e0a2..b9e781553 100644 --- a/src/paimon/format/orc/orc_file_batch_reader_test.cpp +++ b/src/paimon/format/orc/orc_file_batch_reader_test.cpp @@ -493,13 +493,22 @@ TEST_P(OrcFileBatchReaderTest, TestNextBatchSimple) { for (auto batch_size : {1, 2, 3, 5, 8, 10}) { auto orc_batch_reader = PrepareOrcFileBatchReader(file_name, &read_schema, batch_size, natural_read_size); - ASSERT_EQ(orc_batch_reader->GetPreviousBatchFileRowId(0).value(), -1); - ASSERT_OK_AND_ASSIGN(auto result_array, paimon::test::ReadResultCollector::CollectResult( - orc_batch_reader.get())); - ASSERT_EQ(orc_batch_reader->GetPreviousBatchFileRowId(0).value(), 8); + ASSERT_NOK(orc_batch_reader->GetPreviousBatchFileRowId(0)); + int i = 0; + while (true) { + ASSERT_OK_AND_ASSIGN( + auto result_array, + paimon::test::ReadResultCollector::CollectResultOneBatch(orc_batch_reader.get())); + if (!result_array) { + ASSERT_NOK(orc_batch_reader->GetPreviousBatchFileRowId(0)); + break; + } + ASSERT_EQ(orc_batch_reader->GetPreviousBatchFileRowId(0).value(), i * batch_size); + ASSERT_TRUE(result_array->Equals(std::make_shared( + struct_array_->Slice(i * batch_size, result_array->length())))); + i++; + } orc_batch_reader->Close(); - auto expected_array = std::make_shared(struct_array_); - ASSERT_TRUE(result_array->Equals(expected_array)); // test metrics auto reader_metrics = orc_batch_reader->GetReaderMetrics(); ASSERT_OK_AND_ASSIGN(uint64_t io_count, @@ -767,7 +776,7 @@ TEST_F(OrcFileBatchReaderTest, TestReadNoField) { auto orc_batch_reader = PrepareOrcFileBatchReader(file_name, &read_schema, /*batch_size=*/3, /*natural_read_size=*/10); // read 3 rows - ASSERT_EQ(orc_batch_reader->GetPreviousBatchFileRowId(0).value(), -1); + ASSERT_NOK(orc_batch_reader->GetPreviousBatchFileRowId(0)); ASSERT_OK_AND_ASSIGN(auto batch1, orc_batch_reader->NextBatch()); ASSERT_EQ(orc_batch_reader->GetPreviousBatchFileRowId(0).value(), 0); // read 3 rows @@ -778,7 +787,7 @@ TEST_F(OrcFileBatchReaderTest, TestReadNoField) { ASSERT_EQ(orc_batch_reader->GetPreviousBatchFileRowId(0).value(), 6); // read rows with eof ASSERT_OK_AND_ASSIGN(auto batch4, orc_batch_reader->NextBatch()); - ASSERT_EQ(orc_batch_reader->GetPreviousBatchFileRowId(0).value(), 8); + ASSERT_NOK(orc_batch_reader->GetPreviousBatchFileRowId(0)); ASSERT_TRUE(BatchReader::IsEofBatch(batch4)); orc_batch_reader->Close(); diff --git a/src/paimon/format/parquet/parquet_file_batch_reader.h b/src/paimon/format/parquet/parquet_file_batch_reader.h index e103694c0..b020f7e39 100644 --- a/src/paimon/format/parquet/parquet_file_batch_reader.h +++ b/src/paimon/format/parquet/parquet_file_batch_reader.h @@ -98,9 +98,13 @@ class ParquetFileBatchReader : public PrefetchFileBatchReader { Result GetPreviousBatchFileRowId(uint64_t batch_row_id) const override { if (row_mapping_.size() == 0) { - return Status::Invalid( - "Last batch is not read or last batch is empty, cannot get previous batch global " - "row id"); + PAIMON_ASSIGN_OR_RAISE(uint64_t previous_first_row, + reader_->GetPreviousBatchFirstRowNumber()); + if (previous_first_row == std::numeric_limits::max()) { + return Status::Invalid("No batch has been read yet."); + } else { + return Status::Invalid("Last batch was EOF."); + } } if (batch_row_id >= row_mapping_.size()) { return Status::Invalid( From d5c6ca015e8d98b6b76a2c369163404f70c66274 Mon Sep 17 00:00:00 2001 From: zhouhongfeng Date: Tue, 30 Jun 2026 17:48:40 +0800 Subject: [PATCH 26/27] fix: correct the filter logic of PrefetchFileBatchReaderImpl --- .../prefetch_file_batch_reader_impl.cpp | 27 ++++++++++++------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/src/paimon/common/reader/prefetch_file_batch_reader_impl.cpp b/src/paimon/common/reader/prefetch_file_batch_reader_impl.cpp index 5940a4d48..2f3db7817 100644 --- a/src/paimon/common/reader/prefetch_file_batch_reader_impl.cpp +++ b/src/paimon/common/reader/prefetch_file_batch_reader_impl.cpp @@ -439,26 +439,35 @@ Status PrefetchFileBatchReaderImpl::HandleReadResult( return Status::OK(); } auto [slice_begin, slice_end] = ComputeBatchSliceByReadRange(global_row_ids, read_range); - if (slice_begin >= slice_end) { - readers_pos_[reader_idx]->store(read_range.second); + // slice_begin should always be 0, records before read_range.first have been consumed or + // filtered out. + if (slice_begin != 0) { + return Status::Invalid(fmt::format("Slice begin is {}, which is not 0.", slice_begin)); + } + + if (0 == slice_end) { + // fully out of range, data before first_row_number has been filtered out + readers_pos_[reader_idx]->store(global_row_ids[0]); ReaderUtils::ReleaseReadBatch(std::move(read_batch)); return Status::OK(); - } else if (slice_begin > 0 || slice_end < c_array->length) { + } else if (slice_end < c_array->length) { + // partially out of range, data before read_range.second has been effectively consumed readers_pos_[reader_idx]->store(read_range.second); PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr src_array, arrow::ImportArray(c_array.get(), c_schema.get())); - auto array = src_array->Slice(slice_begin, slice_end - slice_begin); + auto array = src_array->Slice(0, slice_end); PAIMON_RETURN_NOT_OK_FROM_ARROW( arrow::ExportArray(*array, c_array.get(), c_schema.get())); RoaringBitmap32 sliced_bitmap; - for (auto iter = bitmap.EqualOrLarger(slice_begin); - iter != bitmap.End() && *iter < slice_end; ++iter) { - sliced_bitmap.Add(*iter - slice_begin); + for (auto iter = bitmap.Begin(); iter != bitmap.End() && *iter < slice_end; ++iter) { + sliced_bitmap.Add(*iter); } bitmap = std::move(sliced_bitmap); - global_row_ids = std::vector(global_row_ids.begin() + slice_begin, - global_row_ids.begin() + slice_end); + global_row_ids = + std::vector(global_row_ids.begin(), global_row_ids.begin() + slice_end); } else { + // all within the range, data before readers_[reader_idx]->GetNextRowToRead() has been + // effectively consumed readers_pos_[reader_idx]->store(readers_[reader_idx]->GetNextRowToRead()); } if (bitmap.IsEmpty()) { From c620a633c8b703e09812919e74c51f26b86f8d2b Mon Sep 17 00:00:00 2001 From: zhouhongfeng Date: Tue, 30 Jun 2026 17:53:02 +0800 Subject: [PATCH 27/27] fix: OrcReaderReader reset next_row_ in SetReadSchema --- src/paimon/format/orc/orc_reader_wrapper.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/paimon/format/orc/orc_reader_wrapper.cpp b/src/paimon/format/orc/orc_reader_wrapper.cpp index 772531684..232b91984 100644 --- a/src/paimon/format/orc/orc_reader_wrapper.cpp +++ b/src/paimon/format/orc/orc_reader_wrapper.cpp @@ -49,6 +49,7 @@ Status OrcReaderWrapper::SetReadSchema(const std::shared_ptr& t try { row_reader_ = reader_->createRowReader(row_reader_options); target_type_ = target_type; + next_row_ = 0; } catch (const std::exception& e) { return Status::Invalid( fmt::format("orc file batch reader create row reader failed for file {}, with {} error",