From 05d37fcf8050d9b7146e255c687c7ff84f13f9ab Mon Sep 17 00:00:00 2001 From: wecharyu Date: Tue, 17 Mar 2026 00:57:49 +0800 Subject: [PATCH 1/7] GH-48467: [C++][Parquet] Add total_buffered_bytes() API for RowGroupWriter --- cpp/src/parquet/column_writer.cc | 4 ++++ cpp/src/parquet/column_writer.h | 3 +++ cpp/src/parquet/file_writer.cc | 19 +++++++++++++++++++ cpp/src/parquet/file_writer.h | 5 +++++ 4 files changed, 31 insertions(+) diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 1f1197f95a0e..be976730215f 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -1489,6 +1489,10 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, return current_encoder_->EstimatedDataEncodedSize(); } + int64_t EstimatedBufferedLevelsBytes() const override { + return definition_levels_sink_.length() + repetition_levels_sink_.length(); + } + protected: std::shared_ptr GetValuesBuffer() override { return current_encoder_->FlushValues(); diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h index e0d4b5234bc3..6a077feb22f0 100644 --- a/cpp/src/parquet/column_writer.h +++ b/cpp/src/parquet/column_writer.h @@ -165,6 +165,9 @@ class PARQUET_EXPORT ColumnWriter { /// \brief Estimated size of the values that are not written to a page yet. virtual int64_t estimated_buffered_value_bytes() const = 0; + /// \brief Estimated size of the levels that are not written to a page yet. + virtual int64_t EstimatedBufferedLevelsBytes() const = 0; + /// \brief The file-level writer properties virtual const WriterProperties* properties() = 0; diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 842e667e8a7c..d494f2def0f4 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -69,6 +69,10 @@ int64_t RowGroupWriter::total_compressed_bytes_written() const { return contents_->total_compressed_bytes_written(); } +int64_t RowGroupWriter::total_buffered_bytes() const { + return contents_->EstimatedBufferedBytes(); +} + bool RowGroupWriter::buffered() const { return contents_->buffered(); } int RowGroupWriter::current_column() { return contents_->current_column(); } @@ -198,6 +202,21 @@ class RowGroupSerializer : public RowGroupWriter::Contents { return total_compressed_bytes_written; } + int64_t EstimatedBufferedBytes() const override { + if (closed_) { + return 0; + } + int64_t estimated_buffered_value_bytes = 0; + for (size_t i = 0; i < column_writers_.size(); i++) { + if (column_writers_[i]) { + estimated_buffered_value_bytes += + column_writers_[i]->estimated_buffered_value_bytes() + + column_writers_[i]->EstimatedBufferedLevelsBytes(); + } + } + return estimated_buffered_value_bytes; + } + bool buffered() const override { return buffered_row_group_; } void Close() override { diff --git a/cpp/src/parquet/file_writer.h b/cpp/src/parquet/file_writer.h index d5ea1d7c98a0..8525af37a76c 100644 --- a/cpp/src/parquet/file_writer.h +++ b/cpp/src/parquet/file_writer.h @@ -58,6 +58,9 @@ class PARQUET_EXPORT RowGroupWriter { virtual int64_t total_compressed_bytes() const = 0; /// \brief total compressed bytes written by the page writer virtual int64_t total_compressed_bytes_written() const = 0; + /// \brief Estimated bytes of values and levels that are buffered by the page writer + /// but not written to a page yet + virtual int64_t EstimatedBufferedBytes() const = 0; virtual bool buffered() const = 0; }; @@ -99,6 +102,8 @@ class PARQUET_EXPORT RowGroupWriter { int64_t total_compressed_bytes() const; /// \brief total compressed bytes written by the page writer int64_t total_compressed_bytes_written() const; + /// \brief total bytes of values and levels that are buffered by the page writer + int64_t total_buffered_bytes() const; /// Returns whether the current RowGroupWriter is in the buffered mode and is created /// by calling ParquetFileWriter::AppendBufferedRowGroup. From 618d293c05051681cdd30030157f21aa26b57f55 Mon Sep 17 00:00:00 2001 From: wecharyu Date: Thu, 19 Mar 2026 01:22:50 +0800 Subject: [PATCH 2/7] use BufferedStats for buffered data --- cpp/src/parquet/column_writer.cc | 15 +++++++++++++-- cpp/src/parquet/column_writer.h | 10 ++++++++-- cpp/src/parquet/file_writer.cc | 19 ++++++++++--------- cpp/src/parquet/file_writer.h | 18 +++++++++++++----- 4 files changed, 44 insertions(+), 18 deletions(-) diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index be976730215f..61507c2ba196 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -1489,8 +1489,19 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, return current_encoder_->EstimatedDataEncodedSize(); } - int64_t EstimatedBufferedLevelsBytes() const override { - return definition_levels_sink_.length() + repetition_levels_sink_.length(); + int64_t estimated_buffered_def_level_bytes() const override { + return definition_levels_sink_.length(); + } + + int64_t estimated_buffered_rep_level_bytes() const override { + return repetition_levels_sink_.length(); + } + + int64_t estimated_buffered_dict_bytes() const override { + if (current_dict_encoder_) { + return current_dict_encoder_->dict_encoded_size(); + } + return 0; } protected: diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h index 6a077feb22f0..5ad58c5ecf21 100644 --- a/cpp/src/parquet/column_writer.h +++ b/cpp/src/parquet/column_writer.h @@ -165,8 +165,14 @@ class PARQUET_EXPORT ColumnWriter { /// \brief Estimated size of the values that are not written to a page yet. virtual int64_t estimated_buffered_value_bytes() const = 0; - /// \brief Estimated size of the levels that are not written to a page yet. - virtual int64_t EstimatedBufferedLevelsBytes() const = 0; + /// \brief Estimated size of the definition levels that are not written to a page yet. + virtual int64_t estimated_buffered_def_level_bytes() const = 0; + + /// \brief Estimated size of the repetition levels that are not written to a page yet. + virtual int64_t estimated_buffered_rep_level_bytes() const = 0; + + /// \brief Estimated size of the dictionary that are not written to a page yet. + virtual int64_t estimated_buffered_dict_bytes() const = 0; /// \brief The file-level writer properties virtual const WriterProperties* properties() = 0; diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index d494f2def0f4..08d5c2ca711e 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -69,8 +69,8 @@ int64_t RowGroupWriter::total_compressed_bytes_written() const { return contents_->total_compressed_bytes_written(); } -int64_t RowGroupWriter::total_buffered_bytes() const { - return contents_->EstimatedBufferedBytes(); +BufferedStats RowGroupWriter::estimated_buffered_stats() const { + return contents_->EstimatedBufferedStats(); } bool RowGroupWriter::buffered() const { return contents_->buffered(); } @@ -202,19 +202,20 @@ class RowGroupSerializer : public RowGroupWriter::Contents { return total_compressed_bytes_written; } - int64_t EstimatedBufferedBytes() const override { + BufferedStats EstimatedBufferedStats() const override { + BufferedStats stats; if (closed_) { - return 0; + return stats; } - int64_t estimated_buffered_value_bytes = 0; for (size_t i = 0; i < column_writers_.size(); i++) { if (column_writers_[i]) { - estimated_buffered_value_bytes += - column_writers_[i]->estimated_buffered_value_bytes() + - column_writers_[i]->EstimatedBufferedLevelsBytes(); + stats.def_level_bytes += column_writers_[i]->estimated_buffered_def_level_bytes(); + stats.rep_level_bytes += column_writers_[i]->estimated_buffered_rep_level_bytes(); + stats.value_bytes += column_writers_[i]->estimated_buffered_value_bytes(); + stats.dict_bytes += column_writers_[i]->estimated_buffered_dict_bytes(); } } - return estimated_buffered_value_bytes; + return stats; } bool buffered() const override { return buffered_row_group_; } diff --git a/cpp/src/parquet/file_writer.h b/cpp/src/parquet/file_writer.h index 8525af37a76c..d5d753bfd0b9 100644 --- a/cpp/src/parquet/file_writer.h +++ b/cpp/src/parquet/file_writer.h @@ -34,6 +34,13 @@ class ColumnWriter; static constexpr uint8_t kParquetMagic[4] = {'P', 'A', 'R', '1'}; static constexpr uint8_t kParquetEMagic[4] = {'P', 'A', 'R', 'E'}; +struct PARQUET_EXPORT BufferedStats { + int64_t def_level_bytes = 0; + int64_t rep_level_bytes = 0; + int64_t value_bytes = 0; + int64_t dict_bytes = 0; +}; + class PARQUET_EXPORT RowGroupWriter { public: // Forward declare a virtual class 'Contents' to aid dependency injection and more @@ -58,9 +65,9 @@ class PARQUET_EXPORT RowGroupWriter { virtual int64_t total_compressed_bytes() const = 0; /// \brief total compressed bytes written by the page writer virtual int64_t total_compressed_bytes_written() const = 0; - /// \brief Estimated bytes of values and levels that are buffered by the page writer - /// but not written to a page yet - virtual int64_t EstimatedBufferedBytes() const = 0; + /// \brief Estimated sizes of buffered data (levels, values, dict) not yet + /// written to a page. + virtual BufferedStats EstimatedBufferedStats() const = 0; virtual bool buffered() const = 0; }; @@ -102,8 +109,9 @@ class PARQUET_EXPORT RowGroupWriter { int64_t total_compressed_bytes() const; /// \brief total compressed bytes written by the page writer int64_t total_compressed_bytes_written() const; - /// \brief total bytes of values and levels that are buffered by the page writer - int64_t total_buffered_bytes() const; + /// \brief Estimated sizes of buffered data (levels, values, dict) not yet + /// written to a page. + BufferedStats estimated_buffered_stats() const; /// Returns whether the current RowGroupWriter is in the buffered mode and is created /// by calling ParquetFileWriter::AppendBufferedRowGroup. From 9249f00fd8b4ddd1f5a7b108180bec0417f6b592 Mon Sep 17 00:00:00 2001 From: wecharyu Date: Sun, 22 Mar 2026 15:44:47 +0800 Subject: [PATCH 3/7] add comment for BufferedStats --- cpp/src/parquet/file_writer.cc | 6 +++--- cpp/src/parquet/file_writer.h | 16 +++++++++------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 08d5c2ca711e..5e677ba470c3 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -69,7 +69,7 @@ int64_t RowGroupWriter::total_compressed_bytes_written() const { return contents_->total_compressed_bytes_written(); } -BufferedStats RowGroupWriter::estimated_buffered_stats() const { +RowGroupWriter::BufferedStats RowGroupWriter::estimated_buffered_stats() const { return contents_->EstimatedBufferedStats(); } @@ -202,8 +202,8 @@ class RowGroupSerializer : public RowGroupWriter::Contents { return total_compressed_bytes_written; } - BufferedStats EstimatedBufferedStats() const override { - BufferedStats stats; + RowGroupWriter::BufferedStats EstimatedBufferedStats() const override { + RowGroupWriter::BufferedStats stats; if (closed_) { return stats; } diff --git a/cpp/src/parquet/file_writer.h b/cpp/src/parquet/file_writer.h index d5d753bfd0b9..3ca71875d9db 100644 --- a/cpp/src/parquet/file_writer.h +++ b/cpp/src/parquet/file_writer.h @@ -34,15 +34,17 @@ class ColumnWriter; static constexpr uint8_t kParquetMagic[4] = {'P', 'A', 'R', '1'}; static constexpr uint8_t kParquetEMagic[4] = {'P', 'A', 'R', 'E'}; -struct PARQUET_EXPORT BufferedStats { - int64_t def_level_bytes = 0; - int64_t rep_level_bytes = 0; - int64_t value_bytes = 0; - int64_t dict_bytes = 0; -}; - class PARQUET_EXPORT RowGroupWriter { public: + // Estimated uncompressed byte sizes of data buffered by column writers + // that have not yet been serialized into data pages. + struct BufferedStats { + int64_t def_level_bytes = 0; + int64_t rep_level_bytes = 0; + int64_t value_bytes = 0; + int64_t dict_bytes = 0; + }; + // Forward declare a virtual class 'Contents' to aid dependency injection and more // easily create test fixtures // An implementation of the Contents class is defined in the .cc file From fc41f985b01675c27ac4c75625b811c6ab4f346a Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Thu, 26 Mar 2026 10:07:46 +0800 Subject: [PATCH 4/7] Update cpp/src/parquet/file_writer.h Co-authored-by: Zehua Zou --- cpp/src/parquet/file_writer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/file_writer.h b/cpp/src/parquet/file_writer.h index 3ca71875d9db..b7f74685c47d 100644 --- a/cpp/src/parquet/file_writer.h +++ b/cpp/src/parquet/file_writer.h @@ -68,7 +68,7 @@ class PARQUET_EXPORT RowGroupWriter { /// \brief total compressed bytes written by the page writer virtual int64_t total_compressed_bytes_written() const = 0; /// \brief Estimated sizes of buffered data (levels, values, dict) not yet - /// written to a page. + /// written to pages. virtual BufferedStats EstimatedBufferedStats() const = 0; virtual bool buffered() const = 0; From fe2de4b6f83f479eeb018d02ae8b38990e2e3ef8 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Thu, 26 Mar 2026 10:07:55 +0800 Subject: [PATCH 5/7] Update cpp/src/parquet/file_writer.h Co-authored-by: Zehua Zou --- cpp/src/parquet/file_writer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/file_writer.h b/cpp/src/parquet/file_writer.h index b7f74685c47d..df42b6185996 100644 --- a/cpp/src/parquet/file_writer.h +++ b/cpp/src/parquet/file_writer.h @@ -37,7 +37,7 @@ static constexpr uint8_t kParquetEMagic[4] = {'P', 'A', 'R', 'E'}; class PARQUET_EXPORT RowGroupWriter { public: // Estimated uncompressed byte sizes of data buffered by column writers - // that have not yet been serialized into data pages. + // that have not yet been serialized into pages. struct BufferedStats { int64_t def_level_bytes = 0; int64_t rep_level_bytes = 0; From 74ae7056a306d069c890f2d35768a262f84217ae Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Thu, 26 Mar 2026 10:08:07 +0800 Subject: [PATCH 6/7] Update cpp/src/parquet/file_writer.h Co-authored-by: Zehua Zou --- cpp/src/parquet/file_writer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/file_writer.h b/cpp/src/parquet/file_writer.h index df42b6185996..636a7e99572f 100644 --- a/cpp/src/parquet/file_writer.h +++ b/cpp/src/parquet/file_writer.h @@ -112,7 +112,7 @@ class PARQUET_EXPORT RowGroupWriter { /// \brief total compressed bytes written by the page writer int64_t total_compressed_bytes_written() const; /// \brief Estimated sizes of buffered data (levels, values, dict) not yet - /// written to a page. + /// written to pages. BufferedStats estimated_buffered_stats() const; /// Returns whether the current RowGroupWriter is in the buffered mode and is created From 1591aa24d4807c42a884f0b79ce8a80ee4ccaa88 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Thu, 26 Mar 2026 10:08:22 +0800 Subject: [PATCH 7/7] Update cpp/src/parquet/file_writer.cc Co-authored-by: Zehua Zou --- cpp/src/parquet/file_writer.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 5e677ba470c3..ec303408f363 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -207,12 +207,12 @@ class RowGroupSerializer : public RowGroupWriter::Contents { if (closed_) { return stats; } - for (size_t i = 0; i < column_writers_.size(); i++) { - if (column_writers_[i]) { - stats.def_level_bytes += column_writers_[i]->estimated_buffered_def_level_bytes(); - stats.rep_level_bytes += column_writers_[i]->estimated_buffered_rep_level_bytes(); - stats.value_bytes += column_writers_[i]->estimated_buffered_value_bytes(); - stats.dict_bytes += column_writers_[i]->estimated_buffered_dict_bytes(); + for (const auto& column_writer : column_writers_) { + if (column_writer) { + stats.def_level_bytes += column_writer->estimated_buffered_def_level_bytes(); + stats.rep_level_bytes += column_writer->estimated_buffered_rep_level_bytes(); + stats.value_bytes += column_writer->estimated_buffered_value_bytes(); + stats.dict_bytes += column_writer->estimated_buffered_dict_bytes(); } } return stats;