From 21b0c226bf6fbeb28c5990034e3abe14435d5517 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 30 Jun 2026 09:09:28 +0100 Subject: [PATCH 1/3] Don't use ranges --- .../arrow/compute/kernels/vector_select_k.cc | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/vector_select_k.cc b/cpp/src/arrow/compute/kernels/vector_select_k.cc index cc375919658..3239d9d4e1e 100644 --- a/cpp/src/arrow/compute/kernels/vector_select_k.cc +++ b/cpp/src/arrow/compute/kernels/vector_select_k.cc @@ -17,7 +17,6 @@ #include #include -#include #include #include "arrow/compute/function.h" @@ -118,22 +117,22 @@ void HeapSortNonNullsToOutput(std::span non_null_input_range, Comparat return; } std::span heap = non_null_input_range.subspan(0, output_range.size()); - std::ranges::make_heap(heap, cmp); + std::make_heap(heap.begin(), heap.end(), cmp); std::span remaining_input = non_null_input_range.subspan(output_range.size()); for (uint64_t x_index : remaining_input) { if (cmp(x_index, heap.front())) { - std::ranges::pop_heap(heap, cmp); + std::pop_heap(heap.begin(), heap.end(), cmp); heap.back() = x_index; - std::ranges::push_heap(heap, cmp); + std::push_heap(heap.begin(), heap.end(), cmp); } } // fill output in reverse when destructing, // as the "worst" (next-to-would-have-been-replaced) element is at heap-top - for (auto& reverse_out_iter : std::ranges::reverse_view(output_range)) { - reverse_out_iter = heap.front(); // heap-top has the next element - std::ranges::pop_heap(heap, cmp); + for (int64_t i = output_range.size() - 1; i >= 0; --i) { + output_range[i] = heap.front(); // heap-top has the next element + std::pop_heap(heap.begin(), heap.end(), cmp); // Decrease heap-size by one heap = heap.first(heap.size() - 1); } @@ -422,9 +421,8 @@ class ChunkedArraySelector : public TypeVisitor { // so the heap must have been completely filled DCHECK_EQ(heap.size(), output.non_null_like_range.size()); - for (uint64_t& reverse_out_iter : - std::ranges::reverse_view(output.non_null_like_range)) { - reverse_out_iter = + for (int64_t i = output.non_null_like_range.size() - 1; i >= 0; --i) { + output.non_null_like_range[i] = heap.top().index + heap.top().offset; // heap-top has the next element heap.pop(); } From 05d44a5055de56dc2cde9cd393c01161d8b2dca1 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 30 Jun 2026 09:33:54 +0100 Subject: [PATCH 2/3] Fix underflow --- cpp/src/arrow/compute/kernels/vector_select_k.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/vector_select_k.cc b/cpp/src/arrow/compute/kernels/vector_select_k.cc index 3239d9d4e1e..ea072d179ee 100644 --- a/cpp/src/arrow/compute/kernels/vector_select_k.cc +++ b/cpp/src/arrow/compute/kernels/vector_select_k.cc @@ -130,8 +130,8 @@ void HeapSortNonNullsToOutput(std::span non_null_input_range, Comparat // fill output in reverse when destructing, // as the "worst" (next-to-would-have-been-replaced) element is at heap-top - for (int64_t i = output_range.size() - 1; i >= 0; --i) { - output_range[i] = heap.front(); // heap-top has the next element + for (int64_t i = output_range.size(); i > 0; --i) { + output_range[i - 1] = heap.front(); // heap-top has the next element std::pop_heap(heap.begin(), heap.end(), cmp); // Decrease heap-size by one heap = heap.first(heap.size() - 1); @@ -421,8 +421,8 @@ class ChunkedArraySelector : public TypeVisitor { // so the heap must have been completely filled DCHECK_EQ(heap.size(), output.non_null_like_range.size()); - for (int64_t i = output.non_null_like_range.size() - 1; i >= 0; --i) { - output.non_null_like_range[i] = + for (int64_t i = output.non_null_like_range.size(); i > 0; --i) { + output.non_null_like_range[i - 1] = heap.top().index + heap.top().offset; // heap-top has the next element heap.pop(); } From f487731d3b1976a83f263d5eab04b3d305bc4e26 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 30 Jun 2026 11:16:13 +0200 Subject: [PATCH 3/3] Add tests for columns without non-null values --- .../arrow/compute/kernels/select_k_test.cc | 366 ++++++++++++++++-- 1 file changed, 323 insertions(+), 43 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/select_k_test.cc b/cpp/src/arrow/compute/kernels/select_k_test.cc index 67e5d214636..47e4af58001 100644 --- a/cpp/src/arrow/compute/kernels/select_k_test.cc +++ b/cpp/src/arrow/compute/kernels/select_k_test.cc @@ -276,6 +276,26 @@ TEST_F(TestSelectKWithArray, FullSelectKNull) { Check(uint8(), array_input, options, expected); } +TEST_F(TestSelectKWithArray, PartialSelectKAllNull) { + auto array_input = R"([null, null, null, null, null])"; + std::vector sort_keys{SortKey("a", SortOrder::Ascending)}; + auto options = SelectKOptions(4, sort_keys); + auto expected = R"([null, null, null, null])"; + Check(uint8(), array_input, options, expected); + options.sort_keys[0].null_placement = NullPlacement::AtStart; + Check(uint8(), array_input, options, expected); +} + +TEST_F(TestSelectKWithArray, FullSelectKAllNull) { + auto array_input = R"([null, null, null, null, null])"; + std::vector sort_keys{SortKey("a", SortOrder::Ascending)}; + auto options = SelectKOptions(10, sort_keys); + auto expected = R"([null, null, null, null, null])"; + Check(uint8(), array_input, options, expected); + options.sort_keys[0].null_placement = NullPlacement::AtStart; + Check(uint8(), array_input, options, expected); +} + TEST_F(TestSelectKWithArray, PartialSelectKNullNaN) { auto array_input = R"([null, 30, NaN, 20, 10, null])"; std::vector sort_keys{SortKey("a", SortOrder::Descending)}; @@ -293,9 +313,31 @@ TEST_F(TestSelectKWithArray, FullSelectKNullNaN) { options.sort_keys[0].null_placement = NullPlacement::AtStart; Check(float64(), array_input, options, "[null, null, NaN, 30, 20, 10]"); } + +TEST_F(TestSelectKWithArray, PartialSelectKAllNullAndNaN) { + auto array_input = R"([null, NaN, NaN, null, null])"; + std::vector sort_keys{SortKey("a", SortOrder::Ascending)}; + auto options = SelectKOptions(4, sort_keys); + auto expected = R"([NaN, NaN, null, null])"; + Check(float64(), array_input, options, expected); + options.sort_keys[0].null_placement = NullPlacement::AtStart; + expected = R"([null, null, null, NaN])"; + Check(float64(), array_input, options, expected); +} + +TEST_F(TestSelectKWithArray, FullSelectKAllNullAndNaN) { + auto array_input = R"([null, NaN, NaN, null, null])"; + std::vector sort_keys{SortKey("a", SortOrder::Ascending)}; + auto options = SelectKOptions(10, sort_keys); + auto expected = R"([NaN, NaN, null, null, null])"; + Check(float64(), array_input, options, expected); + options.sort_keys[0].null_placement = NullPlacement::AtStart; + expected = R"([null, null, null, NaN, NaN])"; + Check(float64(), array_input, options, expected); +} + // Test basic cases for chunked array -template struct TestSelectKWithChunkedArray : public ::testing::Test { TestSelectKWithChunkedArray() {} @@ -323,6 +365,15 @@ struct TestSelectKWithChunkedArray : public ::testing::Test { AssertSelectK(chunked_array, k); } + void Check(const std::shared_ptr& type, const std::vector& input, + const SelectKOptions& options, const std::string& expected) { + std::shared_ptr actual; + auto input_array = ChunkedArrayFromJSON(type, input); + auto expected_array = ChunkedArrayFromJSON(type, {expected}); + ASSERT_OK(this->DoSelectK(input_array, options, &actual)); + AssertChunkedEqual(*expected_array, *actual); + } + void Check(const std::shared_ptr& chunked_array, const SelectKOptions& options, const std::shared_ptr& expected_array) { @@ -343,9 +394,12 @@ struct TestSelectKWithChunkedArray : public ::testing::Test { } }; -TYPED_TEST_SUITE(TestSelectKWithChunkedArray, SelectKableTypes); +template +struct TestSelectKWithChunkedArrayTyped : public TestSelectKWithChunkedArray {}; + +TYPED_TEST_SUITE(TestSelectKWithChunkedArrayTyped, SelectKableTypes); -TYPED_TEST(TestSelectKWithChunkedArray, RandomValuesWithSlices) { +TYPED_TEST(TestSelectKWithChunkedArrayTyped, RandomValuesWithSlices) { Random rand(0x61549225); int length = 100; for (auto null_probability : {0.0, 0.1, 0.5, 1.0}) { @@ -361,62 +415,119 @@ TYPED_TEST(TestSelectKWithChunkedArray, RandomValuesWithSlices) { } } -TYPED_TEST(TestSelectKWithChunkedArray, PartialSelectKNull) { - auto chunked_array = ChunkedArrayFromJSON(uint8(), { - "[null, 1]", - "[3, null, 2]", - "[1]", - }); +TEST_F(TestSelectKWithChunkedArray, PartialSelectKNull) { + auto chunked_array = std::vector{ + "[null, 1]", + "[3, null, 2]", + "[1]", + }; std::vector sort_keys{SortKey("a", SortOrder::Ascending)}; auto options = SelectKOptions(3, sort_keys); - auto expected = ChunkedArrayFromJSON(uint8(), {"[1, 1, 2]"}); - this->Check(chunked_array, options, expected); + auto expected = "[1, 1, 2]"; + this->Check(uint8(), chunked_array, options, expected); options.sort_keys[0].null_placement = NullPlacement::AtStart; - expected = ChunkedArrayFromJSON(uint8(), {"[null, null, 1]"}); - this->Check(chunked_array, options, expected); + expected = "[null, null, 1]"; + this->Check(uint8(), chunked_array, options, expected); } -TYPED_TEST(TestSelectKWithChunkedArray, FullSelectKNull) { - auto chunked_array = ChunkedArrayFromJSON(uint8(), { - "[null, 1]", - "[3, null, 2]", - "[1]", - }); +TEST_F(TestSelectKWithChunkedArray, FullSelectKNull) { + auto chunked_array = std::vector{ + "[null, 1]", + "[3, null, 2]", + "[1]", + }; std::vector sort_keys{SortKey("a", SortOrder::Ascending)}; auto options = SelectKOptions(10, sort_keys); options.sort_keys[0].null_placement = NullPlacement::AtStart; - auto expected = ChunkedArrayFromJSON(uint8(), {"[null, null, 1, 1, 2, 3]"}); - this->Check(chunked_array, options, expected); + auto expected = "[null, null, 1, 1, 2, 3]"; + this->Check(uint8(), chunked_array, options, expected); options.sort_keys[0].null_placement = NullPlacement::AtEnd; - expected = ChunkedArrayFromJSON(uint8(), {"[1, 1, 2, 3, null, null]"}); - this->Check(chunked_array, options, expected); + expected = "[1, 1, 2, 3, null, null]"; + this->Check(uint8(), chunked_array, options, expected); } -TYPED_TEST(TestSelectKWithChunkedArray, PartialSelectKNullNaN) { - auto chunked_array = ChunkedArrayFromJSON( - float64(), {"[null, 1]", "[3, null, NaN]", "[10, NaN, 2]", "[1]"}); +TEST_F(TestSelectKWithChunkedArray, PartialSelectKAllNull) { + auto chunked_array = std::vector{ + "[null, null]", + "[null, null, null]", + "[null]", + }; + std::vector sort_keys{SortKey("a", SortOrder::Ascending)}; + auto options = SelectKOptions(3, sort_keys); + auto expected = "[null, null, null]"; + this->Check(uint8(), chunked_array, options, expected); + options.sort_keys[0].null_placement = NullPlacement::AtStart; + this->Check(uint8(), chunked_array, options, expected); +} + +TEST_F(TestSelectKWithChunkedArray, FullSelectKAllNull) { + auto chunked_array = std::vector{ + "[null, null]", + "[null, null, null]", + "[null]", + }; + std::vector sort_keys{SortKey("a", SortOrder::Ascending)}; + auto options = SelectKOptions(10, sort_keys); + auto expected = "[null, null, null, null, null, null]"; + this->Check(uint8(), chunked_array, options, expected); + options.sort_keys[0].null_placement = NullPlacement::AtStart; + this->Check(uint8(), chunked_array, options, expected); +} + +TEST_F(TestSelectKWithChunkedArray, PartialSelectKNullNaN) { + auto chunked_array = + std::vector{"[null, 1]", "[3, null, NaN]", "[10, NaN, 2]", "[1]"}; std::vector sort_keys{SortKey("a", SortOrder::Descending)}; auto options = SelectKOptions(3, sort_keys); options.sort_keys[0].null_placement = NullPlacement::AtStart; - auto expected = ChunkedArrayFromJSON(float64(), {"[null, null, NaN]"}); - this->Check(chunked_array, options, expected); + auto expected = "[null, null, NaN]"; + this->Check(float64(), chunked_array, options, expected); options.sort_keys[0].null_placement = NullPlacement::AtEnd; - expected = ChunkedArrayFromJSON(float64(), {"[10, 3, 2]"}); - this->Check(chunked_array, options, expected); + expected = "[10, 3, 2]"; + this->Check(float64(), chunked_array, options, expected); } -TYPED_TEST(TestSelectKWithChunkedArray, FullSelectKNullNaN) { - auto chunked_array = ChunkedArrayFromJSON( - float64(), {"[null, 1]", "[3, null, NaN]", "[10, NaN, 2]", "[1]"}); +TEST_F(TestSelectKWithChunkedArray, FullSelectKNullNaN) { + auto chunked_array = + std::vector{"[null, 1]", "[3, null, NaN]", "[10, NaN, 2]", "[1]"}; std::vector sort_keys{SortKey("a", SortOrder::Descending)}; auto options = SelectKOptions(10, sort_keys); options.sort_keys[0].null_placement = NullPlacement::AtStart; - auto expected = - ChunkedArrayFromJSON(float64(), {"[null, null, NaN, NaN, 10, 3, 2, 1, 1]"}); - this->Check(chunked_array, options, expected); + auto expected = "[null, null, NaN, NaN, 10, 3, 2, 1, 1]"; + this->Check(float64(), chunked_array, options, expected); options.sort_keys[0].null_placement = NullPlacement::AtEnd; - expected = ChunkedArrayFromJSON(float64(), {"[10, 3, 2, 1, 1, NaN, NaN, null, null]"}); - this->Check(chunked_array, options, expected); + expected = "[10, 3, 2, 1, 1, NaN, NaN, null, null]"; + this->Check(float64(), chunked_array, options, expected); +} + +TEST_F(TestSelectKWithChunkedArray, PartialSelectKAllNullAndNaN) { + auto chunked_array = std::vector{ + "[null, NaN]", + "[NaN, null, null]", + "[null]", + }; + std::vector sort_keys{SortKey("a", SortOrder::Ascending)}; + auto options = SelectKOptions(3, sort_keys); + auto expected = "[NaN, NaN, null]"; + this->Check(float64(), chunked_array, options, expected); + options.sort_keys[0].null_placement = NullPlacement::AtStart; + expected = "[null, null, null]"; + this->Check(float64(), chunked_array, options, expected); +} + +TEST_F(TestSelectKWithChunkedArray, FullSelectKAllNullAndNaN) { + auto chunked_array = std::vector{ + "[null, NaN]", + "[NaN, null, null]", + "[null]", + }; + std::vector sort_keys{SortKey("a", SortOrder::Ascending)}; + auto options = SelectKOptions(10, sort_keys); + auto expected = "[NaN, NaN, null, null, null, null]"; + this->Check(float64(), chunked_array, options, expected); + options.sort_keys[0].null_placement = NullPlacement::AtStart; + expected = "[null, null, null, null, NaN, NaN]"; + this->Check(float64(), chunked_array, options, expected); } template @@ -790,6 +901,39 @@ TEST_F(TestSelectKWithRecordBatch, PartialSelectKNullNaN) { Check(schema, batch_input, options, expected); } +TEST_F(TestSelectKWithRecordBatch, PartialSelectKAllNullNaN) { + auto schema = ::arrow::schema({ + {field("a", float32())}, + {field("b", float64())}, + }); + auto batch_input = R"([ + {"a": null, "b": null}, + {"a": null, "b": null}, + {"a": NaN, "b": null}, + {"a": null, "b": NaN}, + {"a": NaN, "b": NaN}, + {"a": null, "b": NaN} + ])"; + std::vector sort_keys{ + SortKey("a", SortOrder::Ascending, NullPlacement::AtStart), + SortKey("b", SortOrder::Descending)}; + auto options = SelectKOptions(3, sort_keys); + auto expected = R"([{"a": null, "b": NaN}, + {"a": null, "b": NaN}, + {"a": null, "b": null}])"; + Check(schema, batch_input, options, expected); + options.sort_keys[1].null_placement = NullPlacement::AtStart; + expected = R"([{"a": null, "b": null}, + {"a": null, "b": null}, + {"a": null, "b": NaN}])"; + Check(schema, batch_input, options, expected); + options.sort_keys[0].null_placement = NullPlacement::AtEnd; + expected = R"([{"a": NaN, "b": null}, + {"a": NaN, "b": NaN}, + {"a": null, "b": null}])"; + Check(schema, batch_input, options, expected); +} + TEST_F(TestSelectKWithRecordBatch, FullSelectKNullNaN) { auto schema = ::arrow::schema({ {field("a", float32())}, @@ -834,6 +978,51 @@ TEST_F(TestSelectKWithRecordBatch, FullSelectKNullNaN) { Check(schema, batch_input, options, expected); } +TEST_F(TestSelectKWithRecordBatch, FullSelectKAllNullNaN) { + auto schema = ::arrow::schema({ + {field("a", float32())}, + {field("b", float64())}, + }); + auto batch_input = R"([ + {"a": null, "b": null}, + {"a": null, "b": null}, + {"a": NaN, "b": null}, + {"a": null, "b": NaN}, + {"a": NaN, "b": NaN}, + {"a": null, "b": NaN} + ])"; + std::vector sort_keys{ + SortKey("a", SortOrder::Ascending, NullPlacement::AtStart), + SortKey("b", SortOrder::Descending)}; + auto options = SelectKOptions(10, sort_keys); + auto expected = R"([{"a": null, "b": NaN}, + {"a": null, "b": NaN}, + {"a": null, "b": null}, + {"a": null, "b": null}, + {"a": NaN, "b": NaN}, + {"a": NaN, "b": null} + ])"; + Check(schema, batch_input, options, expected); + options.sort_keys[1].null_placement = NullPlacement::AtStart; + expected = R"([{"a": null, "b": null}, + {"a": null, "b": null}, + {"a": null, "b": NaN}, + {"a": null, "b": NaN}, + {"a": NaN, "b": null}, + {"a": NaN, "b": NaN} + ])"; + Check(schema, batch_input, options, expected); + options.sort_keys[0].null_placement = NullPlacement::AtEnd; + expected = R"([{"a": NaN, "b": null}, + {"a": NaN, "b": NaN}, + {"a": null, "b": null}, + {"a": null, "b": null}, + {"a": null, "b": NaN}, + {"a": null, "b": NaN} + ])"; + Check(schema, batch_input, options, expected); +} + TEST_F(TestSelectKWithRecordBatch, BottomKOneColumnKey) { auto schema = ::arrow::schema({ {field("country", utf8())}, @@ -1147,26 +1336,26 @@ TEST_F(TestSelectKWithTable, FullSelectKNullNaN) { {"a": 1, "b": 3}, {"a": 3, "b": null}, {"a": 6, "b": NaN}, - {"a": 6, "b": null}, + {"a": 6, "b": null}, {"a": NaN, "b": 5}, - {"a": null, "b": 5}, + {"a": null, "b": 5}, {"a": null, "b": null}])"}; Check(schema, input, options, expected); options.sort_keys[0].null_placement = NullPlacement::AtStart; expected = {R"([ - {"a": null, "b": 5}, + {"a": null, "b": 5}, {"a": null, "b": null}, {"a": NaN, "b": 5}, {"a": 1, "b": 5}, {"a": 1, "b": 3}, {"a": 3, "b": null}, {"a": 6, "b": NaN}, - {"a": 6, "b": null} + {"a": 6, "b": null} ])"}; Check(schema, input, options, expected); options.sort_keys[1].null_placement = NullPlacement::AtStart; expected = {R"([ - {"a": null, "b": null}, + {"a": null, "b": null}, {"a": null, "b": 5}, {"a": NaN, "b": 5}, {"a": 1, "b": 5}, @@ -1178,5 +1367,96 @@ TEST_F(TestSelectKWithTable, FullSelectKNullNaN) { Check(schema, input, options, expected); } +TEST_F(TestSelectKWithTable, PartialSelectKAllNullNaN) { + auto schema = ::arrow::schema({ + {field("a", float32())}, + {field("b", float64())}, + }); + std::vector input = { + R"([{"a": null, "b": null}, + {"a": NaN, "b": null}, + {"a": null, "b": NaN} + ])", + R"([{"a": null, "b": null}, + {"a": NaN, "b": null}, + {"a": NaN, "b": NaN} + ])"}; + + std::vector sort_keys{SortKey("a", SortOrder::Ascending), + SortKey("b", SortOrder::Descending)}; + auto options = SelectKOptions(3, sort_keys); + + std::vector expected = { + R"([{"a": NaN, "b": NaN}, + {"a": NaN, "b": null}, + {"a": NaN, "b": null} + ])"}; + Check(schema, input, options, expected); + options.sort_keys[0].null_placement = NullPlacement::AtStart; + expected = { + R"([{"a": null, "b": NaN}, + {"a": null, "b": null}, + {"a": null, "b": null} + ])"}; + Check(schema, input, options, expected); + options.sort_keys[1].null_placement = NullPlacement::AtStart; + expected = { + R"([{"a": null, "b": null}, + {"a": null, "b": null}, + {"a": null, "b": NaN} + ])"}; + Check(schema, input, options, expected); +} + +TEST_F(TestSelectKWithTable, FullSelectKAllNullNaN) { + auto schema = ::arrow::schema({ + {field("a", float32())}, + {field("b", float64())}, + }); + std::vector input = { + R"([{"a": null, "b": null}, + {"a": NaN, "b": null}, + {"a": null, "b": NaN} + ])", + R"([{"a": null, "b": null}, + {"a": NaN, "b": null}, + {"a": NaN, "b": NaN} + ])"}; + + std::vector sort_keys{SortKey("a", SortOrder::Ascending), + SortKey("b", SortOrder::Descending)}; + auto options = SelectKOptions(10, sort_keys); + + std::vector expected = { + R"([{"a": NaN, "b": NaN}, + {"a": NaN, "b": null}, + {"a": NaN, "b": null}, + {"a": null, "b": NaN}, + {"a": null, "b": null}, + {"a": null, "b": null} + ])"}; + Check(schema, input, options, expected); + options.sort_keys[0].null_placement = NullPlacement::AtStart; + expected = { + R"([{"a": null, "b": NaN}, + {"a": null, "b": null}, + {"a": null, "b": null}, + {"a": NaN, "b": NaN}, + {"a": NaN, "b": null}, + {"a": NaN, "b": null} + ])"}; + Check(schema, input, options, expected); + options.sort_keys[1].null_placement = NullPlacement::AtStart; + expected = { + R"([{"a": null, "b": null}, + {"a": null, "b": null}, + {"a": null, "b": NaN}, + {"a": NaN, "b": null}, + {"a": NaN, "b": null}, + {"a": NaN, "b": NaN} + ])"}; + Check(schema, input, options, expected); +} + } // namespace compute } // namespace arrow