Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 20 additions & 1 deletion cpp/src/arrow/datum.cc
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,26 @@ int64_t Datum::null_count() const {
const auto& val = *std::get<std::shared_ptr<Scalar>>(this->value);
return val.is_valid ? 0 : 1;
} else {
DCHECK(false) << "This function only valid for array-like values";
DCHECK(false) << "This function only valid for scalar or array-like values";
return 0;
}
}

int64_t Datum::ComputeLogicalNullCount() const {
if (this->kind() == Datum::ARRAY) {
return std::get<std::shared_ptr<ArrayData>>(this->value)->ComputeLogicalNullCount();
} else if (this->kind() == Datum::CHUNKED_ARRAY) {
return std::get<std::shared_ptr<ChunkedArray>>(this->value)
->ComputeLogicalNullCount();
} else if (this->kind() == Datum::SCALAR) {
// Union and run-end encoded scalars derive is_valid from their underlying
// value, so it reflects logical validity. A DictionaryScalar's is_valid
// only tracks index validity, so this differs from the array path when a
// valid index points to a null dictionary value.
const auto& val = *std::get<std::shared_ptr<Scalar>>(this->value);
return val.is_valid ? 0 : 1;
} else {
DCHECK(false) << "This function only valid for scalar or array-like values";
return 0;
}
}
Expand Down
14 changes: 14 additions & 0 deletions cpp/src/arrow/datum.h
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,20 @@ struct ARROW_EXPORT Datum {
/// Only valid for scalar and array-like data.
int64_t null_count() const;

/// \brief Compute the logical null count.
///
/// Only valid for scalar and array-like data. Unlike null_count(), this
/// accounts for types whose logical nulls are not captured by the top-level
/// validity bitmap, such as union, run-end encoded and dictionary types; for
/// those types the count is recomputed on every call. For scalars this
/// returns the same value as null_count(); note that a DictionaryScalar
/// counts as non-null whenever its index is valid, even if the index points
/// to a null dictionary value.
///
/// \return the number of logical nulls: the value reported by the underlying
/// array or chunked array, or 0/1 for a valid/invalid scalar. For any other
/// kind of Datum a DCHECK fails and 0 is returned.
///
/// \see ArrayData::ComputeLogicalNullCount
/// \see ChunkedArray::ComputeLogicalNullCount
int64_t ComputeLogicalNullCount() const;

/// \brief The value type of the variant, if any
///
/// \return nullptr if no type
Expand Down
48 changes: 48 additions & 0 deletions cpp/src/arrow/datum_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,54 @@ TEST(Datum, NullCount) {
ASSERT_EQ(3, val3.null_count());
}

TEST(Datum, ComputeLogicalNullCount) {
  // Scalars: is_valid already expresses logical validity, so a valid scalar
  // yields 0 and a null scalar yields 1.
  Datum valid_scalar(std::make_shared<Int8Scalar>(1));
  ASSERT_EQ(0, valid_scalar.ComputeLogicalNullCount());

  Datum null_scalar(MakeNullScalar(int8()));
  ASSERT_EQ(1, null_scalar.ComputeLogicalNullCount());

  // Plain arrays backed by a validity bitmap: the logical count agrees with
  // null_count().
  Datum int8_datum(ArrayFromJSON(int8(), "[1, null, null, null]"));
  ASSERT_EQ(3, int8_datum.null_count());
  ASSERT_EQ(3, int8_datum.ComputeLogicalNullCount());

  // Union arrays have no top-level validity bitmap; their nulls live in the
  // children. null_count() therefore reports 0 while the logical count does
  // not.
  auto union_type = sparse_union({field("a", int8()), field("b", boolean())});
  auto union_arr =
      ArrayFromJSON(union_type, R"([[0, null], [1, true], [0, 5], [1, null]])");
  Datum union_datum(union_arr);
  ASSERT_EQ(0, union_datum.null_count());
  ASSERT_EQ(2, union_datum.ComputeLogicalNullCount());

  // Chunked arrays: the logical null count is the sum over all chunks.
  auto union_chunk = ArrayFromJSON(union_type, R"([[0, 1], [1, null]])");
  ASSERT_OK_AND_ASSIGN(auto chunked, ChunkedArray::Make({union_arr, union_chunk}));
  Datum chunked_datum(chunked);
  ASSERT_EQ(0, chunked_datum.null_count());
  ASSERT_EQ(3, chunked_datum.ComputeLogicalNullCount());

  // Dictionary arrays: the indices carry a validity bitmap, and in addition a
  // valid index that refers to a null dictionary value is a logical null.
  auto dict_type = dictionary(int32(), utf8());
  auto dict_arr = DictArrayFromJSON(dict_type, /*indices=*/"[0, 1, null, 1]",
                                    /*dictionary=*/R"([null, "a"])");
  Datum dict_datum(dict_arr);
  ASSERT_EQ(1, dict_datum.null_count());
  ASSERT_EQ(2, dict_datum.ComputeLogicalNullCount());

  // Dictionary scalars: is_valid mirrors index validity only. In contrast to
  // the array path, a valid index referring to a null dictionary value is not
  // counted as a logical null.
  ASSERT_OK_AND_ASSIGN(auto dict_scalar, dict_arr->GetScalar(0));
  Datum dict_scalar_datum(dict_scalar);
  ASSERT_EQ(0, dict_scalar_datum.null_count());
  ASSERT_EQ(0, dict_scalar_datum.ComputeLogicalNullCount());
}

TEST(Datum, MutableArray) {
auto arr = ArrayFromJSON(int8(), "[1, 2, 3, 4]");

Expand Down
Loading