From e8dc2019853737848e11ae2d67bc7de274193d15 Mon Sep 17 00:00:00 2001 From: abdul rawoof Date: Fri, 3 Jul 2026 14:42:10 +0530 Subject: [PATCH] GH-50355: [C++][Gandiva] fix out-of-bounds read in utf8_length_ignore_invalid --- cpp/src/gandiva/precompiled/string_ops.cc | 2 +- .../gandiva/precompiled/string_ops_test.cc | 24 +++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc index 0dd02cb1d80..2927793cfc5 100644 --- a/cpp/src/gandiva/precompiled/string_ops.cc +++ b/cpp/src/gandiva/precompiled/string_ops.cc @@ -206,7 +206,7 @@ gdv_int32 utf8_length_ignore_invalid(const char* data, gdv_int32 data_len) { // if invalid byte or incomplete glyph, ignore it char_len = 1; } - for (int j = 1; j < char_len; ++j) { + for (int j = 1; j < char_len && i + j < data_len; ++j) { if ((data[i + j] & 0xC0) != 0x80) { // bytes following head-byte of glyph char_len += 1; } diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc index 5a317d4595a..6347e946e09 100644 --- a/cpp/src/gandiva/precompiled/string_ops_test.cc +++ b/cpp/src/gandiva/precompiled/string_ops_test.cc @@ -19,6 +19,7 @@ #include #include +#include #include "gandiva/execution_context.h" #include "gandiva/precompiled/types.h" @@ -1608,6 +1609,29 @@ TEST(TestStringOps, TestRpadString) { EXPECT_EQ(std::string(out_str + 5000, 2), "α"); } +TEST(TestStringOps, TestPadMalformedUtf8NoOverread) { + gandiva::ExecutionContext ctx; + uint64_t ctx_ptr = reinterpret_cast(&ctx); + gdv_int32 out_len = 0; + + // A 4-byte utf8 lead byte followed by non-continuation bytes and no trailing + // space. utf8_length_ignore_invalid() used to extend the glyph length past + // the end of the buffer while scanning the continuation bytes. The input is + // held in an exactly-sized heap buffer so any over-read trips AddressSanitizer. + std::vector text = {'\xF0', 'a', 'a', 'a'}; + const auto text_len = static_cast(text.size()); + + const char* out_str = + lpad_utf8_int32_utf8(ctx_ptr, text.data(), text_len, 6, " ", 1, &out_len); + EXPECT_EQ(out_len, 9); + EXPECT_EQ(std::string(out_str + out_len - text_len, text_len), + std::string(text.begin(), text.end())); + + out_str = rpad_utf8_int32_utf8(ctx_ptr, text.data(), text_len, 6, " ", 1, &out_len); + EXPECT_EQ(out_len, 9); + EXPECT_EQ(std::string(out_str, text_len), std::string(text.begin(), text.end())); +} + TEST(TestStringOps, TestRtrim) { gandiva::ExecutionContext ctx; uint64_t ctx_ptr = reinterpret_cast(&ctx);