From a96721b49cadc0f90e3b9e6d1f12e55264d33d0d Mon Sep 17 00:00:00 2001 From: metsw24-max Date: Mon, 1 Jun 2026 23:24:36 +0530 Subject: [PATCH 1/3] MINOR: [C++][CSV] avoid int32 overflow in block parser value counts --- cpp/src/arrow/csv/parser.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/csv/parser.cc b/cpp/src/arrow/csv/parser.cc index e83855336d2d..48c9762e27e2 100644 --- a/cpp/src/arrow/csv/parser.cc +++ b/cpp/src/arrow/csv/parser.cc @@ -204,7 +204,8 @@ class PresizedValueDescWriter : public ValueDescWriter // however we allow for one extraneous write in case of excessive columns, // hence `2 + num_rows * num_cols` (see explanation in PushValue below). PresizedValueDescWriter(MemoryPool* pool, int32_t num_rows, int32_t num_cols) - : ValueDescWriter(pool, /*values_capacity=*/2 + num_rows * num_cols) {} + : ValueDescWriter(pool, /*values_capacity=*/2 + + static_cast<int64_t>(num_rows) * num_cols) {} void PushValue(ParsedValueDesc v) { DCHECK_LT(values_size_, values_capacity_); @@ -536,7 +537,8 @@ class BlockParserImpl { // as the bulk filter has a fixed cost that isn't compensated // when values are too short. 
- const int64_t bulk_filter_threshold = - batch_.num_cols_ * (batch_.num_rows_ - start_num_rows) * 10; + static_cast<int64_t>(batch_.num_cols_) * (batch_.num_rows_ - start_num_rows) * + 10; use_bulk_filter_ = (data - *out_data) > bulk_filter_threshold; } From e89f61dfbdf52fe8e8ff21a4968587e4eef4a8bc Mon Sep 17 00:00:00 2001 From: metsw24-max Date: Tue, 23 Jun 2026 17:11:53 +0530 Subject: [PATCH 2/3] MINOR: [C++][CSV] apply clang-format to parser.cc --- cpp/src/arrow/csv/parser.cc | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/csv/parser.cc b/cpp/src/arrow/csv/parser.cc index 48c9762e27e2..d1405ea01e23 100644 --- a/cpp/src/arrow/csv/parser.cc +++ b/cpp/src/arrow/csv/parser.cc @@ -204,8 +204,8 @@ class PresizedValueDescWriter : public ValueDescWriter // however we allow for one extraneous write in case of excessive columns, // hence `2 + num_rows * num_cols` (see explanation in PushValue below). PresizedValueDescWriter(MemoryPool* pool, int32_t num_rows, int32_t num_cols) - : ValueDescWriter(pool, /*values_capacity=*/2 + - static_cast<int64_t>(num_rows) * num_cols) {} + : ValueDescWriter( + pool, /*values_capacity=*/2 + static_cast<int64_t>(num_rows) * num_cols) {} void PushValue(ParsedValueDesc v) { DCHECK_LT(values_size_, values_capacity_); @@ -536,9 +536,8 @@ class BlockParserImpl { // Use bulk filter only if average value length is >= 10 bytes, // as the bulk filter has a fixed cost that isn't compensated // when values are too short. 
- const int64_t bulk_filter_threshold = - static_cast<int64_t>(batch_.num_cols_) * (batch_.num_rows_ - start_num_rows) * - 10; + const int64_t bulk_filter_threshold = static_cast<int64_t>(batch_.num_cols_) * + (batch_.num_rows_ - start_num_rows) * 10; use_bulk_filter_ = (data - *out_data) > bulk_filter_threshold; } From b3c66356befe475620b87f253a35a6898ec207a0 Mon Sep 17 00:00:00 2001 From: Sayed Kaif Date: Wed, 24 Jun 2026 13:54:46 +0530 Subject: [PATCH 3/3] MINOR: [C++][CSV] reject blocks whose value count overflows int32 --- cpp/src/arrow/csv/parser.cc | 11 +++++++++++ cpp/src/arrow/csv/parser_test.cc | 11 +++++++++++ 2 files changed, 22 insertions(+) diff --git a/cpp/src/arrow/csv/parser.cc b/cpp/src/arrow/csv/parser.cc index d1405ea01e23..54dc54ae7e82 100644 --- a/cpp/src/arrow/csv/parser.cc +++ b/cpp/src/arrow/csv/parser.cc @@ -605,6 +605,17 @@ class BlockParserImpl { rows_in_chunk = std::min(kTargetChunkSize, max_num_rows_ - batch_.num_rows_); } + // The values array holds one ParsedValueDesc per cell and those offsets + // are 31-bit, so the number of values in a chunk must fit in an int32. + // A first line with millions of fields can drive `num_cols_` high enough + // to overflow that, so error out rather than presize past the limit. 
+ if (static_cast<int64_t>(rows_in_chunk) * batch_.num_cols_ > + std::numeric_limits<int32_t>::max()) { + return Status::Invalid("CSV parser: row group of ", rows_in_chunk, " rows x ", + batch_.num_cols_, + " columns exceeds the maximum number of values"); + } + ARROW_ASSIGN_OR_RAISE( auto values_writer, PresizedValueDescWriter::Make(pool_, rows_in_chunk, batch_.num_cols_)); diff --git a/cpp/src/arrow/csv/parser_test.cc b/cpp/src/arrow/csv/parser_test.cc index 0b5e7175093f..abe57dd2e89c 100644 --- a/cpp/src/arrow/csv/parser_test.cc +++ b/cpp/src/arrow/csv/parser_test.cc @@ -666,6 +666,17 @@ TEST(BlockParser, MismatchingNumColumns) { } } +TEST(BlockParser, TooManyValues) { + // A first line carrying millions of fields drives num_cols high enough that + // the per-chunk value count (rows x columns) would overflow the 31-bit value + // offset, so the parser errors out instead of overflowing. + uint32_t out_size; + BlockParser parser(ParseOptions::Defaults(), /*num_cols=*/5000000); + Status st = Parse(parser, MakeCSVData({"a,b\n"}), &out_size); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("exceeds the maximum number of values"), st); +} + TEST(BlockParser, MismatchingNumColumnsHandler) { struct CustomHandler { operator InvalidRowHandler() {