diff --git a/cpp/src/arrow/csv/parser.cc b/cpp/src/arrow/csv/parser.cc index e83855336d2d..54dc54ae7e82 100644 --- a/cpp/src/arrow/csv/parser.cc +++ b/cpp/src/arrow/csv/parser.cc @@ -204,7 +204,8 @@ class PresizedValueDescWriter : public ValueDescWriter // however we allow for one extraneous write in case of excessive columns, // hence `2 + num_rows * num_cols` (see explanation in PushValue below). PresizedValueDescWriter(MemoryPool* pool, int32_t num_rows, int32_t num_cols) - : ValueDescWriter(pool, /*values_capacity=*/2 + num_rows * num_cols) {} + : ValueDescWriter( + pool, /*values_capacity=*/2 + static_cast(num_rows) * num_cols) {} void PushValue(ParsedValueDesc v) { DCHECK_LT(values_size_, values_capacity_); @@ -535,8 +536,8 @@ class BlockParserImpl { // Use bulk filter only if average value length is >= 10 bytes, // as the bulk filter has a fixed cost that isn't compensated // when values are too short. - const int64_t bulk_filter_threshold = - batch_.num_cols_ * (batch_.num_rows_ - start_num_rows) * 10; + const int64_t bulk_filter_threshold = static_cast(batch_.num_cols_) * + (batch_.num_rows_ - start_num_rows) * 10; use_bulk_filter_ = (data - *out_data) > bulk_filter_threshold; } @@ -604,6 +605,17 @@ class BlockParserImpl { rows_in_chunk = std::min(kTargetChunkSize, max_num_rows_ - batch_.num_rows_); } + // The values array holds one ParsedValueDesc per cell and those offsets + // are 31-bit, so the number of values in a chunk must fit in an int32. + // A first line with millions of fields can drive `num_cols_` high enough + // to overflow that, so error out rather than presize past the limit. + if (static_cast(rows_in_chunk) * batch_.num_cols_ > + std::numeric_limits::max()) { + return Status::Invalid("CSV parser: row group of ", rows_in_chunk, " rows x ", + batch_.num_cols_, + " columns exceeds the maximum number of values"); + } + ARROW_ASSIGN_OR_RAISE( auto values_writer, PresizedValueDescWriter::Make(pool_, rows_in_chunk, batch_.num_cols_)); diff --git a/cpp/src/arrow/csv/parser_test.cc b/cpp/src/arrow/csv/parser_test.cc index 0b5e7175093f..abe57dd2e89c 100644 --- a/cpp/src/arrow/csv/parser_test.cc +++ b/cpp/src/arrow/csv/parser_test.cc @@ -666,6 +666,17 @@ TEST(BlockParser, MismatchingNumColumns) { } } +TEST(BlockParser, TooManyValues) { + // A first line carrying millions of fields drives num_cols high enough that + // the per-chunk value count (rows x columns) would overflow the 31-bit value + // offset, so the parser errors out instead of overflowing. + uint32_t out_size; + BlockParser parser(ParseOptions::Defaults(), /*num_cols=*/5000000); + Status st = Parse(parser, MakeCSVData({"a,b\n"}), &out_size); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("exceeds the maximum number of values"), st); +} + TEST(BlockParser, MismatchingNumColumnsHandler) { struct CustomHandler { operator InvalidRowHandler() {