Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 15 additions & 3 deletions cpp/src/arrow/csv/parser.cc
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,8 @@ class PresizedValueDescWriter : public ValueDescWriter<PresizedValueDescWriter>
// however we allow for one extraneous write in case of excessive columns,
// hence `2 + num_rows * num_cols` (see explanation in PushValue below).
PresizedValueDescWriter(MemoryPool* pool, int32_t num_rows, int32_t num_cols)
: ValueDescWriter(pool, /*values_capacity=*/2 + num_rows * num_cols) {}
: ValueDescWriter(
pool, /*values_capacity=*/2 + static_cast<int64_t>(num_rows) * num_cols) {}

void PushValue(ParsedValueDesc v) {
DCHECK_LT(values_size_, values_capacity_);
Expand Down Expand Up @@ -535,8 +536,8 @@ class BlockParserImpl {
// Use bulk filter only if average value length is >= 10 bytes,
// as the bulk filter has a fixed cost that isn't compensated
// when values are too short.
const int64_t bulk_filter_threshold =
batch_.num_cols_ * (batch_.num_rows_ - start_num_rows) * 10;
const int64_t bulk_filter_threshold = static_cast<int64_t>(batch_.num_cols_) *
(batch_.num_rows_ - start_num_rows) * 10;
use_bulk_filter_ = (data - *out_data) > bulk_filter_threshold;
}

Expand Down Expand Up @@ -604,6 +605,17 @@ class BlockParserImpl {
rows_in_chunk = std::min(kTargetChunkSize, max_num_rows_ - batch_.num_rows_);
}

// The values array holds one ParsedValueDesc per cell, and each descriptor's
// offset is only 31 bits wide, so the number of values in a chunk must fit in
// an int32. A first line with millions of fields can drive `num_cols_` high
// enough to overflow that, so error out rather than presize past the limit.
if (static_cast<int64_t>(rows_in_chunk) * batch_.num_cols_ >
std::numeric_limits<int32_t>::max()) {
return Status::Invalid("CSV parser: row group of ", rows_in_chunk, " rows x ",
batch_.num_cols_,
" columns exceeds the maximum number of values");
}

ARROW_ASSIGN_OR_RAISE(
auto values_writer,
PresizedValueDescWriter::Make(pool_, rows_in_chunk, batch_.num_cols_));
Expand Down
11 changes: 11 additions & 0 deletions cpp/src/arrow/csv/parser_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -666,6 +666,17 @@ TEST(BlockParser, MismatchingNumColumns) {
}
}

TEST(BlockParser, TooManyValues) {
  // Overflow guard check: the parser is built with a column count in the
  // millions, so the per-chunk value count (rows x columns) cannot be
  // represented with the 31-bit value offset. Parsing must then fail with an
  // Invalid status carrying the "exceeds the maximum number of values"
  // message instead of overflowing.
  BlockParser parser(ParseOptions::Defaults(), /*num_cols=*/5000000);
  uint32_t parsed_size;
  Status parse_status = Parse(parser, MakeCSVData({"a,b\n"}), &parsed_size);
  EXPECT_RAISES_WITH_MESSAGE_THAT(
      Invalid, testing::HasSubstr("exceeds the maximum number of values"),
      parse_status);
}

TEST(BlockParser, MismatchingNumColumnsHandler) {
struct CustomHandler {
operator InvalidRowHandler() {
Expand Down
Loading