From a96721b49cadc0f90e3b9e6d1f12e55264d33d0d Mon Sep 17 00:00:00 2001
From: metsw24-max <metsw24@gmail.com>
Date: Mon, 1 Jun 2026 23:24:36 +0530
Subject: [PATCH 1/3] MINOR: [C++][CSV] avoid int32 overflow in block parser
 value counts

---
 cpp/src/arrow/csv/parser.cc | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/cpp/src/arrow/csv/parser.cc b/cpp/src/arrow/csv/parser.cc
index e83855336d2d..48c9762e27e2 100644
--- a/cpp/src/arrow/csv/parser.cc
+++ b/cpp/src/arrow/csv/parser.cc
@@ -204,7 +204,8 @@ class PresizedValueDescWriter : public ValueDescWriter<PresizedValueDescWriter>
   // however we allow for one extraneous write in case of excessive columns,
   // hence `2 + num_rows * num_cols` (see explanation in PushValue below).
   PresizedValueDescWriter(MemoryPool* pool, int32_t num_rows, int32_t num_cols)
-      : ValueDescWriter(pool, /*values_capacity=*/2 + num_rows * num_cols) {}
+      : ValueDescWriter(pool, /*values_capacity=*/2 +
+                                  static_cast<int64_t>(num_rows) * num_cols) {}
 
   void PushValue(ParsedValueDesc v) {
     DCHECK_LT(values_size_, values_capacity_);
@@ -536,7 +537,8 @@ class BlockParserImpl {
       // as the bulk filter has a fixed cost that isn't compensated
       // when values are too short.
       const int64_t bulk_filter_threshold =
-          batch_.num_cols_ * (batch_.num_rows_ - start_num_rows) * 10;
+          static_cast<int64_t>(batch_.num_cols_) * (batch_.num_rows_ - start_num_rows) *
+          10;
       use_bulk_filter_ = (data - *out_data) > bulk_filter_threshold;
     }
 

From e89f61dfbdf52fe8e8ff21a4968587e4eef4a8bc Mon Sep 17 00:00:00 2001
From: metsw24-max <metsw24@gmail.com>
Date: Tue, 23 Jun 2026 17:11:53 +0530
Subject: [PATCH 2/3] MINOR: [C++][CSV] apply clang-format to parser.cc

---
 cpp/src/arrow/csv/parser.cc | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/cpp/src/arrow/csv/parser.cc b/cpp/src/arrow/csv/parser.cc
index 48c9762e27e2..d1405ea01e23 100644
--- a/cpp/src/arrow/csv/parser.cc
+++ b/cpp/src/arrow/csv/parser.cc
@@ -204,8 +204,8 @@ class PresizedValueDescWriter : public ValueDescWriter<PresizedValueDescWriter>
   // however we allow for one extraneous write in case of excessive columns,
   // hence `2 + num_rows * num_cols` (see explanation in PushValue below).
   PresizedValueDescWriter(MemoryPool* pool, int32_t num_rows, int32_t num_cols)
-      : ValueDescWriter(pool, /*values_capacity=*/2 +
-                                  static_cast<int64_t>(num_rows) * num_cols) {}
+      : ValueDescWriter(
+            pool, /*values_capacity=*/2 + static_cast<int64_t>(num_rows) * num_cols) {}
 
   void PushValue(ParsedValueDesc v) {
     DCHECK_LT(values_size_, values_capacity_);
@@ -536,9 +536,8 @@ class BlockParserImpl {
       // Use bulk filter only if average value length is >= 10 bytes,
       // as the bulk filter has a fixed cost that isn't compensated
       // when values are too short.
-      const int64_t bulk_filter_threshold =
-          static_cast<int64_t>(batch_.num_cols_) * (batch_.num_rows_ - start_num_rows) *
-          10;
+      const int64_t bulk_filter_threshold = static_cast<int64_t>(batch_.num_cols_) *
+                                            (batch_.num_rows_ - start_num_rows) * 10;
       use_bulk_filter_ = (data - *out_data) > bulk_filter_threshold;
     }
 

From b3c66356befe475620b87f253a35a6898ec207a0 Mon Sep 17 00:00:00 2001
From: Sayed Kaif <metsw24@gmail.com>
Date: Wed, 24 Jun 2026 13:54:46 +0530
Subject: [PATCH 3/3] MINOR: [C++][CSV] reject blocks whose value count
 overflows int32

---
 cpp/src/arrow/csv/parser.cc      | 11 +++++++++++
 cpp/src/arrow/csv/parser_test.cc | 11 +++++++++++
 2 files changed, 22 insertions(+)

diff --git a/cpp/src/arrow/csv/parser.cc b/cpp/src/arrow/csv/parser.cc
index d1405ea01e23..54dc54ae7e82 100644
--- a/cpp/src/arrow/csv/parser.cc
+++ b/cpp/src/arrow/csv/parser.cc
@@ -605,6 +605,17 @@ class BlockParserImpl {
           rows_in_chunk = std::min(kTargetChunkSize, max_num_rows_ - batch_.num_rows_);
         }
 
+        // The values array holds one ParsedValueDesc per cell, and the offsets in
+        // those descriptors are 31-bit, so a chunk's value count must fit in an
+        // int32. A large `num_cols_` (e.g. from a first line with millions of
+        // fields) can exceed that, so error out rather than presize past the limit.
+        if (static_cast<int64_t>(rows_in_chunk) * batch_.num_cols_ >
+            std::numeric_limits<int32_t>::max()) {
+          return Status::Invalid("CSV parser: row group of ", rows_in_chunk, " rows x ",
+                                 batch_.num_cols_,
+                                 " columns exceeds the maximum number of values");
+        }
+
         ARROW_ASSIGN_OR_RAISE(
             auto values_writer,
             PresizedValueDescWriter::Make(pool_, rows_in_chunk, batch_.num_cols_));
diff --git a/cpp/src/arrow/csv/parser_test.cc b/cpp/src/arrow/csv/parser_test.cc
index 0b5e7175093f..abe57dd2e89c 100644
--- a/cpp/src/arrow/csv/parser_test.cc
+++ b/cpp/src/arrow/csv/parser_test.cc
@@ -666,6 +666,17 @@ TEST(BlockParser, MismatchingNumColumns) {
   }
 }
 
+TEST(BlockParser, TooManyValues) {
+  // Constructing the parser with millions of columns (as a first line carrying
+  // that many fields would imply) pushes the per-chunk value count (rows x
+  // columns) past the int32 limit, so parsing must fail instead of overflowing.
+  uint32_t out_size;
+  BlockParser parser(ParseOptions::Defaults(), /*num_cols=*/5000000);
+  Status st = Parse(parser, MakeCSVData({"a,b\n"}), &out_size);
+  EXPECT_RAISES_WITH_MESSAGE_THAT(
+      Invalid, testing::HasSubstr("exceeds the maximum number of values"), st);
+}
+
 TEST(BlockParser, MismatchingNumColumnsHandler) {
   struct CustomHandler {
     operator InvalidRowHandler() {