From a0fb656b72c82bc138bfcbb75a1db3efa4d53516 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Sat, 11 Apr 2026 17:10:51 +0100 Subject: [PATCH 1/7] Allow dictionaries from a wider range of types for indices --- r/src/array_to_vector.cpp | 23 ++++++++++++++++++++++- r/tests/testthat/test-Table.R | 16 ++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp index 432b49503e1a..7af710bc7f32 100644 --- a/r/src/array_to_vector.cpp +++ b/r/src/array_to_vector.cpp @@ -595,7 +595,9 @@ class Converter_Dictionary : public Converter { case Type::UINT16: case Type::INT16: case Type::INT32: - // TODO: also add int64, uint32, uint64 downcasts, if possible + case Type::UINT32: + case Type::INT64: + case Type::UINT64: break; default: cpp11::stop("Cannot convert Dictionary Array of type `%s` to R", @@ -612,6 +614,16 @@ class Converter_Dictionary : public Converter { dictionary_ = CreateEmptyArray(dict_type.value_type()); } } + + // R factors store their codes in 32-bit integers, so dictionary arrays with + // more levels than that cannot be represented safely. + if (dictionary_->length() > std::numeric_limits::max()) { + const auto& dict_type = checked_cast(*chunked_array->type()); + cpp11::stop( + "Cannot convert Dictionary Array of type `%s` to R: dictionary has " + "more levels than an R factor can represent", + dict_type.ToString().c_str()); + } } SEXP Allocate(R_xlen_t n) const { @@ -653,6 +665,15 @@ class Converter_Dictionary : public Converter { case Type::INT32: return Ingest_some_nulls_Impl(data, array, start, n, chunk_index); + case Type::UINT32: + return Ingest_some_nulls_Impl(data, array, start, n, + chunk_index); + case Type::INT64: + return Ingest_some_nulls_Impl(data, array, start, n, + chunk_index); + case Type::UINT64: + return Ingest_some_nulls_Impl(data, array, start, n, + chunk_index); default: break; } diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R index 1ca8832beb84..e404da1d029e 100644 --- a/r/tests/testthat/test-Table.R +++ b/r/tests/testthat/test-Table.R @@ -371,6 +371,22 @@ test_that("Can create table with specific dictionary types", { } }) +test_that("Table converts dictionary arrays with wider index types back to R", { + fact <- example_data[, "fct"] + + tab_uint32 <- Table$create(fact, schema = schema(fct = dictionary(uint32(), utf8()))) + expect_equal(tab_uint32$schema, schema(fct = dictionary(uint32(), utf8()))) + expect_equal_data_frame(tab_uint32, fact) + + tab_int64 <- Table$create(fact, schema = schema(fct = dictionary(int64(), utf8()))) + expect_equal(tab_int64$schema, schema(fct = dictionary(int64(), utf8()))) + expect_equal_data_frame(tab_int64, fact) + + tab_uint64 <- Table$create(fact, schema = schema(fct = dictionary(uint64(), utf8()))) + expect_equal(tab_uint64$schema, schema(fct = dictionary(uint64(), utf8()))) + expect_equal_data_frame(tab_uint64, fact) +}) + test_that("Table unifies dictionary on conversion back to R (ARROW-8374)", { b1 <- record_batch(f = factor(c("a"), levels = c("a", "b"))) b2 <- record_batch(f = factor(c("c"), levels = c("c", "d"))) From 80788aee50d4b54478cdb09d031c4e3c689b8bcf Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 8 Jun 2026 11:48:00 +0100 Subject: [PATCH 2/7] Apply suggestion from @thisisnic --- r/tests/testthat/test-Table.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R index e404da1d029e..f6ca738d5e8c 100644 --- a/r/tests/testthat/test-Table.R +++ b/r/tests/testthat/test-Table.R @@ -371,7 +371,7 @@ test_that("Can create table with specific dictionary types", { } }) -test_that("Table converts dictionary arrays with wider index types back to R", { +test_that("Table converts dictionary arrays with uint32/int64/uint64 index types back to R", { fact <- example_data[, "fct"] tab_uint32 <- Table$create(fact, schema = schema(fct = dictionary(uint32(), utf8()))) From 1e02dcb51a7de95488e34d136fe1b117567f6eeb Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 8 Jun 2026 11:51:04 +0100 Subject: [PATCH 3/7] add coilot suggested test --- r/tests/testthat/test-Table.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R index f6ca738d5e8c..114f8a8477b8 100644 --- a/r/tests/testthat/test-Table.R +++ b/r/tests/testthat/test-Table.R @@ -385,6 +385,10 @@ test_that("Table converts dictionary arrays with uint32/int64/uint64 index types tab_uint64 <- Table$create(fact, schema = schema(fct = dictionary(uint64(), utf8()))) expect_equal(tab_uint64$schema, schema(fct = dictionary(uint64(), utf8()))) expect_equal_data_frame(tab_uint64, fact) + + tab_large <- Table$create(fact, schema = schema(fct = dictionary(uint32(), large_utf8()))) + expect_equal(tab_large$schema, schema(fct = dictionary(uint32(), large_utf8()))) + expect_equal_data_frame(tab_large, fact) }) test_that("Table unifies dictionary on conversion back to R (ARROW-8374)", { From bfcc9c6315878f2f8ef7d77421fed6b676cb49ab Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 8 Jun 2026 12:16:41 +0100 Subject: [PATCH 4/7] Combine tests --- r/tests/testthat/test-Table.R | 28 ++++------------------------ 1 file changed, 4 insertions(+), 24 deletions(-) diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R index 114f8a8477b8..12ef25975c70 100644 --- a/r/tests/testthat/test-Table.R +++ b/r/tests/testthat/test-Table.R @@ -357,35 +357,15 @@ test_that("Table handles null type (ARROW-7064)", { test_that("Can create table with specific dictionary types", { fact <- example_data[, "fct"] - int_types <- c(int8(), int16(), int32(), int64()) - # TODO: test uint types when format allows - # uint_types <- c(uint8(), uint16(), uint32(), uint64()) # nolint - for (i in int_types) { + index_types <- c(int8(), int16(), int32(), int64(), uint8(), uint16(), uint32(), uint64()) + for (i in index_types) { sch <- schema(fct = dictionary(i, utf8())) tab <- Table$create(fact, schema = sch) expect_equal(sch, tab$schema) - if (i != int64()) { - # TODO: same downcast to int32 as we do for int64() type elsewhere - expect_equal_data_frame(tab, fact) - } + expect_equal_data_frame(tab, fact) } -}) - -test_that("Table converts dictionary arrays with uint32/int64/uint64 index types back to R", { - fact <- example_data[, "fct"] - - tab_uint32 <- Table$create(fact, schema = schema(fct = dictionary(uint32(), utf8()))) - expect_equal(tab_uint32$schema, schema(fct = dictionary(uint32(), utf8()))) - expect_equal_data_frame(tab_uint32, fact) - - tab_int64 <- Table$create(fact, schema = schema(fct = dictionary(int64(), utf8()))) - expect_equal(tab_int64$schema, schema(fct = dictionary(int64(), utf8()))) - expect_equal_data_frame(tab_int64, fact) - - tab_uint64 <- Table$create(fact, schema = schema(fct = dictionary(uint64(), utf8()))) - expect_equal(tab_uint64$schema, schema(fct = dictionary(uint64(), utf8()))) - expect_equal_data_frame(tab_uint64, fact) + # large_utf8 values exercise the non-ALTREP conversion path tab_large <- Table$create(fact, schema = schema(fct = dictionary(uint32(), large_utf8()))) expect_equal(tab_large$schema, schema(fct = dictionary(uint32(), large_utf8()))) expect_equal_data_frame(tab_large, fact) From 3f5739453db0f89f7a82f0e410b99f5620b49d80 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 10 Jun 2026 12:27:47 +0100 Subject: [PATCH 5/7] Account for large strings --- r/src/array_to_vector.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp index 7af710bc7f32..0992181acfe8 100644 --- a/r/src/array_to_vector.cpp +++ b/r/src/array_to_vector.cpp @@ -725,7 +725,8 @@ class Converter_Dictionary : public Converter { // TODO (npr): this coercion should be optional, "dictionariesAsFactors" ;) // Alternative: preserve the logical type of the dictionary values // (e.g. if dict is timestamp, return a POSIXt R vector, not factor) - if (dictionary_->type_id() != Type::STRING) { + if (dictionary_->type_id() != Type::STRING && + dictionary_->type_id() != Type::LARGE_STRING) { cpp11::safe[Rf_warning]("Coercing dictionary values to R character factor levels"); } From 0661910b49ab13254d80d482b1b696140a3fdb68 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 10 Jun 2026 12:28:04 +0100 Subject: [PATCH 6/7] don't forget the large ones --- cpp/src/arrow/util/converter.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/arrow/util/converter.h b/cpp/src/arrow/util/converter.h index c23d6ccd9886..d987bf3061fe 100644 --- a/cpp/src/arrow/util/converter.h +++ b/cpp/src/arrow/util/converter.h @@ -238,7 +238,9 @@ struct MakeConverterImpl { DICTIONARY_CASE(FloatType); DICTIONARY_CASE(DoubleType); DICTIONARY_CASE(BinaryType); + DICTIONARY_CASE(LargeBinaryType); DICTIONARY_CASE(StringType); + DICTIONARY_CASE(LargeStringType); DICTIONARY_CASE(FixedSizeBinaryType); #undef DICTIONARY_CASE default: From bc33f7cee6b3134d4b1d06718706efbcee9e266f Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Wed, 10 Jun 2026 13:14:59 +0100 Subject: [PATCH 7/7] don't hardcode levels --- r/src/r_to_arrow.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/src/r_to_arrow.cpp b/r/src/r_to_arrow.cpp index 45d68043af5a..4312572a5ec8 100644 --- a/r/src/r_to_arrow.cpp +++ b/r/src/r_to_arrow.cpp @@ -1029,8 +1029,8 @@ class RDictionaryConverter> // first we need to handle the levels SEXP levels = Rf_getAttrib(x, R_LevelsSymbol); - auto memo_chunked_chunked_array = - arrow::r::vec_to_arrow_ChunkedArray(levels, utf8(), false); + auto memo_chunked_chunked_array = arrow::r::vec_to_arrow_ChunkedArray( + levels, this->dict_type_->value_type(), false); for (const auto& chunk : memo_chunked_chunked_array->chunks()) { RETURN_NOT_OK(this->value_builder_->InsertMemoValues(*chunk)); }