diff --git a/cpp/src/arrow/util/converter.h b/cpp/src/arrow/util/converter.h index c23d6ccd9886..d987bf3061fe 100644 --- a/cpp/src/arrow/util/converter.h +++ b/cpp/src/arrow/util/converter.h @@ -238,7 +238,9 @@ struct MakeConverterImpl { DICTIONARY_CASE(FloatType); DICTIONARY_CASE(DoubleType); DICTIONARY_CASE(BinaryType); + DICTIONARY_CASE(LargeBinaryType); DICTIONARY_CASE(StringType); + DICTIONARY_CASE(LargeStringType); DICTIONARY_CASE(FixedSizeBinaryType); #undef DICTIONARY_CASE default: diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp index 432b49503e1a..0992181acfe8 100644 --- a/r/src/array_to_vector.cpp +++ b/r/src/array_to_vector.cpp @@ -595,7 +595,9 @@ class Converter_Dictionary : public Converter { case Type::UINT16: case Type::INT16: case Type::INT32: - // TODO: also add int64, uint32, uint64 downcasts, if possible + case Type::UINT32: + case Type::INT64: + case Type::UINT64: break; default: cpp11::stop("Cannot convert Dictionary Array of type `%s` to R", @@ -612,6 +614,16 @@ class Converter_Dictionary : public Converter { dictionary_ = CreateEmptyArray(dict_type.value_type()); } } + + // R factors store their codes in 32-bit integers, so dictionary arrays with + // more levels than that cannot be represented safely. + if (dictionary_->length() > std::numeric_limits::max()) { + const auto& dict_type = checked_cast(*chunked_array->type()); + cpp11::stop( + "Cannot convert Dictionary Array of type `%s` to R: dictionary has " + "more levels than an R factor can represent", + dict_type.ToString().c_str()); + } } SEXP Allocate(R_xlen_t n) const { @@ -653,6 +665,15 @@ class Converter_Dictionary : public Converter { case Type::INT32: return Ingest_some_nulls_Impl(data, array, start, n, chunk_index); + case Type::UINT32: + return Ingest_some_nulls_Impl(data, array, start, n, + chunk_index); + case Type::INT64: + return Ingest_some_nulls_Impl(data, array, start, n, + chunk_index); + case Type::UINT64: + return Ingest_some_nulls_Impl(data, array, start, n, + chunk_index); default: break; } @@ -704,7 +725,8 @@ class Converter_Dictionary : public Converter { // TODO (npr): this coercion should be optional, "dictionariesAsFactors" ;) // Alternative: preserve the logical type of the dictionary values // (e.g. if dict is timestamp, return a POSIXt R vector, not factor) - if (dictionary_->type_id() != Type::STRING) { + if (dictionary_->type_id() != Type::STRING && + dictionary_->type_id() != Type::LARGE_STRING) { cpp11::safe[Rf_warning]("Coercing dictionary values to R character factor levels"); } diff --git a/r/src/r_to_arrow.cpp b/r/src/r_to_arrow.cpp index 45d68043af5a..4312572a5ec8 100644 --- a/r/src/r_to_arrow.cpp +++ b/r/src/r_to_arrow.cpp @@ -1029,8 +1029,8 @@ class RDictionaryConverter> // first we need to handle the levels SEXP levels = Rf_getAttrib(x, R_LevelsSymbol); - auto memo_chunked_chunked_array = - arrow::r::vec_to_arrow_ChunkedArray(levels, utf8(), false); + auto memo_chunked_chunked_array = arrow::r::vec_to_arrow_ChunkedArray( + levels, this->dict_type_->value_type(), false); for (const auto& chunk : memo_chunked_chunked_array->chunks()) { RETURN_NOT_OK(this->value_builder_->InsertMemoValues(*chunk)); } diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R index 1ca8832beb84..12ef25975c70 100644 --- a/r/tests/testthat/test-Table.R +++ b/r/tests/testthat/test-Table.R @@ -357,18 +357,18 @@ test_that("Table handles null type (ARROW-7064)", { test_that("Can create table with specific dictionary types", { fact <- example_data[, "fct"] - int_types <- c(int8(), int16(), int32(), int64()) - # TODO: test uint types when format allows - # uint_types <- c(uint8(), uint16(), uint32(), uint64()) # nolint - for (i in int_types) { + index_types <- c(int8(), int16(), int32(), int64(), uint8(), uint16(), uint32(), uint64()) + for (i in index_types) { sch <- schema(fct = dictionary(i, utf8())) tab <- Table$create(fact, schema = sch) expect_equal(sch, tab$schema) - if (i != int64()) { - # TODO: same downcast to int32 as we do for int64() type elsewhere - expect_equal_data_frame(tab, fact) - } + expect_equal_data_frame(tab, fact) } + + # large_utf8 values exercise the non-ALTREP conversion path + tab_large <- Table$create(fact, schema = schema(fct = dictionary(uint32(), large_utf8()))) + expect_equal(tab_large$schema, schema(fct = dictionary(uint32(), large_utf8()))) + expect_equal_data_frame(tab_large, fact) }) test_that("Table unifies dictionary on conversion back to R (ARROW-8374)", {