Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions cpp/src/arrow/util/converter.h
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,9 @@ struct MakeConverterImpl {
DICTIONARY_CASE(FloatType);
DICTIONARY_CASE(DoubleType);
DICTIONARY_CASE(BinaryType);
DICTIONARY_CASE(LargeBinaryType);
DICTIONARY_CASE(StringType);
DICTIONARY_CASE(LargeStringType);
DICTIONARY_CASE(FixedSizeBinaryType);
#undef DICTIONARY_CASE
default:
Expand Down
26 changes: 24 additions & 2 deletions r/src/array_to_vector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -595,7 +595,9 @@ class Converter_Dictionary : public Converter {
case Type::UINT16:
case Type::INT16:
case Type::INT32:
// TODO: also add int64, uint32, uint64 downcasts, if possible
case Type::UINT32:
case Type::INT64:
case Type::UINT64:
break;
default:
cpp11::stop("Cannot convert Dictionary Array of type `%s` to R",
Expand All @@ -612,6 +614,16 @@ class Converter_Dictionary : public Converter {
dictionary_ = CreateEmptyArray(dict_type.value_type());
}
}

// R factors store their codes in 32-bit integers, so dictionary arrays with
// more levels than that cannot be represented safely.
if (dictionary_->length() > std::numeric_limits<int>::max()) {
const auto& dict_type = checked_cast<const DictionaryType&>(*chunked_array->type());
cpp11::stop(
"Cannot convert Dictionary Array of type `%s` to R: dictionary has "
"more levels than an R factor can represent",
dict_type.ToString().c_str());
}
}

SEXP Allocate(R_xlen_t n) const {
Expand Down Expand Up @@ -653,6 +665,15 @@ class Converter_Dictionary : public Converter {
case Type::INT32:
return Ingest_some_nulls_Impl<arrow::Int32Type>(data, array, start, n,
chunk_index);
case Type::UINT32:
return Ingest_some_nulls_Impl<arrow::UInt32Type>(data, array, start, n,
chunk_index);
case Type::INT64:
return Ingest_some_nulls_Impl<arrow::Int64Type>(data, array, start, n,
chunk_index);
case Type::UINT64:
return Ingest_some_nulls_Impl<arrow::UInt64Type>(data, array, start, n,
chunk_index);
default:
break;
}
Expand Down Expand Up @@ -704,7 +725,8 @@ class Converter_Dictionary : public Converter {
// TODO (npr): this coercion should be optional, "dictionariesAsFactors" ;)
// Alternative: preserve the logical type of the dictionary values
// (e.g. if dict is timestamp, return a POSIXt R vector, not factor)
if (dictionary_->type_id() != Type::STRING) {
if (dictionary_->type_id() != Type::STRING &&
dictionary_->type_id() != Type::LARGE_STRING) {
cpp11::safe[Rf_warning]("Coercing dictionary values to R character factor levels");
}

Expand Down
4 changes: 2 additions & 2 deletions r/src/r_to_arrow.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1029,8 +1029,8 @@ class RDictionaryConverter<ValueType, enable_if_has_string_view<ValueType>>

// first we need to handle the levels
SEXP levels = Rf_getAttrib(x, R_LevelsSymbol);
auto memo_chunked_chunked_array =
arrow::r::vec_to_arrow_ChunkedArray(levels, utf8(), false);
auto memo_chunked_chunked_array = arrow::r::vec_to_arrow_ChunkedArray(
levels, this->dict_type_->value_type(), false);
for (const auto& chunk : memo_chunked_chunked_array->chunks()) {
RETURN_NOT_OK(this->value_builder_->InsertMemoValues(*chunk));
}
Expand Down
16 changes: 8 additions & 8 deletions r/tests/testthat/test-Table.R
Original file line number Diff line number Diff line change
Expand Up @@ -357,18 +357,18 @@ test_that("Table handles null type (ARROW-7064)", {

test_that("Can create table with specific dictionary types", {
fact <- example_data[, "fct"]
int_types <- c(int8(), int16(), int32(), int64())
# TODO: test uint types when format allows
# uint_types <- c(uint8(), uint16(), uint32(), uint64()) # nolint
for (i in int_types) {
index_types <- c(int8(), int16(), int32(), int64(), uint8(), uint16(), uint32(), uint64())
for (i in index_types) {
sch <- schema(fct = dictionary(i, utf8()))
tab <- Table$create(fact, schema = sch)
expect_equal(sch, tab$schema)
if (i != int64()) {
# TODO: same downcast to int32 as we do for int64() type elsewhere
expect_equal_data_frame(tab, fact)
}
expect_equal_data_frame(tab, fact)
}

# large_utf8 values exercise the non-ALTREP conversion path
tab_large <- Table$create(fact, schema = schema(fct = dictionary(uint32(), large_utf8())))
expect_equal(tab_large$schema, schema(fct = dictionary(uint32(), large_utf8())))
expect_equal_data_frame(tab_large, fact)
})

test_that("Table unifies dictionary on conversion back to R (ARROW-8374)", {
Expand Down
Loading