Skip to content
Open
66 changes: 64 additions & 2 deletions src/iceberg/json_serde.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
#include <nlohmann/json.hpp>

#include "iceberg/constants.h"
#include "iceberg/expression/json_serde_internal.h"
#include "iceberg/expression/literal.h"
#include "iceberg/json_serde_internal.h"
#include "iceberg/name_mapping.h"
#include "iceberg/partition_field.h"
Expand All @@ -49,6 +51,7 @@
#include "iceberg/util/json_util_internal.h"
#include "iceberg/util/macros.h"
#include "iceberg/util/string_util.h"
#include "iceberg/util/temporal_util.h"
#include "iceberg/util/timepoint.h"

namespace iceberg {
Expand Down Expand Up @@ -307,6 +310,15 @@ nlohmann::json ToJson(const SchemaField& field) {
if (!field.doc().empty()) {
json[kDoc] = field.doc();
}
// Defaults are validated to be primitive literals matching the field type, so
Comment thread
huan233usc marked this conversation as resolved.
// single-value serialization cannot fail here.
if (field.initial_default().has_value()) {
ICEBERG_ASSIGN_OR_THROW(json[kInitialDefault],
ToJson(field.initial_default()->get()));
}
if (field.write_default().has_value()) {
ICEBERG_ASSIGN_OR_THROW(json[kWriteDefault], ToJson(field.write_default()->get()));
}
return json;
}

Expand All @@ -319,7 +331,6 @@ nlohmann::json ToJson(const Type& type) {
nlohmann::json fields_json = nlohmann::json::array();
for (const auto& field : struct_type.fields()) {
fields_json.push_back(ToJson(field));
// TODO(gangwu): add default values
}
json[kFields] = fields_json;
return json;
Expand Down Expand Up @@ -554,16 +565,67 @@ Result<std::unique_ptr<Type>> TypeFromJson(const nlohmann::json& json) {
}
}

namespace {

// Checks that a `timestamptz` / `timestamptz_ns` default value is written with a UTC
// offset, as the spec's JSON single-value form requires for default values.
//
// The shared timestamp parser accepts any offset and silently normalizes it to UTC.
// Without this check C++ would accept default metadata that Java rejects and would
// then rewrite the offset on serialization. The check runs at parse time because that
// is where the original offset is still visible.
//
// Returns success for every other type and for non-string values (LiteralFromJson is
// left to report that mismatch), propagates any error from
// TemporalUtils::IsUtcOffset, and returns a JsonParseError for a non-UTC offset.
Status ValidateTimestamptzDefaultIsUtc(const Type& type, const nlohmann::json& value) {
  const auto id = type.type_id();
  const bool is_timestamptz =
      id == TypeId::kTimestampTz || id == TypeId::kTimestampTzNs;
  if (!is_timestamptz || !value.is_string()) {
    // Nothing to check here: either the type carries no offset, or the value is not a
    // string and LiteralFromJson will report the type mismatch.
    return {};
  }

  const auto text = value.get<std::string>();
  ICEBERG_ASSIGN_OR_RAISE(bool utc_offset, TemporalUtils::IsUtcOffset(text));
  if (utc_offset) {
    return {};
  }
  return JsonParseError(
      "Invalid timestamptz default '{}' for {}: default values must use UTC "
      "(offset 'Z' or '+00:00')",
      text, type.ToString());
}

} // namespace

Result<std::unique_ptr<SchemaField>> FieldFromJson(const nlohmann::json& json) {
ICEBERG_ASSIGN_OR_RAISE(
auto type, GetJsonValue<nlohmann::json>(json, kType).and_then(TypeFromJson));
ICEBERG_ASSIGN_OR_RAISE(auto field_id, GetJsonValue<int32_t>(json, kId));
ICEBERG_ASSIGN_OR_RAISE(auto name, GetJsonValue<std::string>(json, kName));
ICEBERG_ASSIGN_OR_RAISE(auto required, GetJsonValue<bool>(json, kRequired));
ICEBERG_ASSIGN_OR_RAISE(auto doc, GetJsonValueOrDefault<std::string>(json, kDoc));
ICEBERG_ASSIGN_OR_RAISE(std::optional<nlohmann::json> initial_default_json,
GetJsonValueOptional<nlohmann::json>(json, kInitialDefault));
ICEBERG_ASSIGN_OR_RAISE(std::optional<nlohmann::json> write_default_json,
GetJsonValueOptional<nlohmann::json>(json, kWriteDefault));

std::shared_ptr<const Literal> initial_default;
if (initial_default_json.has_value()) {
ICEBERG_RETURN_UNEXPECTED(
ValidateTimestamptzDefaultIsUtc(*type, *initial_default_json));
ICEBERG_ASSIGN_OR_RAISE(Literal literal,
LiteralFromJson(*initial_default_json, type.get()));

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Default-value parsing should enforce the JSON single-value rule for timestamptz and timestamptz_ns. The shared timestamp parser accepts offsets such as +05:00, but the spec and Java SingleValueParser only accept UTC (+00:00) for these default values. As written, C++ can accept schema metadata that Java rejects and then silently normalize it on write.

@huan233usc huan233usc Jun 23, 2026

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in 180a9f9 — FieldFromJson now rejects non-UTC offsets for timestamptz/timestamptz_ns default values (new TemporalUtils::IsUtcOffset, which reuses the existing timezone-suffix parser). This follows Java, if I understand correctly.

initial_default = std::make_shared<const Literal>(std::move(literal));
}
std::shared_ptr<const Literal> write_default;
if (write_default_json.has_value()) {
ICEBERG_RETURN_UNEXPECTED(
ValidateTimestamptzDefaultIsUtc(*type, *write_default_json));
ICEBERG_ASSIGN_OR_RAISE(Literal literal,
LiteralFromJson(*write_default_json, type.get()));
write_default = std::make_shared<const Literal>(std::move(literal));
}
Comment on lines +610 to +624

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The deserialization first constructs a bare SchemaField, then conditionally calls WithInitialDefault/WithWriteDefault, each of which copies the entire field (including the shared_ptr<Type>). This is an unnecessary intermediate copy.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed — FieldFromJson now parses the defaults first and builds the field in one construction. Intermediate copy gone.


return std::make_unique<SchemaField>(field_id, std::move(name), std::move(type),
!required, doc);
!required, doc, std::move(initial_default),
std::move(write_default));
}

Result<std::unique_ptr<Schema>> SchemaFromJson(const nlohmann::json& json) {
Expand Down
26 changes: 23 additions & 3 deletions src/iceberg/schema.cc
Original file line number Diff line number Diff line change
Expand Up @@ -116,9 +116,15 @@ std::shared_ptr<Type> ReassignTypeIds(const std::shared_ptr<Type>& type,
// Builds a copy of `field` that carries `new_id` and whose nested type IDs have been
// rewritten by ReassignTypeIds. Name, optionality, doc and both default values are
// carried over from the source field.
SchemaField ReassignField(const SchemaField& field, int32_t new_id,
                          const Schema::GetId& get_id, Schema::IdMap& ids_to_reassigned,
                          Schema::IdMap& ids_to_original) {
  // Reassigning IDs only rewrites the field ID and nested type IDs; share the field's
  // (immutable) default values rather than copying them.
  return {new_id,
          std::string(field.name()),
          ReassignTypeIds(field.type(), get_id, ids_to_reassigned, ids_to_original),
          field.optional(),
          std::string(field.doc()),
          field.initial_default_ptr(),
          field.write_default_ptr()};
}

std::vector<SchemaField> ReassignIds(std::vector<SchemaField> fields,
Expand Down Expand Up @@ -447,7 +453,21 @@ Status Schema::Validate(int32_t format_version) const {
}
}

// TODO(GuoTao.yu): Check default values when they are supported
// Only the initial-default is gated on format version: it changes how existing
// data files are read (rows written before the column existed materialize this
// value), so it requires the v3 reader contract. A write-default only affects
// values written going forward and does not reinterpret existing data.
if (field.initial_default().has_value() &&
format_version < TableMetadata::kMinFormatVersionDefaultValues) {
return InvalidSchema(
"Invalid initial default for {}: non-null default ({}) is not supported "
"until v{}",
field.name(), field.initial_default()->get(),
TableMetadata::kMinFormatVersionDefaultValues);
}
if (field.initial_default().has_value() || field.write_default().has_value()) {
ICEBERG_RETURN_UNEXPECTED(field.Validate());
}
Comment thread
huan233usc marked this conversation as resolved.
}

return {};
Expand Down
90 changes: 87 additions & 3 deletions src/iceberg/schema_field.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,19 +21,26 @@

#include <format>
#include <string_view>
#include <utility>

#include "iceberg/expression/literal.h"
#include "iceberg/type.h"
#include "iceberg/util/formatter.h" // IWYU pragma: keep
#include "iceberg/util/macros.h"

namespace iceberg {

// Constructs a field from its ID, name, type, nullability, documentation and the two
// optional default values.
//
// `type`, `initial_default` and `write_default` are taken by value and moved into the
// corresponding members. A null `initial_default` / `write_default` means the field
// has no such default.
SchemaField::SchemaField(int32_t field_id, std::string_view name,
                         std::shared_ptr<Type> type, bool optional, std::string_view doc,
                         std::shared_ptr<const Literal> initial_default,
                         std::shared_ptr<const Literal> write_default)
    : field_id_(field_id),
      name_(name),
      type_(std::move(type)),
      optional_(optional),
      doc_(doc),
      initial_default_(std::move(initial_default)),
      write_default_(std::move(write_default)) {}

SchemaField SchemaField::MakeOptional(int32_t field_id, std::string_view name,
std::shared_ptr<Type> type, std::string_view doc) {
Expand All @@ -55,13 +62,76 @@ bool SchemaField::optional() const { return optional_; }

std::string_view SchemaField::doc() const { return doc_; }

// Returns a non-owning reference to the `initial-default` literal, or std::nullopt
// when the field holds no such value.
std::optional<std::reference_wrapper<const Literal>> SchemaField::initial_default()
    const {
  if (initial_default_) {
    return std::cref(*initial_default_);
  }
  return std::nullopt;
}

// Returns a non-owning reference to the `write-default` literal, or std::nullopt when
// the field holds no such value.
std::optional<std::reference_wrapper<const Literal>> SchemaField::write_default() const {
  if (write_default_) {
    return std::cref(*write_default_);
  }
  return std::nullopt;
}

// Exposes the owning pointer behind initial_default(); it is null when the field has
// no `initial-default`. The pointer is returned by const reference, so copying the
// result yields another owner of the same Literal rather than a copy of the value.
const std::shared_ptr<const Literal>& SchemaField::initial_default_ptr() const {
return initial_default_;
}

// Exposes the owning pointer behind write_default(); it is null when the field has no
// `write-default`. The pointer is returned by const reference, so copying the result
// yields another owner of the same Literal rather than a copy of the value.
const std::shared_ptr<const Literal>& SchemaField::write_default_ptr() const {
return write_default_;
}

namespace {

Status ValidateDefault(const SchemaField& field, const Literal& value,
std::string_view kind) {
if (value.IsNull() || value.IsAboveMax() || value.IsBelowMin()) {
return InvalidSchema("Invalid {} value for {}: must be a non-null value", kind,
field.name());
}
// Defaults are only supported on primitive fields. The spec also permits JSON
// single-value defaults for struct/list/map (e.g. an empty struct `{}` whose
// sub-field defaults live in field metadata); that matches the current Java model's
// gap and is left as a follow-up.
if (field.type() == nullptr || !field.type()->is_primitive()) {
return InvalidSchema(
"Invalid {} value for {}: default values are only supported for primitive types",

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This rejects all non-primitive defaults. The spec allows JSON single-value defaults for structs/lists/maps, and a struct field may have a non-null empty struct default ({}) while sub-field defaults live in field metadata. If C++ is intentionally matching the current Java model for this first patch, please call out that this remains a spec gap; otherwise this will reject legal v3 schema metadata.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Intentional for this patch — it matches Java's current primitive-only model. Added a comment in ValidateDefault marking struct/list/map single-value defaults as a known spec gap / follow-up (180a9f9).

kind, field.name());
}
// Match Java (Types.NestedField), which casts the default literal to the field type
// instead of requiring an exact type match (e.g. an int default on a long field, or
// a string default on a date/timestamp/uuid field). Reject only defaults that cannot
// be cast to the field type or fall outside its range (CastTo signals out-of-range as
// an above-max/below-min sentinel).
auto field_type = std::static_pointer_cast<PrimitiveType>(field.type());
auto cast = value.CastTo(field_type);
if (!cast.has_value() || cast->IsAboveMax() || cast->IsBelowMin()) {
return InvalidSchema("{} of field {} has type {} that cannot be cast to {}", kind,
field.name(), *value.type(), *field.type());
}
return {};
}

} // namespace

// Validates this field in isolation: the name must be non-empty, the type must be
// set, and each default value that is present must pass ValidateDefault. Returns the
// first failure encountered, checking `initial-default` before `write-default`.
Status SchemaField::Validate() const {
  if (name_.empty()) [[unlikely]] {
    return InvalidSchema("SchemaField cannot have empty name");
  }
  if (!type_) [[unlikely]] {
    return InvalidSchema("SchemaField cannot have null type");
  }

  // An absent default is trivially valid; a present one is checked against this field.
  const auto check_default = [this](const std::shared_ptr<const Literal>& value,
                                    std::string_view kind) -> Status {
    if (!value) {
      return {};
    }
    return ValidateDefault(*this, *value, kind);
  };

  ICEBERG_RETURN_UNEXPECTED(check_default(initial_default_, "initial-default"));
  ICEBERG_RETURN_UNEXPECTED(check_default(write_default_, "write-default"));
  return {};
}

Expand All @@ -72,9 +142,23 @@ std::string SchemaField::ToString() const {
return result;
}

namespace {

// Compares two optional default values: two absent defaults are equal, an absent and
// a present default are not, and two present defaults are compared by value.
bool DefaultEquals(const std::shared_ptr<const Literal>& lhs,
                   const std::shared_ptr<const Literal>& rhs) {
  if (lhs && rhs) {
    return *lhs == *rhs;
  }
  // At least one side is absent, so they match only when both are.
  return !lhs && !rhs;
}

} // namespace

// Two fields are equal when their ID, name, type (compared by value), optionality and
// both default values match. The doc string is not part of the comparison.
// Both `type_` pointers are dereferenced unconditionally, so both must be non-null.
bool SchemaField::Equals(const SchemaField& other) const {
  return field_id_ == other.field_id_ && name_ == other.name_ && *type_ == *other.type_ &&
         optional_ == other.optional_ &&
         DefaultEquals(initial_default_, other.initial_default_) &&
         DefaultEquals(write_default_, other.write_default_);
}

} // namespace iceberg
41 changes: 40 additions & 1 deletion src/iceberg/schema_field.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,9 @@
/// type (e.g. a struct).

#include <cstdint>
#include <functional>
#include <memory>
#include <optional>
#include <string>
#include <string_view>

Expand All @@ -46,8 +48,14 @@ class ICEBERG_EXPORT SchemaField : public iceberg::util::Formattable {
/// \param[in] type The field type.
/// \param[in] optional Whether values of this field are required or nullable.
/// \param[in] doc Optional documentation string for the field.
/// \param[in] initial_default The v3 `initial-default` value, or null if absent. The
/// field shares ownership of the (immutable) value.
/// \param[in] write_default The v3 `write-default` value, or null if absent. The field
/// shares ownership of the (immutable) value.
SchemaField(int32_t field_id, std::string_view name, std::shared_ptr<Type> type,
bool optional, std::string_view doc = {});
bool optional, std::string_view doc = {},
std::shared_ptr<const Literal> initial_default = nullptr,
std::shared_ptr<const Literal> write_default = nullptr);

/// \brief Construct an optional (nullable) field.
static SchemaField MakeOptional(int32_t field_id, std::string_view name,
Expand All @@ -71,6 +79,32 @@ class ICEBERG_EXPORT SchemaField : public iceberg::util::Formattable {
/// \brief Get the field documentation.
std::string_view doc() const;

/// \brief Get the default value for this field used when reading rows written
/// before the field existed (v3 `initial-default`). Empty if absent.
///
/// The returned reference is a non-owning view into a value owned by this field;
/// it remains valid for the lifetime of this SchemaField.
[[nodiscard]] std::optional<std::reference_wrapper<const Literal>> initial_default()
const;

/// \brief Get the default value for this field used when a writer does not
/// supply a value (v3 `write-default`). Empty if absent.
///
/// The returned reference is a non-owning view into a value owned by this field;
/// it remains valid for the lifetime of this SchemaField.
[[nodiscard]] std::optional<std::reference_wrapper<const Literal>> write_default()
const;

/// \brief Get the shared owning pointer to the `initial-default` value, or null if
/// absent. Prefer initial_default() for reading; this exists so a rebuilt field can
/// share the (immutable) value rather than copy it.
[[nodiscard]] const std::shared_ptr<const Literal>& initial_default_ptr() const;

/// \brief Get the shared owning pointer to the `write-default` value, or null if
/// absent. Prefer write_default() for reading; this exists so a rebuilt field can
/// share the (immutable) value rather than copy it.
[[nodiscard]] const std::shared_ptr<const Literal>& write_default_ptr() const;

[[nodiscard]] std::string ToString() const override;

Status Validate() const;
Expand Down Expand Up @@ -100,6 +134,11 @@ class ICEBERG_EXPORT SchemaField : public iceberg::util::Formattable {
std::shared_ptr<Type> type_;
bool optional_;
std::string doc_;
// Default values are owned by this field and never mutated after being set; copies
// of the field share the same payload (reference-counted) instead of deep-copying,
// like `type_` above. Sharing is unobservable because the payload is immutable.
std::shared_ptr<const Literal> initial_default_;
std::shared_ptr<const Literal> write_default_;
Comment on lines +140 to +141

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ReassignField constructs a new SchemaField via the 5-argument constructor which initializes initial_default_ and write_default_ to nullptr. When schema IDs are reassigned (e.g., copying a schema with fresh IDs via the Schema(get_id) path), all default values on fields are silently lost. We should copy all field properties including initialDefault and writeDefault.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch, confirmed. Defaults are now constructor args, and ReassignField passes the source field's initial_default_ptr()/write_default_ptr() through, so they're shared with the reassigned field, not lost. Added ReassignIdsPreservesDefaultValues.

};

} // namespace iceberg
6 changes: 5 additions & 1 deletion src/iceberg/schema_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -172,10 +172,14 @@ Result<FieldProjection> ProjectNested(const Type& expected_type, const Type& sou
iter->second.local_index, prune_source));
} else if (MetadataColumns::IsMetadataColumn(field_id)) {
child_projection.kind = FieldProjection::Kind::kMetadata;
} else if (expected_field.initial_default().has_value()) {
// Rows written before the field existed assume its `initial-default` value.
child_projection.kind = FieldProjection::Kind::kDefault;
child_projection.from = expected_field.initial_default()->get();
} else if (expected_field.optional()) {
child_projection.kind = FieldProjection::Kind::kNull;
} else {
// TODO(gangwu): support default value for v3 and constant value
// TODO(gangwu): support constant value
return InvalidSchema("Missing required field: {}", expected_field.ToString());
}
result.children.emplace_back(std::move(child_projection));
Expand Down
Loading
Loading