diff --git a/src/iceberg/catalog/rest/json_serde.cc b/src/iceberg/catalog/rest/json_serde.cc index ac80e9f08..49dbcdd4c 100644 --- a/src/iceberg/catalog/rest/json_serde.cc +++ b/src/iceberg/catalog/rest/json_serde.cc @@ -690,10 +690,10 @@ Result RenameTableRequestFromJson(const nlohmann::json& json } // LoadTableResult (used by CreateTableResponse, LoadTableResponse) -nlohmann::json ToJson(const LoadTableResult& result) { +Result ToJson(const LoadTableResult& result) { nlohmann::json json; SetOptionalStringField(json, kMetadataLocation, result.metadata_location); - json[kMetadata] = ToJson(*result.metadata); + ICEBERG_ASSIGN_OR_RAISE(json[kMetadata], ToJson(*result.metadata)); SetContainerField(json, kConfig, result.config); return json; } @@ -820,12 +820,12 @@ Result ListTablesResponseFromJson(const nlohmann::json& json return response; } -nlohmann::json ToJson(const CreateTableRequest& request) { +Result ToJson(const CreateTableRequest& request) { nlohmann::json json; json[kName] = request.name; SetOptionalStringField(json, kLocation, request.location); if (request.schema) { - json[kSchema] = ToJson(*request.schema); + ICEBERG_ASSIGN_OR_RAISE(json[kSchema], ToJson(*request.schema)); } if (request.partition_spec) { json[kPartitionSpec] = ToJson(*request.partition_spec); @@ -872,7 +872,7 @@ Result CreateTableRequestFromJson(const nlohmann::json& json } // CommitTableRequest serialization -nlohmann::json ToJson(const CommitTableRequest& request) { +Result ToJson(const CommitTableRequest& request) { nlohmann::json json; if (!request.identifier.name.empty()) { json[kIdentifier] = ToJson(request.identifier); @@ -886,7 +886,8 @@ nlohmann::json ToJson(const CommitTableRequest& request) { nlohmann::json updates_json = nlohmann::json::array(); for (const auto& update : request.updates) { - updates_json.push_back(ToJson(*update)); + ICEBERG_ASSIGN_OR_RAISE(auto update_json, ToJson(*update)); + updates_json.push_back(std::move(update_json)); } json[kUpdates] = std::move(updates_json); @@ -932,11 +933,11 @@ Result CommitTableRequestFromJson(const nlohmann::json& json } // CommitTableResponse serialization -nlohmann::json ToJson(const CommitTableResponse& response) { +Result ToJson(const CommitTableResponse& response) { nlohmann::json json; json[kMetadataLocation] = response.metadata_location; if (response.metadata) { - json[kMetadata] = ToJson(*response.metadata); + ICEBERG_ASSIGN_OR_RAISE(json[kMetadata], ToJson(*response.metadata)); } return json; } diff --git a/src/iceberg/catalog/rest/json_serde_internal.h b/src/iceberg/catalog/rest/json_serde_internal.h index 61d2eb6e0..efe81a4cf 100644 --- a/src/iceberg/catalog/rest/json_serde_internal.h +++ b/src/iceberg/catalog/rest/json_serde_internal.h @@ -57,16 +57,38 @@ ICEBERG_DECLARE_JSON_SERDE(GetNamespaceResponse) ICEBERG_DECLARE_JSON_SERDE(UpdateNamespacePropertiesRequest) ICEBERG_DECLARE_JSON_SERDE(UpdateNamespacePropertiesResponse) ICEBERG_DECLARE_JSON_SERDE(ListTablesResponse) -ICEBERG_DECLARE_JSON_SERDE(LoadTableResult) ICEBERG_DECLARE_JSON_SERDE(RegisterTableRequest) ICEBERG_DECLARE_JSON_SERDE(RenameTableRequest) -ICEBERG_DECLARE_JSON_SERDE(CreateTableRequest) -ICEBERG_DECLARE_JSON_SERDE(CommitTableRequest) -ICEBERG_DECLARE_JSON_SERDE(CommitTableResponse) ICEBERG_DECLARE_JSON_SERDE(OAuthTokenResponse) #undef ICEBERG_DECLARE_JSON_SERDE +// These models embed a Schema/TableMetadata whose default-value serialization can fail, +// so their ToJson returns Result. FromJson is declared like the macro-based models above. +ICEBERG_REST_EXPORT Result LoadTableResultFromJson( + const nlohmann::json& json); +template <> +ICEBERG_REST_EXPORT Result FromJson(const nlohmann::json& json); +ICEBERG_REST_EXPORT Result ToJson(const LoadTableResult& model); + +ICEBERG_REST_EXPORT Result CreateTableRequestFromJson( + const nlohmann::json& json); +template <> +ICEBERG_REST_EXPORT Result FromJson(const nlohmann::json& json); +ICEBERG_REST_EXPORT Result ToJson(const CreateTableRequest& model); + +ICEBERG_REST_EXPORT Result CommitTableRequestFromJson( + const nlohmann::json& json); +template <> +ICEBERG_REST_EXPORT Result FromJson(const nlohmann::json& json); +ICEBERG_REST_EXPORT Result ToJson(const CommitTableRequest& model); + +ICEBERG_REST_EXPORT Result CommitTableResponseFromJson( + const nlohmann::json& json); +template <> +ICEBERG_REST_EXPORT Result FromJson(const nlohmann::json& json); +ICEBERG_REST_EXPORT Result ToJson(const CommitTableResponse& model); + ICEBERG_REST_EXPORT Result PlanTableScanResponseFromJson( const nlohmann::json& json, const std::unordered_map>& diff --git a/src/iceberg/catalog/rest/rest_catalog.cc b/src/iceberg/catalog/rest/rest_catalog.cc index 4cb4fd349..6b479ee03 100644 --- a/src/iceberg/catalog/rest/rest_catalog.cc +++ b/src/iceberg/catalog/rest/rest_catalog.cc @@ -667,7 +667,8 @@ Result RestCatalog::CreateTableInternal( .properties = properties, }; - ICEBERG_ASSIGN_OR_RAISE(auto json_request, ToJsonString(ToJson(request))); + ICEBERG_ASSIGN_OR_RAISE(auto request_json, ToJson(request)); + ICEBERG_ASSIGN_OR_RAISE(auto json_request, ToJsonString(request_json)); ICEBERG_ASSIGN_OR_RAISE(const auto response, client_->Post(path, json_request, /*headers=*/{}, *TableErrorHandler::Instance(), session)); @@ -710,7 +711,8 @@ Result RestCatalog::UpdateTableInternal( request.updates.push_back(update->Clone()); } - ICEBERG_ASSIGN_OR_RAISE(auto json_request, ToJsonString(ToJson(request))); + ICEBERG_ASSIGN_OR_RAISE(auto request_json, ToJson(request)); + ICEBERG_ASSIGN_OR_RAISE(auto json_request, ToJsonString(request_json)); ICEBERG_ASSIGN_OR_RAISE(auto is_create, TableRequirements::IsCreate(requirements)); const ErrorHandler* error_handler = TableCommitErrorHandler::Instance().get(); if (is_create) { diff --git a/src/iceberg/json_serde.cc b/src/iceberg/json_serde.cc index a4810d88f..7774f9b2f 100644 --- a/src/iceberg/json_serde.cc +++ b/src/iceberg/json_serde.cc @@ -27,6 +27,8 @@ #include #include "iceberg/constants.h" +#include "iceberg/expression/json_serde_internal.h" +#include "iceberg/expression/literal.h" #include "iceberg/json_serde_internal.h" #include "iceberg/name_mapping.h" #include "iceberg/partition_field.h" @@ -49,6 +51,7 @@ #include "iceberg/util/json_util_internal.h" #include "iceberg/util/macros.h" #include "iceberg/util/string_util.h" +#include "iceberg/util/temporal_util.h" #include "iceberg/util/timepoint.h" namespace iceberg { @@ -315,19 +318,26 @@ Result> SortOrderFromJson(const nlohmann::json& json) return SortOrder::Make(parsed.order_id, std::move(parsed.fields)); } -nlohmann::json ToJson(const SchemaField& field) { +Result ToJson(const SchemaField& field) { nlohmann::json json; json[kId] = field.field_id(); json[kName] = field.name(); json[kRequired] = !field.optional(); - json[kType] = ToJson(*field.type()); + ICEBERG_ASSIGN_OR_RAISE(json[kType], ToJson(*field.type())); if (!field.doc().empty()) { json[kDoc] = field.doc(); } + if (field.initial_default().has_value()) { + ICEBERG_ASSIGN_OR_RAISE(json[kInitialDefault], + ToJson(field.initial_default()->get())); + } + if (field.write_default().has_value()) { + ICEBERG_ASSIGN_OR_RAISE(json[kWriteDefault], ToJson(field.write_default()->get())); + } return json; } -nlohmann::json ToJson(const Type& type) { +Result ToJson(const Type& type) { switch (type.type_id()) { case TypeId::kStruct: { const auto& struct_type = internal::checked_cast(type); @@ -335,8 +345,8 @@ nlohmann::json ToJson(const Type& type) { json[kType] = kStruct; nlohmann::json fields_json = nlohmann::json::array(); for (const auto& field : struct_type.fields()) { - fields_json.push_back(ToJson(field)); - // TODO(gangwu): add default values + ICEBERG_ASSIGN_OR_RAISE(auto field_json, ToJson(field)); + fields_json.push_back(std::move(field_json)); } json[kFields] = fields_json; return json; @@ -349,7 +359,7 @@ nlohmann::json ToJson(const Type& type) { const auto& element_field = list_type.fields().front(); json[kElementId] = element_field.field_id(); json[kElementRequired] = !element_field.optional(); - json[kElement] = ToJson(*element_field.type()); + ICEBERG_ASSIGN_OR_RAISE(json[kElement], ToJson(*element_field.type())); return json; } case TypeId::kMap: { @@ -359,12 +369,12 @@ nlohmann::json ToJson(const Type& type) { const auto& key_field = map_type.key(); json[kKeyId] = key_field.field_id(); - json[kKey] = ToJson(*key_field.type()); + ICEBERG_ASSIGN_OR_RAISE(json[kKey], ToJson(*key_field.type())); const auto& value_field = map_type.value(); json[kValueId] = value_field.field_id(); json[kValueRequired] = !value_field.optional(); - json[kValue] = ToJson(*value_field.type()); + ICEBERG_ASSIGN_OR_RAISE(json[kValue], ToJson(*value_field.type())); return json; } case TypeId::kBoolean: @@ -416,8 +426,9 @@ nlohmann::json ToJson(const Type& type) { std::unreachable(); } -nlohmann::json ToJson(const Schema& schema) { - nlohmann::json json = ToJson(internal::checked_cast(schema)); +Result ToJson(const Schema& schema) { + ICEBERG_ASSIGN_OR_RAISE(nlohmann::json json, + ToJson(internal::checked_cast(schema))); json[kSchemaId] = schema.schema_id(); if (!schema.IdentifierFieldIds().empty()) { json[kIdentifierFieldIds] = schema.IdentifierFieldIds(); @@ -426,7 +437,8 @@ nlohmann::json ToJson(const Schema& schema) { } Result ToJsonString(const Schema& schema) { - return ToJsonString(ToJson(schema)); + ICEBERG_ASSIGN_OR_RAISE(auto json, ToJson(schema)); + return ToJsonString(json); } nlohmann::json ToJson(const SnapshotRef& ref) { @@ -625,6 +637,34 @@ Result> TypeFromJson(const nlohmann::json& json) { } } +namespace { + +// The spec's JSON single-value form for `timestamptz` / `timestamptz_ns` default +// values requires a UTC offset. The shared timestamp parser accepts any offset and +// silently normalizes to UTC, which would let C++ accept default metadata that Java +// rejects and then rewrite the offset on serialization. Enforce UTC for these +// defaults at parse time, where the original offset is still visible. +Status ValidateTimestamptzDefaultIsUtc(const Type& type, const nlohmann::json& value) { + const auto type_id = type.type_id(); + if (type_id != TypeId::kTimestampTz && type_id != TypeId::kTimestampTzNs) { + return {}; + } + if (!value.is_string()) { + return JsonParseError("Invalid timestamptz default {} for {}: expected a string", + SafeDumpJson(value), type.ToString()); + } + const auto str = value.get(); + ICEBERG_ASSIGN_OR_RAISE(bool is_utc, TemporalUtils::IsUtcOffset(str)); + if (!is_utc) { + return JsonParseError( + "Invalid timestamptz default '{}' for {}: default values must use a UTC offset", + str, type.ToString()); + } + return {}; +} + +} // namespace + Result> FieldFromJson(const nlohmann::json& json) { ICEBERG_ASSIGN_OR_RAISE( auto type, GetJsonValue(json, kType).and_then(TypeFromJson)); @@ -632,9 +672,31 @@ Result> FieldFromJson(const nlohmann::json& json) { ICEBERG_ASSIGN_OR_RAISE(auto name, GetJsonValue(json, kName)); ICEBERG_ASSIGN_OR_RAISE(auto required, GetJsonValue(json, kRequired)); ICEBERG_ASSIGN_OR_RAISE(auto doc, GetJsonValueOrDefault(json, kDoc)); + ICEBERG_ASSIGN_OR_RAISE(auto initial_default_json, + GetJsonValueOptional(json, kInitialDefault)); + ICEBERG_ASSIGN_OR_RAISE(auto write_default_json, + GetJsonValueOptional(json, kWriteDefault)); + + std::shared_ptr initial_default; + if (initial_default_json.has_value()) { + ICEBERG_RETURN_UNEXPECTED( + ValidateTimestamptzDefaultIsUtc(*type, *initial_default_json)); + ICEBERG_ASSIGN_OR_RAISE(Literal literal, + LiteralFromJson(*initial_default_json, type.get())); + initial_default = std::make_shared(std::move(literal)); + } + std::shared_ptr write_default; + if (write_default_json.has_value()) { + ICEBERG_RETURN_UNEXPECTED( + ValidateTimestamptzDefaultIsUtc(*type, *write_default_json)); + ICEBERG_ASSIGN_OR_RAISE(Literal literal, + LiteralFromJson(*write_default_json, type.get())); + write_default = std::make_shared(std::move(literal)); + } return std::make_unique(field_id, std::move(name), std::move(type), - !required, doc); + !required, doc, std::move(initial_default), + std::move(write_default)); } Result> SchemaFromJson(const nlohmann::json& json) { @@ -966,7 +1028,7 @@ Result EncryptedKeyFromJson(const nlohmann::json& json) { }; } -nlohmann::json ToJson(const TableMetadata& table_metadata) { +Result ToJson(const TableMetadata& table_metadata) { nlohmann::json json; json[kFormatVersion] = table_metadata.format_version; @@ -984,7 +1046,7 @@ nlohmann::json ToJson(const TableMetadata& table_metadata) { if (table_metadata.format_version == 1) { for (const auto& schema : table_metadata.schemas) { if (schema->schema_id() == table_metadata.current_schema_id) { - json[kSchema] = ToJson(*schema); + ICEBERG_ASSIGN_OR_RAISE(json[kSchema], ToJson(*schema)); break; } } @@ -992,7 +1054,14 @@ nlohmann::json ToJson(const TableMetadata& table_metadata) { // write the current schema ID and schema list json[kCurrentSchemaId] = table_metadata.current_schema_id; - json[kSchemas] = ToJsonList(table_metadata.schemas); + // Schemas can carry fallible default-value serialization, so the shared ToJsonList + // helper (which assumes infallible ToJson) is not used here. + nlohmann::json schemas_json = nlohmann::json::array(); + for (const auto& schema : table_metadata.schemas) { + ICEBERG_ASSIGN_OR_RAISE(auto schema_json, ToJson(*schema)); + schemas_json.push_back(std::move(schema_json)); + } + json[kSchemas] = std::move(schemas_json); // for older readers, continue writing the default spec as "partition-spec" if (table_metadata.format_version == 1) { @@ -1042,7 +1111,8 @@ nlohmann::json ToJson(const TableMetadata& table_metadata) { } Result ToJsonString(const TableMetadata& table_metadata) { - return ToJsonString(ToJson(table_metadata)); + ICEBERG_ASSIGN_OR_RAISE(auto json, ToJson(table_metadata)); + return ToJsonString(json); } namespace { @@ -1446,7 +1516,7 @@ Result NamespaceFromJson(const nlohmann::json& json) { return ns; } -nlohmann::json ToJson(const TableUpdate& update) { +Result ToJson(const TableUpdate& update) { nlohmann::json json; switch (update.kind()) { case TableUpdate::Kind::kAssignUUID: { @@ -1465,7 +1535,7 @@ nlohmann::json ToJson(const TableUpdate& update) { const auto& u = internal::checked_cast(update); json[kAction] = kActionAddSchema; if (u.schema()) { - json[kSchema] = ToJson(*u.schema()); + ICEBERG_ASSIGN_OR_RAISE(json[kSchema], ToJson(*u.schema())); } else { json[kSchema] = nlohmann::json::value_t::null; } diff --git a/src/iceberg/json_serde_internal.h b/src/iceberg/json_serde_internal.h index 351b41086..1a30e8e8f 100644 --- a/src/iceberg/json_serde_internal.h +++ b/src/iceberg/json_serde_internal.h @@ -93,7 +93,7 @@ ICEBERG_EXPORT Result> SortOrderFromJson( /// /// \param schema The Iceberg schema to convert. /// \return The JSON representation of the schema. -ICEBERG_EXPORT nlohmann::json ToJson(const Schema& schema); +ICEBERG_EXPORT Result ToJson(const Schema& schema); /// \brief Convert an Iceberg Schema to JSON. /// @@ -111,7 +111,7 @@ ICEBERG_EXPORT Result> SchemaFromJson(const nlohmann::js /// /// \param type The Iceberg type to convert. /// \return The JSON representation of the type. -ICEBERG_EXPORT nlohmann::json ToJson(const Type& type); +ICEBERG_EXPORT Result ToJson(const Type& type); /// \brief Convert JSON to an Iceberg Type. /// @@ -123,7 +123,7 @@ ICEBERG_EXPORT Result> TypeFromJson(const nlohmann::json& /// /// \param field The Iceberg field to convert. /// \return The JSON representation of the field. -ICEBERG_EXPORT nlohmann::json ToJson(const SchemaField& field); +ICEBERG_EXPORT Result ToJson(const SchemaField& field); /// \brief Convert JSON to an Iceberg SchemaField. /// @@ -300,7 +300,7 @@ ICEBERG_EXPORT Result EncryptedKeyFromJson(const nlohmann::json& j /// /// \param table_metadata The `TableMetadata` object to be serialized. /// \return A JSON object representing the `TableMetadata`. -ICEBERG_EXPORT nlohmann::json ToJson(const TableMetadata& table_metadata); +ICEBERG_EXPORT Result ToJson(const TableMetadata& table_metadata); /// \brief Serializes a `TableMetadata` object to JSON. /// @@ -404,7 +404,7 @@ ICEBERG_EXPORT Result NamespaceFromJson(const nlohmann::json& json); /// /// \param update The `TableUpdate` object to be serialized. /// \return A JSON object representing the `TableUpdate`. -ICEBERG_EXPORT nlohmann::json ToJson(const TableUpdate& update); +ICEBERG_EXPORT Result ToJson(const TableUpdate& update); /// \brief Deserializes a JSON object into a `TableUpdate` object. /// diff --git a/src/iceberg/schema.cc b/src/iceberg/schema.cc index fcac43c78..d6347251c 100644 --- a/src/iceberg/schema.cc +++ b/src/iceberg/schema.cc @@ -116,9 +116,15 @@ std::shared_ptr ReassignTypeIds(const std::shared_ptr& type, SchemaField ReassignField(const SchemaField& field, int32_t new_id, const Schema::GetId& get_id, Schema::IdMap& ids_to_reassigned, Schema::IdMap& ids_to_original) { - return {new_id, std::string(field.name()), + // Reassigning IDs only rewrites the field ID and nested type IDs; share the field's + // (immutable) default values rather than copying them. + return {new_id, + std::string(field.name()), ReassignTypeIds(field.type(), get_id, ids_to_reassigned, ids_to_original), - field.optional(), std::string(field.doc())}; + field.optional(), + std::string(field.doc()), + field.initial_default_ptr(), + field.write_default_ptr()}; } std::vector ReassignIds(std::vector fields, @@ -447,7 +453,21 @@ Status Schema::Validate(int32_t format_version) const { } } - // TODO(GuoTao.yu): Check default values when they are supported + // Only the initial-default is gated on format version: it changes how existing + // data files are read (rows written before the column existed materialize this + // value), so it requires the v3 reader contract. A write-default only affects + // values written going forward and does not reinterpret existing data. + if (field.initial_default().has_value() && + format_version < TableMetadata::kMinFormatVersionDefaultValues) { + return InvalidSchema( + "Invalid initial default for {}: non-null default ({}) is not supported " + "until v{}", + field.name(), field.initial_default()->get(), + TableMetadata::kMinFormatVersionDefaultValues); + } + if (field.initial_default().has_value() || field.write_default().has_value()) { + ICEBERG_RETURN_UNEXPECTED(field.Validate()); + } } return {}; diff --git a/src/iceberg/schema_field.cc b/src/iceberg/schema_field.cc index 206915ec2..144f75799 100644 --- a/src/iceberg/schema_field.cc +++ b/src/iceberg/schema_field.cc @@ -21,19 +21,27 @@ #include #include +#include +#include "iceberg/expression/literal.h" #include "iceberg/type.h" +#include "iceberg/util/checked_cast.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep +#include "iceberg/util/macros.h" namespace iceberg { SchemaField::SchemaField(int32_t field_id, std::string_view name, - std::shared_ptr type, bool optional, std::string_view doc) + std::shared_ptr type, bool optional, std::string_view doc, + std::shared_ptr initial_default, + std::shared_ptr write_default) : field_id_(field_id), name_(name), type_(std::move(type)), optional_(optional), - doc_(doc) {} + doc_(doc), + initial_default_(std::move(initial_default)), + write_default_(std::move(write_default)) {} SchemaField SchemaField::MakeOptional(int32_t field_id, std::string_view name, std::shared_ptr type, std::string_view doc) { @@ -55,6 +63,62 @@ bool SchemaField::optional() const { return optional_; } std::string_view SchemaField::doc() const { return doc_; } +std::optional> SchemaField::initial_default() + const { + if (initial_default_ == nullptr) { + return std::nullopt; + } + return std::cref(*initial_default_); +} + +std::optional> SchemaField::write_default() const { + if (write_default_ == nullptr) { + return std::nullopt; + } + return std::cref(*write_default_); +} + +const std::shared_ptr& SchemaField::initial_default_ptr() const { + return initial_default_; +} + +const std::shared_ptr& SchemaField::write_default_ptr() const { + return write_default_; +} + +namespace { + +Status ValidateDefault(const SchemaField& field, const Literal& value, + std::string_view kind) { + if (value.IsNull() || value.IsAboveMax() || value.IsBelowMin()) { + return InvalidSchema("Invalid {} value for {}: must be a non-null value", kind, + field.name()); + } + // Defaults are only supported on primitive fields. The spec also permits JSON + // single-value defaults for struct/list/map (e.g. an empty struct `{}` whose + // sub-field defaults live in field metadata); that matches the current Java model's + // gap and is left as a follow-up. + if (field.type() == nullptr || !field.type()->is_primitive()) { + return InvalidSchema( + "Invalid {} value for {}: default values are only supported for primitive types", + kind, field.name()); + } + // Match Java (Types.NestedField), which casts the default literal to the field type + // instead of requiring an exact type match (e.g. an int default on a long field, or + // a string default on a date/timestamp/uuid field). Reject only defaults that cannot + // be cast to the field type or fall outside its range (CastTo signals out-of-range as + // an above-max/below-min sentinel). + auto field_type = internal::checked_pointer_cast(field.type()); + ICEBERG_ASSIGN_OR_RAISE(auto cast, value.CastTo(field_type)); + if (cast.IsAboveMax() || cast.IsBelowMin()) { + return InvalidSchema("{} of field {} ({}) is out of range for {}", kind, field.name(), + *value.type(), *field.type()); + } + return {}; +} + +} // namespace + Status SchemaField::Validate() const { if (name_.empty()) [[unlikely]] { return InvalidSchema("SchemaField cannot have empty name"); @@ -62,6 +126,13 @@ Status SchemaField::Validate() const { if (type_ == nullptr) [[unlikely]] { return InvalidSchema("SchemaField cannot have null type"); } + if (initial_default_ != nullptr) { + ICEBERG_RETURN_UNEXPECTED( + ValidateDefault(*this, *initial_default_, "initial-default")); + } + if (write_default_ != nullptr) { + ICEBERG_RETURN_UNEXPECTED(ValidateDefault(*this, *write_default_, "write-default")); + } return {}; } @@ -72,9 +143,23 @@ std::string SchemaField::ToString() const { return result; } +namespace { + +bool DefaultEquals(const std::shared_ptr& lhs, + const std::shared_ptr& rhs) { + if (lhs == nullptr || rhs == nullptr) { + return lhs == rhs; + } + return *lhs == *rhs; +} + +} // namespace + bool SchemaField::Equals(const SchemaField& other) const { return field_id_ == other.field_id_ && name_ == other.name_ && *type_ == *other.type_ && - optional_ == other.optional_; + optional_ == other.optional_ && + DefaultEquals(initial_default_, other.initial_default_) && + DefaultEquals(write_default_, other.write_default_); } } // namespace iceberg diff --git a/src/iceberg/schema_field.h b/src/iceberg/schema_field.h index fd20226a5..e90425ca7 100644 --- a/src/iceberg/schema_field.h +++ b/src/iceberg/schema_field.h @@ -24,7 +24,9 @@ /// type (e.g. a struct). #include +#include #include +#include #include #include @@ -46,8 +48,14 @@ class ICEBERG_EXPORT SchemaField : public iceberg::util::Formattable { /// \param[in] type The field type. /// \param[in] optional Whether values of this field are required or nullable. /// \param[in] doc Optional documentation string for the field. + /// \param[in] initial_default The v3 `initial-default` value, or null if absent. The + /// field shares ownership of the (immutable) value. + /// \param[in] write_default The v3 `write-default` value, or null if absent. The field + /// shares ownership of the (immutable) value. SchemaField(int32_t field_id, std::string_view name, std::shared_ptr type, - bool optional, std::string_view doc = {}); + bool optional, std::string_view doc = {}, + std::shared_ptr initial_default = nullptr, + std::shared_ptr write_default = nullptr); /// \brief Construct an optional (nullable) field. static SchemaField MakeOptional(int32_t field_id, std::string_view name, @@ -71,6 +79,32 @@ class ICEBERG_EXPORT SchemaField : public iceberg::util::Formattable { /// \brief Get the field documentation. std::string_view doc() const; + /// \brief Get the default value for this field used when reading rows written + /// before the field existed (v3 `initial-default`). Empty if absent. + /// + /// The returned reference is a non-owning view into a value owned by this field; + /// it remains valid for the lifetime of this SchemaField. + [[nodiscard]] std::optional> initial_default() + const; + + /// \brief Get the default value for this field used when a writer does not + /// supply a value (v3 `write-default`). Empty if absent. + /// + /// The returned reference is a non-owning view into a value owned by this field; + /// it remains valid for the lifetime of this SchemaField. + [[nodiscard]] std::optional> write_default() + const; + + /// \brief Get the shared owning pointer to the `initial-default` value, or null if + /// absent. Prefer initial_default() for reading; this exists so a rebuilt field can + /// share the (immutable) value rather than copy it. + [[nodiscard]] const std::shared_ptr& initial_default_ptr() const; + + /// \brief Get the shared owning pointer to the `write-default` value, or null if + /// absent. Prefer write_default() for reading; this exists so a rebuilt field can + /// share the (immutable) value rather than copy it. + [[nodiscard]] const std::shared_ptr& write_default_ptr() const; + [[nodiscard]] std::string ToString() const override; Status Validate() const; @@ -100,6 +134,11 @@ class ICEBERG_EXPORT SchemaField : public iceberg::util::Formattable { std::shared_ptr type_; bool optional_; std::string doc_; + // Default values are owned by this field and never mutated after being set; copies + // of the field share the same payload (reference-counted) instead of deep-copying, + // like `type_` above. Sharing is unobservable because the payload is immutable. + std::shared_ptr initial_default_; + std::shared_ptr write_default_; }; } // namespace iceberg diff --git a/src/iceberg/schema_util.cc b/src/iceberg/schema_util.cc index 4ff678fc6..5ff828258 100644 --- a/src/iceberg/schema_util.cc +++ b/src/iceberg/schema_util.cc @@ -172,10 +172,14 @@ Result ProjectNested(const Type& expected_type, const Type& sou iter->second.local_index, prune_source)); } else if (MetadataColumns::IsMetadataColumn(field_id)) { child_projection.kind = FieldProjection::Kind::kMetadata; + } else if (expected_field.initial_default().has_value()) { + // Rows written before the field existed assume its `initial-default` value. + child_projection.kind = FieldProjection::Kind::kDefault; + child_projection.from = expected_field.initial_default()->get(); } else if (expected_field.optional()) { child_projection.kind = FieldProjection::Kind::kNull; } else { - // TODO(gangwu): support default value for v3 and constant value + // TODO(gangwu): support constant value return InvalidSchema("Missing required field: {}", expected_field.ToString()); } result.children.emplace_back(std::move(child_projection)); diff --git a/src/iceberg/table_metadata.cc b/src/iceberg/table_metadata.cc index 53e3c231d..ef2f1bf38 100644 --- a/src/iceberg/table_metadata.cc +++ b/src/iceberg/table_metadata.cc @@ -489,7 +489,7 @@ Result TableMetadataUtil::Write(FileIO& io, const TableMetadata* ba Status TableMetadataUtil::Write(FileIO& io, const std::string& location, const TableMetadata& metadata) { - auto json = ToJson(metadata); + ICEBERG_ASSIGN_OR_RAISE(auto json, ToJson(metadata)); ICEBERG_ASSIGN_OR_RAISE(auto json_string, ToJsonString(json)); return io.WriteFile(location, json_string); } diff --git a/src/iceberg/test/json_serde_test.cc b/src/iceberg/test/json_serde_test.cc index 88b44e3ab..c97ed64a6 100644 --- a/src/iceberg/test/json_serde_test.cc +++ b/src/iceberg/test/json_serde_test.cc @@ -23,10 +23,12 @@ #include #include +#include "iceberg/expression/literal.h" #include "iceberg/json_serde_internal.h" #include "iceberg/name_mapping.h" #include "iceberg/partition_spec.h" #include "iceberg/schema.h" +#include "iceberg/schema_field.h" #include "iceberg/snapshot.h" #include "iceberg/sort_field.h" #include "iceberg/sort_order.h" @@ -35,10 +37,12 @@ #include "iceberg/table_update.h" #include "iceberg/test/matchers.h" #include "iceberg/transform.h" +#include "iceberg/type.h" #include "iceberg/util/base64.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep #include "iceberg/util/macros.h" // IWYU pragma: keep #include "iceberg/util/timepoint.h" +#include "iceberg/util/uuid.h" namespace iceberg { @@ -72,6 +76,11 @@ Result> FromJsonHelper(const nlohmann::json& json) return NameMappingFromJson(json); } +template <> +Result> FromJsonHelper(const nlohmann::json& json) { + return FieldFromJson(json); +} + // Helper function to reduce duplication in testing template void TestJsonConversion(const T& obj, const nlohmann::json& expected_json) { @@ -84,8 +93,102 @@ void TestJsonConversion(const T& obj, const nlohmann::json& expected_json) { EXPECT_EQ(obj, *obj_ex.value()) << "Deserialized object mismatch."; } +// ToJson(SchemaField) returns Result, so it cannot use the shared +// TestJsonConversion helper. Unwrap the serialized json before comparing and +// round-tripping. +void TestSchemaFieldJsonConversion(const SchemaField& field, + const nlohmann::json& expected_json) { + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(field)); + EXPECT_EQ(expected_json, json) << "JSON conversion mismatch."; + + auto obj_ex = FieldFromJson(expected_json); + EXPECT_TRUE(obj_ex.has_value()) << "Failed to deserialize JSON."; + EXPECT_EQ(field, *obj_ex.value()) << "Deserialized object mismatch."; +} + } // namespace +// Pins the wire format produced by ToJson(SchemaField) / FieldFromJson for +// `initial-default` and `write-default`, including the absence of the keys when a +// field carries no defaults. +TEST(JsonInternalTest, SchemaFieldDefaultValues) { + // Both defaults present. + SchemaField with_both(/*field_id=*/1, "id", int32(), /*optional=*/false, /*doc=*/{}, + std::make_shared(Literal::Int(42)), + std::make_shared(Literal::Int(7))); + TestSchemaFieldJsonConversion( + with_both, + R"({"id":1,"name":"id","required":true,"type":"int","initial-default":42,"write-default":7})"_json); + + // Only an initial-default; write-default must not appear in the JSON. + SchemaField initial_only(/*field_id=*/2, "name", string(), /*optional=*/true, + /*doc=*/{}, + std::make_shared(Literal::String("n/a")), + /*write_default=*/nullptr); + TestSchemaFieldJsonConversion( + initial_only, + R"({"id":2,"name":"name","required":false,"type":"string","initial-default":"n/a"})"_json); + + // No defaults; neither key may appear. + SchemaField no_defaults(/*field_id=*/3, "plain", int32(), /*optional=*/false); + TestSchemaFieldJsonConversion( + no_defaults, R"({"id":3,"name":"plain","required":true,"type":"int"})"_json); +} + +// Round-trips a field carrying both defaults through ToJson -> FieldFromJson for +// every primitive type, exercising the per-type single-value serialization the +// default path reuses (date/timestamp/decimal/uuid/binary have non-trivial wire +// encodings). +TEST(JsonInternalTest, SchemaFieldDefaultValuesRoundTripAllTypes) { + ICEBERG_UNWRAP_OR_FAIL(auto uuid_value, + Uuid::FromString("f79c3e09-677c-4bbd-a479-3f349cb785e7")); + std::vector, Literal>> cases; + cases.emplace_back(boolean(), Literal::Boolean(true)); + cases.emplace_back(int32(), Literal::Int(-7)); + cases.emplace_back(int64(), Literal::Long(1234567890123LL)); + cases.emplace_back(float32(), Literal::Float(1.5f)); + cases.emplace_back(float64(), Literal::Double(2.5)); + cases.emplace_back(date(), Literal::Date(19738)); + cases.emplace_back(time(), Literal::Time(43200000000LL)); + cases.emplace_back(timestamp(), Literal::Timestamp(1719446400000000LL)); + cases.emplace_back(timestamp_tz(), Literal::TimestampTz(1719446400000000LL)); + cases.emplace_back(timestamp_ns(), Literal::TimestampNs(1719446400000000123LL)); + cases.emplace_back(timestamptz_ns(), Literal::TimestampTzNs(1719446400000000123LL)); + cases.emplace_back(string(), Literal::String("hello")); + cases.emplace_back(decimal(9, 2), Literal::Decimal(12345, 9, 2)); + cases.emplace_back(fixed(3), Literal::Fixed({0x01, 0x02, 0x03})); + cases.emplace_back(binary(), Literal::Binary({0xDE, 0xAD, 0xBE, 0xEF})); + cases.emplace_back(uuid(), Literal::UUID(uuid_value)); + + int32_t field_id = 1; + for (const auto& [type, literal] : cases) { + SchemaField field(field_id++, "f", type, /*optional=*/false, /*doc=*/{}, + std::make_shared(literal), + std::make_shared(literal)); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(field)); + ICEBERG_UNWRAP_OR_FAIL(auto parsed, FieldFromJson(json)); + EXPECT_EQ(field, *parsed) << "round-trip mismatch for type " << type->ToString() + << ", json=" << json.dump(); + } +} + +// The spec only permits UTC offsets for timestamptz / timestamptz_ns default values. +// A non-UTC offset (which the shared parser would silently normalize) must be rejected, +// while the UTC form is accepted. +TEST(JsonInternalTest, SchemaFieldRejectsNonUtcTimestamptzDefault) { + auto non_utc = nlohmann::json::parse( + R"({"id":1,"name":"ts","required":true,"type":"timestamptz","initial-default":"2024-06-27T05:00:00+05:00"})"); + EXPECT_FALSE(FieldFromJson(non_utc).has_value()); + + auto non_utc_ns = nlohmann::json::parse( + R"({"id":1,"name":"ts","required":true,"type":"timestamptz_ns","write-default":"2024-06-27T05:00:00-08:00"})"); + EXPECT_FALSE(FieldFromJson(non_utc_ns).has_value()); + + auto utc = nlohmann::json::parse( + R"({"id":1,"name":"ts","required":true,"type":"timestamptz","initial-default":"2024-06-27T00:00:00+00:00"})"); + EXPECT_TRUE(FieldFromJson(utc).has_value()); +} + TEST(JsonInternalTest, SortField) { auto identity_transform = Transform::Identity(); @@ -325,7 +428,8 @@ TEST(JsonInternalTest, TableUpdateAssignUUID) { nlohmann::json expected = R"({"action":"assign-uuid","uuid":"550e8400-e29b-41d4-a716-446655440000"})"_json; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ(*internal::checked_cast(parsed.value().get()), update); @@ -336,7 +440,8 @@ TEST(JsonInternalTest, TableUpdateUpgradeFormatVersion) { nlohmann::json expected = R"({"action":"upgrade-format-version","format-version":2})"_json; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ(*internal::checked_cast(parsed.value().get()), @@ -350,7 +455,7 @@ TEST(JsonInternalTest, TableUpdateAddSchema) { /*schema_id=*/1); table::AddSchema update(schema, 2); - auto json = ToJson(update); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); EXPECT_EQ(json["action"], "add-schema"); EXPECT_EQ(json["last-column-id"], 2); EXPECT_TRUE(json.contains("schema")); @@ -367,9 +472,10 @@ TEST(JsonInternalTest, TableUpdateAddSchemaWithoutDeprecatedLastColumnId) { std::vector{SchemaField(1, "id", int64(), false), SchemaField(3, "name", string(), true)}, /*schema_id=*/1); + ICEBERG_UNWRAP_OR_FAIL(auto schema_json, ToJson(*schema)); nlohmann::json json = { {"action", "add-schema"}, - {"schema", ToJson(*schema)}, + {"schema", schema_json}, }; auto parsed = TableUpdateFromJson(json); @@ -383,7 +489,8 @@ TEST(JsonInternalTest, TableUpdateSetCurrentSchema) { table::SetCurrentSchema update(1); nlohmann::json expected = R"({"action":"set-current-schema","schema-id":1})"_json; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ(*internal::checked_cast(parsed.value().get()), @@ -397,7 +504,7 @@ TEST(JsonInternalTest, TableUpdateAddPartitionSpec) { PartitionSpec::Make(1, {PartitionField(3, 101, "region", identity_transform)})); table::AddPartitionSpec update(std::move(spec)); - auto json = ToJson(update); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); EXPECT_EQ(json["action"], "add-spec"); EXPECT_TRUE(json.contains("spec")); @@ -411,7 +518,8 @@ TEST(JsonInternalTest, TableUpdateSetDefaultPartitionSpec) { table::SetDefaultPartitionSpec update(2); nlohmann::json expected = R"({"action":"set-default-spec","spec-id":2})"_json; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ( @@ -424,7 +532,8 @@ TEST(JsonInternalTest, TableUpdateRemovePartitionSpecs) { nlohmann::json expected = R"({"action":"remove-partition-specs","spec-ids":[1,2,3]})"_json; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ(*internal::checked_cast(parsed.value().get()), @@ -434,7 +543,7 @@ TEST(JsonInternalTest, TableUpdateRemovePartitionSpecs) { TEST(JsonInternalTest, TableUpdateRemoveSchemas) { table::RemoveSchemas update({1, 2}); - auto json = ToJson(update); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); EXPECT_EQ(json["action"], "remove-schemas"); EXPECT_THAT(json["schema-ids"].get>(), testing::UnorderedElementsAre(1, 2)); @@ -450,7 +559,7 @@ TEST(JsonInternalTest, TableUpdateAddSortOrder) { ICEBERG_UNWRAP_OR_FAIL(auto sort_order, SortOrder::Make(1, {st})); table::AddSortOrder update(std::move(sort_order)); - auto json = ToJson(update); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); EXPECT_EQ(json["action"], "add-sort-order"); EXPECT_TRUE(json.contains("sort-order")); @@ -465,7 +574,8 @@ TEST(JsonInternalTest, TableUpdateSetDefaultSortOrder) { nlohmann::json expected = R"({"action":"set-default-sort-order","sort-order-id":1})"_json; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ(*internal::checked_cast(parsed.value().get()), @@ -483,7 +593,7 @@ TEST(JsonInternalTest, TableUpdateAddSnapshot) { .schema_id = 1}); table::AddSnapshot update(snapshot); - auto json = ToJson(update); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); EXPECT_EQ(json["action"], "add-snapshot"); EXPECT_TRUE(json.contains("snapshot")); @@ -498,7 +608,8 @@ TEST(JsonInternalTest, TableUpdateRemoveSnapshots) { nlohmann::json expected = R"({"action":"remove-snapshots","snapshot-ids":[111,222,333]})"_json; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ(*internal::checked_cast(parsed.value().get()), @@ -510,7 +621,8 @@ TEST(JsonInternalTest, TableUpdateRemoveSnapshotRef) { nlohmann::json expected = R"({"action":"remove-snapshot-ref","ref-name":"my-branch"})"_json; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ(*internal::checked_cast(parsed.value().get()), @@ -521,7 +633,7 @@ TEST(JsonInternalTest, TableUpdateSetSnapshotRefBranch) { table::SetSnapshotRef update("main", 123456789, SnapshotRefType::kBranch, 5, 86400000, 604800000); - auto json = ToJson(update); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); EXPECT_EQ(json["action"], "set-snapshot-ref"); EXPECT_EQ(json["ref-name"], "main"); EXPECT_EQ(json["snapshot-id"], 123456789); @@ -536,7 +648,7 @@ TEST(JsonInternalTest, TableUpdateSetSnapshotRefBranch) { TEST(JsonInternalTest, TableUpdateSetSnapshotRefTag) { table::SetSnapshotRef update("release-1.0", 987654321, SnapshotRefType::kTag); - auto json = ToJson(update); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); EXPECT_EQ(json["action"], "set-snapshot-ref"); EXPECT_EQ(json["type"], "tag"); @@ -549,7 +661,7 @@ TEST(JsonInternalTest, TableUpdateSetSnapshotRefTag) { TEST(JsonInternalTest, TableUpdateSetProperties) { table::SetProperties update({{"key1", "value1"}, {"key2", "value2"}}); - auto json = ToJson(update); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); EXPECT_EQ(json["action"], "set-properties"); EXPECT_TRUE(json.contains("updates")); @@ -578,7 +690,7 @@ TEST(JsonInternalTest, TableUpdateSetPropertiesMissingCanonicalField) { TEST(JsonInternalTest, TableUpdateRemoveProperties) { table::RemoveProperties update({"key1", "key2"}); - auto json = ToJson(update); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); EXPECT_EQ(json["action"], "remove-properties"); EXPECT_TRUE(json.contains("removals")); @@ -610,7 +722,8 @@ TEST(JsonInternalTest, TableUpdateSetLocation) { nlohmann::json expected = R"({"action":"set-location","location":"s3://bucket/warehouse/table"})"_json; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ(*internal::checked_cast(parsed.value().get()), update); @@ -646,7 +759,8 @@ TEST(JsonInternalTest, TableUpdateSetStatistics) { } })"_json; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ(*internal::checked_cast(parsed.value().get()), update); @@ -657,7 +771,8 @@ TEST(JsonInternalTest, TableUpdateRemoveStatistics) { nlohmann::json expected = R"({"action":"remove-statistics","snapshot-id":123456789})"_json; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ(*internal::checked_cast(parsed.value().get()), @@ -681,7 +796,8 @@ TEST(JsonInternalTest, TableUpdateSetPartitionStatistics) { } })"_json; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ(*internal::checked_cast(parsed.value().get()), @@ -693,7 +809,8 @@ TEST(JsonInternalTest, TableUpdateRemovePartitionStatistics) { nlohmann::json expected = R"({"action":"remove-partition-statistics","snapshot-id":123456789})"_json; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ( @@ -719,7 +836,8 @@ TEST(JsonInternalTest, TableUpdateAddEncryptionKey) { {"properties", {{"scope", "table"}}}}}, }; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ(*internal::checked_cast(parsed.value().get()), @@ -738,7 +856,8 @@ TEST(JsonInternalTest, TableUpdateRemoveEncryptionKey) { table::RemoveEncryptionKey update("key-1"); nlohmann::json expected = R"({"action":"remove-encryption-key","key-id":"key-1"})"_json; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ(*internal::checked_cast(parsed.value().get()), diff --git a/src/iceberg/test/metadata_serde_test.cc b/src/iceberg/test/metadata_serde_test.cc index ba47f312d..e214f1606 100644 --- a/src/iceberg/test/metadata_serde_test.cc +++ b/src/iceberg/test/metadata_serde_test.cc @@ -597,7 +597,7 @@ TEST(MetadataSerdeTest, EncryptionKeysRoundTrip) { EXPECT_EQ(metadata.value()->encryption_keys[0].encrypted_key_metadata, "secret-key-metadata"); - auto serialized = ToJson(*metadata.value()); + ICEBERG_UNWRAP_OR_FAIL(auto serialized, ToJson(*metadata.value())); ASSERT_TRUE(serialized.contains("encryption-keys")); EXPECT_EQ(serialized["encryption-keys"], metadata_json["encryption-keys"]); } diff --git a/src/iceberg/test/schema_json_test.cc b/src/iceberg/test/schema_json_test.cc index 520a1eb70..f8b6114c1 100644 --- a/src/iceberg/test/schema_json_test.cc +++ b/src/iceberg/test/schema_json_test.cc @@ -24,6 +24,7 @@ #include #include +#include "iceberg/expression/literal.h" #include "iceberg/json_serde_internal.h" #include "iceberg/schema.h" #include "iceberg/schema_field.h" @@ -42,8 +43,8 @@ class TypeJsonTest : public ::testing::TestWithParam {}; TEST_P(TypeJsonTest, SingleTypeRoundTrip) { // To Json const auto& param = GetParam(); - auto json = ToJson(*param.type).dump(); - ASSERT_EQ(param.json, json); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(*param.type)); + ASSERT_EQ(param.json, json.dump()); // From Json auto type_result = TypeFromJson(nlohmann::json::parse(param.json)); @@ -190,8 +191,54 @@ TEST(SchemaJsonTest, RoundTrip) { ASSERT_EQ(field2.type()->type_id(), TypeId::kString); ASSERT_TRUE(field2.optional()); - auto dumped_json = ToJson(*schema).dump(); - ASSERT_EQ(dumped_json, json); + ICEBERG_UNWRAP_OR_FAIL(auto schema_json, ToJson(*schema)); + ASSERT_EQ(schema_json.dump(), json); +} + +TEST(SchemaJsonTest, FieldWithDefaultValuesRoundTrip) { + constexpr std::string_view json = + R"({"fields":[{"id":1,"initial-default":42,"name":"id","required":true,"type":"int","write-default":7},{"id":2,"initial-default":"n/a","name":"name","required":false,"type":"string"}],"schema-id":1,"type":"struct"})"; + + ICEBERG_UNWRAP_OR_FAIL(auto schema, SchemaFromJson(nlohmann::json::parse(json))); + ASSERT_EQ(schema->fields().size(), 2); + + const auto& field1 = schema->fields()[0]; + ASSERT_TRUE(field1.initial_default().has_value()); + ASSERT_EQ(field1.initial_default()->get(), Literal::Int(42)); + ASSERT_TRUE(field1.write_default().has_value()); + ASSERT_EQ(field1.write_default()->get(), Literal::Int(7)); + + const auto& field2 = schema->fields()[1]; + ASSERT_TRUE(field2.initial_default().has_value()); + ASSERT_EQ(field2.initial_default()->get(), Literal::String("n/a")); + ASSERT_FALSE(field2.write_default().has_value()); + + ICEBERG_UNWRAP_OR_FAIL(auto schema_json, ToJson(*schema)); + ASSERT_EQ(schema_json.dump(), json); +} + +TEST(SchemaJsonTest, FieldWithMismatchedDefaultValueFails) { + constexpr std::string_view json = + R"({"fields":[{"id":1,"initial-default":"oops","name":"id","required":true,"type":"int"}],"schema-id":1,"type":"struct"})"; + + auto result = SchemaFromJson(nlohmann::json::parse(json)); + ASSERT_FALSE(result.has_value()); +} + +TEST(SchemaJsonTest, NestedFieldWithDefaultValuesRoundTrip) { + constexpr std::string_view json = + R"({"fields":[{"id":1,"name":"person","required":true,"type":{"fields":[{"id":2,"initial-default":18,"name":"age","required":true,"type":"int","write-default":21}],"type":"struct"}}],"schema-id":1,"type":"struct"})"; + + ICEBERG_UNWRAP_OR_FAIL(auto schema, SchemaFromJson(nlohmann::json::parse(json))); + const auto& person = schema->fields()[0]; + const auto& nested = dynamic_cast(*person.type()).fields()[0]; + ASSERT_TRUE(nested.initial_default().has_value()); + ASSERT_EQ(nested.initial_default()->get(), Literal::Int(18)); + ASSERT_TRUE(nested.write_default().has_value()); + ASSERT_EQ(nested.write_default()->get(), Literal::Int(21)); + + ICEBERG_UNWRAP_OR_FAIL(auto schema_json, ToJson(*schema)); + ASSERT_EQ(schema_json.dump(), json); } TEST(SchemaJsonTest, UnknownFieldRoundTrip) { @@ -206,7 +253,8 @@ TEST(SchemaJsonTest, UnknownFieldRoundTrip) { ASSERT_EQ(field.name(), "mystery"); ASSERT_EQ(field.type()->type_id(), TypeId::kUnknown); ASSERT_TRUE(field.optional()); - ASSERT_EQ(ToJson(*schema).dump(), json); + ICEBERG_UNWRAP_OR_FAIL(auto schema_json, ToJson(*schema)); + ASSERT_EQ(schema_json.dump(), json); } TEST(SchemaJsonTest, NestedUnknownFieldsRoundTrip) { @@ -273,7 +321,8 @@ TEST(SchemaJsonTest, NestedUnknownFieldsRoundTrip) { ASSERT_EQ(properties->value().type()->type_id(), TypeId::kUnknown); ASSERT_TRUE(properties->value().optional()); - ASSERT_EQ(ToJson(*schema), parsed_json); + ICEBERG_UNWRAP_OR_FAIL(auto schema_json, ToJson(*schema)); + ASSERT_EQ(schema_json, parsed_json); } TEST(SchemaJsonTest, IdentifierFieldIds) { @@ -292,7 +341,8 @@ TEST(SchemaJsonTest, IdentifierFieldIds) { ASSERT_EQ(schema_with_identifers->schema_id(), 1); ASSERT_EQ(schema_with_identifers->IdentifierFieldIds().size(), 1); ASSERT_EQ(schema_with_identifers->IdentifierFieldIds()[0], 1); - ASSERT_EQ(ToJson(*schema_with_identifers), json_with_identifiers); + ICEBERG_UNWRAP_OR_FAIL(auto json_with_identifiers_out, ToJson(*schema_with_identifers)); + ASSERT_EQ(json_with_identifiers_out, json_with_identifiers); // Test schema without identifier-field-ids constexpr std::string_view json_without_identifiers_str = @@ -305,7 +355,9 @@ TEST(SchemaJsonTest, IdentifierFieldIds) { ICEBERG_UNWRAP_OR_FAIL(auto schema_without_identifiers, SchemaFromJson(json_without_identifiers)); ASSERT_TRUE(schema_without_identifiers->IdentifierFieldIds().empty()); - ASSERT_EQ(ToJson(*schema_without_identifiers), json_without_identifiers); + ICEBERG_UNWRAP_OR_FAIL(auto json_without_identifiers_out, + ToJson(*schema_without_identifiers)); + ASSERT_EQ(json_without_identifiers_out, json_without_identifiers); // Test schema with multiple identifier fields constexpr std::string_view json_multi_identifiers_str = @@ -321,7 +373,9 @@ TEST(SchemaJsonTest, IdentifierFieldIds) { ASSERT_EQ(schema_multi_identifiers->IdentifierFieldIds().size(), 2); ASSERT_EQ(schema_multi_identifiers->IdentifierFieldIds()[0], 1); ASSERT_EQ(schema_multi_identifiers->IdentifierFieldIds()[1], 2); - ASSERT_EQ(ToJson(*schema_multi_identifiers), json_multi_identifiers); + ICEBERG_UNWRAP_OR_FAIL(auto json_multi_identifiers_out, + ToJson(*schema_multi_identifiers)); + ASSERT_EQ(json_multi_identifiers_out, json_multi_identifiers); } } // namespace iceberg diff --git a/src/iceberg/test/schema_test.cc b/src/iceberg/test/schema_test.cc index db99eb02b..b78798499 100644 --- a/src/iceberg/test/schema_test.cc +++ b/src/iceberg/test/schema_test.cc @@ -25,6 +25,7 @@ #include #include +#include "iceberg/expression/literal.h" #include "iceberg/result.h" #include "iceberg/schema_field.h" #include "iceberg/table_metadata.h" @@ -133,6 +134,81 @@ TEST(SchemaTest, ValidateRejectsV3TypesBeforeFormatV3) { iceberg::IsOk()); } +TEST(SchemaTest, ValidateRejectsInitialDefaultBeforeFormatV3) { + iceberg::Schema schema({iceberg::SchemaField( + 1, "id", iceberg::int32(), false, /*doc=*/{}, + std::make_shared(iceberg::Literal::Int(42)))}); + + auto status = schema.Validate(2); + ASSERT_THAT(status, iceberg::IsError(iceberg::ErrorKind::kInvalidSchema)); + EXPECT_THAT(status, iceberg::HasErrorMessage("is not supported until v3")); + + EXPECT_THAT(schema.Validate(iceberg::TableMetadata::kSupportedTableFormatVersion), + iceberg::IsOk()); +} + +TEST(SchemaTest, ValidateDoesNotVersionGateWriteDefault) { + // A write-default does not reinterpret existing data, so it is not gated on + // format version: a write-default alone is accepted below v3. + iceberg::Schema schema({iceberg::SchemaField( + 1, "id", iceberg::int32(), false, /*doc=*/{}, /*initial_default=*/nullptr, + std::make_shared(iceberg::Literal::Int(7)))}); + + EXPECT_THAT(schema.Validate(2), iceberg::IsOk()); +} + +TEST(SchemaTest, ValidateRejectsMismatchedDefaultValue) { + iceberg::Schema schema({iceberg::SchemaField( + 1, "id", iceberg::int32(), false, /*doc=*/{}, /*initial_default=*/nullptr, + std::make_shared(iceberg::Literal::String("oops")))}); + + // The default literal is cast to the field type; an uncastable type surfaces the + // original cast error. + auto status = schema.Validate(iceberg::TableMetadata::kSupportedTableFormatVersion); + ASSERT_THAT(status, iceberg::IsError(iceberg::ErrorKind::kNotSupported)); + EXPECT_THAT(status, iceberg::HasErrorMessage("Cast from String")); +} + +TEST(SchemaTest, ValidateCastsDefaultToFieldType) { + // Matching Java, the default literal is cast to the field type rather than required to + // match exactly: an int default on a long field is accepted (int -> long). + iceberg::Schema widened({iceberg::SchemaField( + 1, "id", iceberg::int64(), false, /*doc=*/{}, + std::make_shared(iceberg::Literal::Int(34)))}); + EXPECT_THAT(widened.Validate(iceberg::TableMetadata::kSupportedTableFormatVersion), + iceberg::IsOk()); + + // A default outside the field type's range is rejected (CastTo returns an + // above-max/below-min sentinel). + iceberg::Schema out_of_range({iceberg::SchemaField( + 1, "id", iceberg::int32(), false, /*doc=*/{}, + std::make_shared(iceberg::Literal::Long(5'000'000'000)))}); + EXPECT_THAT(out_of_range.Validate(iceberg::TableMetadata::kSupportedTableFormatVersion), + iceberg::IsError(iceberg::ErrorKind::kInvalidSchema)); +} + +TEST(SchemaTest, ReassignIdsPreservesDefaultValues) { + // Reassigning field IDs rebuilds each SchemaField, so the rebuild must carry the + // default values over to the field with the new ID. + std::vector fields; + fields.push_back(iceberg::SchemaField( + 1, "id", iceberg::int32(), false, /*doc=*/{}, + std::make_shared(iceberg::Literal::Int(42)), + std::make_shared(iceberg::Literal::Int(7)))); + auto reassign_id = [](int32_t old_id) { return old_id + 1000; }; + + iceberg::Schema schema(std::move(fields), iceberg::Schema::kInitialSchemaId, + reassign_id); + + ASSERT_EQ(schema.fields().size(), 1); + const iceberg::SchemaField& field = schema.fields()[0]; + EXPECT_EQ(field.field_id(), 1001); + ASSERT_TRUE(field.initial_default().has_value()); + EXPECT_EQ(field.initial_default()->get(), iceberg::Literal::Int(42)); + ASSERT_TRUE(field.write_default().has_value()); + EXPECT_EQ(field.write_default()->get(), iceberg::Literal::Int(7)); +} + TEST(SchemaTest, ValidateRejectsInvalidUnknownFields) { iceberg::Schema required_unknown_schema( {iceberg::SchemaField(1, "mystery", iceberg::unknown(), false)}); diff --git a/src/iceberg/test/schema_util_test.cc b/src/iceberg/test/schema_util_test.cc index ee075006f..9a3eff887 100644 --- a/src/iceberg/test/schema_util_test.cc +++ b/src/iceberg/test/schema_util_test.cc @@ -24,6 +24,7 @@ #include #include +#include "iceberg/expression/literal.h" #include "iceberg/metadata_columns.h" #include "iceberg/schema.h" #include "iceberg/schema_field.h" @@ -179,6 +180,58 @@ TEST(SchemaUtilTest, ProjectMissingRequiredField) { ASSERT_THAT(projection_result, HasErrorMessage("Missing required field")); } +TEST(SchemaUtilTest, ProjectMissingRequiredFieldWithInitialDefault) { + Schema source_schema = CreateFlatSchema(); + Schema expected_schema({ + SchemaField::MakeRequired(/*field_id=*/1, "id", iceberg::int64()), + SchemaField(/*field_id=*/10, "extra", iceberg::int32(), /*optional=*/false, + /*doc=*/{}, std::make_shared(Literal::Int(42))), + }); + + auto projection_result = + Project(expected_schema, source_schema, /*prune_source=*/false); + ASSERT_THAT(projection_result, IsOk()); + + const auto& projection = *projection_result; + ASSERT_EQ(projection.fields.size(), 2); + AssertProjectedField(projection.fields[0], 0); + ASSERT_EQ(projection.fields[1].kind, FieldProjection::Kind::kDefault); + ASSERT_EQ(std::get(projection.fields[1].from), Literal::Int(42)); +} + +TEST(SchemaUtilTest, ProjectMissingOptionalFieldWithInitialDefault) { + // An optional field with an initial-default reads the default, not null. + Schema source_schema = CreateFlatSchema(); + Schema expected_schema({ + SchemaField::MakeRequired(/*field_id=*/1, "id", iceberg::int64()), + SchemaField(/*field_id=*/10, "extra", iceberg::string(), /*optional=*/true, + /*doc=*/{}, std::make_shared(Literal::String("n/a"))), + }); + + auto projection_result = + Project(expected_schema, source_schema, /*prune_source=*/false); + ASSERT_THAT(projection_result, IsOk()); + + const auto& projection = *projection_result; + ASSERT_EQ(projection.fields.size(), 2); + ASSERT_EQ(projection.fields[1].kind, FieldProjection::Kind::kDefault); + ASSERT_EQ(std::get(projection.fields[1].from), Literal::String("n/a")); +} + +TEST(SchemaUtilTest, ProjectPresentFieldIgnoresInitialDefault) { + // initial-default only applies when the field is missing from the data file. + Schema source_schema = CreateFlatSchema(); + Schema expected_schema({ + SchemaField(/*field_id=*/1, "id", iceberg::int64(), /*optional=*/false, + /*doc=*/{}, std::make_shared(Literal::Long(-1))), + }); + + auto projection_result = + Project(expected_schema, source_schema, /*prune_source=*/false); + ASSERT_THAT(projection_result, IsOk()); + AssertProjectedField(projection_result->fields[0], 0); +} + TEST(SchemaUtilTest, ProjectMetadataColumn) { Schema source_schema = CreateFlatSchema(); Schema expected_schema({ diff --git a/src/iceberg/test/temporal_util_test.cc b/src/iceberg/test/temporal_util_test.cc index 0d4426b0b..791d02c56 100644 --- a/src/iceberg/test/temporal_util_test.cc +++ b/src/iceberg/test/temporal_util_test.cc @@ -53,6 +53,31 @@ TEST(TemporalUtilTest, ParseTimestampNsChecksInt64Bounds) { IsError(ErrorKind::kInvalidArgument)); } +TEST(TemporalUtilTest, IsUtcOffset) { + // UTC offsets: "Z", "+00:00" and "-00:00". + ICEBERG_UNWRAP_OR_FAIL(auto z, TemporalUtils::IsUtcOffset("2024-06-27T00:00:00Z")); + EXPECT_TRUE(z); + ICEBERG_UNWRAP_OR_FAIL(auto plus_zero, + TemporalUtils::IsUtcOffset("2024-06-27T00:00:00+00:00")); + EXPECT_TRUE(plus_zero); + ICEBERG_UNWRAP_OR_FAIL(auto minus_zero, + TemporalUtils::IsUtcOffset("2024-06-27T00:00:00-00:00")); + EXPECT_TRUE(minus_zero); + + // Non-UTC offsets. + ICEBERG_UNWRAP_OR_FAIL(auto plus_five, + TemporalUtils::IsUtcOffset("2024-06-27T05:00:00+05:00")); + EXPECT_FALSE(plus_five); + ICEBERG_UNWRAP_OR_FAIL(auto minus_eight, + TemporalUtils::IsUtcOffset("2024-06-27T00:00:00-08:00")); + EXPECT_FALSE(minus_eight); + + // A missing or unparseable timezone suffix is an error. + EXPECT_THAT(TemporalUtils::IsUtcOffset("2024-06-27T00:00:00"), + IsError(ErrorKind::kInvalidArgument)); + EXPECT_THAT(TemporalUtils::IsUtcOffset(""), IsError(ErrorKind::kInvalidArgument)); +} + TEST(TemporalUtilTest, ParseTimestampNsRejectsMoreThanNineFractionalDigits) { EXPECT_THAT(TemporalUtils::ParseTimestampNs("2026-01-01T00:00:01.0000010011"), IsError(ErrorKind::kInvalidArgument)); diff --git a/src/iceberg/util/temporal_util.cc b/src/iceberg/util/temporal_util.cc index e00ee7cf5..9e6a93cfd 100644 --- a/src/iceberg/util/temporal_util.cc +++ b/src/iceberg/util/temporal_util.cc @@ -444,6 +444,11 @@ Result TemporalUtils::ParseTimestampNsWithZone(std::string_view str) { /*units_per_micro=*/internal::kNanosPerMicro); } +Result TemporalUtils::IsUtcOffset(std::string_view str) { + ICEBERG_ASSIGN_OR_RAISE(auto timestamp_with_offset, ParseTimestampWithZoneSuffix(str)); + return timestamp_with_offset.second == 0; +} + #define DISPATCH_EXTRACT_YEAR(type_id) \ case type_id: \ return ExtractYearImpl(literal); diff --git a/src/iceberg/util/temporal_util.h b/src/iceberg/util/temporal_util.h index 2121f565d..cc7a16319 100644 --- a/src/iceberg/util/temporal_util.h +++ b/src/iceberg/util/temporal_util.h @@ -125,6 +125,18 @@ class ICEBERG_EXPORT TemporalUtils { /// \return The number of nanoseconds since epoch (UTC), or an error. static Result ParseTimestampNsWithZone(std::string_view str); + /// \brief Reports whether a timestamp-with-zone string uses a UTC offset. + /// + /// The ParseTimestamp*WithZone parsers accept any offset and silently normalize it + /// to UTC. The spec's JSON single-value form for `timestamptz` / `timestamptz_ns` + /// default values only permits UTC ("Z" or "+00:00"), so callers that must enforce + /// that rule check the offset here before parsing. + /// + /// \param str The timestamp-with-zone string to inspect. + /// \return true if the offset is UTC, false if it is a non-UTC offset, or an error + /// if the timezone suffix cannot be parsed. + static Result IsUtcOffset(std::string_view str); + /// \brief Extract a date or timestamp year, as years from 1970 static Result ExtractYear(const Literal& literal);