From f663e0e87c2982478e47d24399c252ccc364ffd0 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Mon, 15 Jun 2026 10:34:02 -0700 Subject: [PATCH 01/12] feat(schema): represent, serialize and validate v3 column default values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit First of a multi-part split of column default value support (#730) — the schema foundation the read and evolution paths build on. Purely additive; no read/write behavior change on its own. - SchemaField carries `initial-default` / `write-default` (immutable std::shared_ptr) with copy-preserving WithInitialDefault / WithWriteDefault modifiers; getters return optional. - JSON serde reads/writes `initial-default` / `write-default` via the existing single-value serialization. - Schema::Validate rejects default values below format v3 and validates they are non-null primitive literals matching the field type. - Generic schema projection maps a column missing from a data file with an initial-default to FieldProjection::Kind::kDefault. Read-path application (Parquet/Avro) and schema evolution follow in separate PRs. See #731 for the full end-to-end proof-of-concept. --- src/iceberg/json_serde.cc | 32 ++++- src/iceberg/schema.cc | 26 +++- src/iceberg/schema_field.cc | 79 ++++++++++- src/iceberg/schema_field.h | 41 +++++- src/iceberg/schema_util.cc | 6 +- .../test/resources/TableMetadataV3Valid.json | 123 ++++++++++++++++++ src/iceberg/test/schema_json_test.cc | 45 +++++++ src/iceberg/test/schema_test.cc | 56 ++++++++ src/iceberg/test/schema_util_test.cc | 53 ++++++++ 9 files changed, 451 insertions(+), 10 deletions(-) create mode 100644 src/iceberg/test/resources/TableMetadataV3Valid.json diff --git a/src/iceberg/json_serde.cc b/src/iceberg/json_serde.cc index c72b7da57..4552db883 100644 --- a/src/iceberg/json_serde.cc +++ b/src/iceberg/json_serde.cc @@ -27,6 +27,8 @@ #include #include "iceberg/constants.h" +#include "iceberg/expression/json_serde_internal.h" +#include "iceberg/expression/literal.h" #include "iceberg/json_serde_internal.h" #include "iceberg/name_mapping.h" #include "iceberg/partition_field.h" @@ -298,6 +300,15 @@ nlohmann::json ToJson(const SchemaField& field) { if (!field.doc().empty()) { json[kDoc] = field.doc(); } + // Defaults are validated to be primitive literals matching the field type, so + // single-value serialization cannot fail here. + if (field.initial_default().has_value()) { + ICEBERG_ASSIGN_OR_THROW(json[kInitialDefault], + ToJson(field.initial_default()->get())); + } + if (field.write_default().has_value()) { + ICEBERG_ASSIGN_OR_THROW(json[kWriteDefault], ToJson(field.write_default()->get())); + } return json; } @@ -310,7 +321,6 @@ nlohmann::json ToJson(const Type& type) { nlohmann::json fields_json = nlohmann::json::array(); for (const auto& field : struct_type.fields()) { fields_json.push_back(ToJson(field)); - // TODO(gangwu): add default values } json[kFields] = fields_json; return json; @@ -552,9 +562,27 @@ Result> FieldFromJson(const nlohmann::json& json) { ICEBERG_ASSIGN_OR_RAISE(auto name, GetJsonValue(json, kName)); ICEBERG_ASSIGN_OR_RAISE(auto required, GetJsonValue(json, kRequired)); ICEBERG_ASSIGN_OR_RAISE(auto doc, GetJsonValueOrDefault(json, kDoc)); + ICEBERG_ASSIGN_OR_RAISE(std::optional initial_default_json, + GetJsonValueOptional(json, kInitialDefault)); + ICEBERG_ASSIGN_OR_RAISE(std::optional write_default_json, + GetJsonValueOptional(json, kWriteDefault)); + + std::shared_ptr initial_default; + if (initial_default_json.has_value()) { + ICEBERG_ASSIGN_OR_RAISE(Literal literal, + LiteralFromJson(*initial_default_json, type.get())); + initial_default = std::make_shared(std::move(literal)); + } + std::shared_ptr write_default; + if (write_default_json.has_value()) { + ICEBERG_ASSIGN_OR_RAISE(Literal literal, + LiteralFromJson(*write_default_json, type.get())); + write_default = std::make_shared(std::move(literal)); + } return std::make_unique(field_id, std::move(name), std::move(type), - !required, doc); + !required, doc, std::move(initial_default), + std::move(write_default)); } Result> SchemaFromJson(const nlohmann::json& json) { diff --git a/src/iceberg/schema.cc b/src/iceberg/schema.cc index fcac43c78..d6347251c 100644 --- a/src/iceberg/schema.cc +++ b/src/iceberg/schema.cc @@ -116,9 +116,15 @@ std::shared_ptr ReassignTypeIds(const std::shared_ptr& type, SchemaField ReassignField(const SchemaField& field, int32_t new_id, const Schema::GetId& get_id, Schema::IdMap& ids_to_reassigned, Schema::IdMap& ids_to_original) { - return {new_id, std::string(field.name()), + // Reassigning IDs only rewrites the field ID and nested type IDs; share the field's + // (immutable) default values rather than copying them. + return {new_id, + std::string(field.name()), ReassignTypeIds(field.type(), get_id, ids_to_reassigned, ids_to_original), - field.optional(), std::string(field.doc())}; + field.optional(), + std::string(field.doc()), + field.initial_default_ptr(), + field.write_default_ptr()}; } std::vector ReassignIds(std::vector fields, @@ -447,7 +453,21 @@ Status Schema::Validate(int32_t format_version) const { } } - // TODO(GuoTao.yu): Check default values when they are supported + // Only the initial-default is gated on format version: it changes how existing + // data files are read (rows written before the column existed materialize this + // value), so it requires the v3 reader contract. A write-default only affects + // values written going forward and does not reinterpret existing data. + if (field.initial_default().has_value() && + format_version < TableMetadata::kMinFormatVersionDefaultValues) { + return InvalidSchema( + "Invalid initial default for {}: non-null default ({}) is not supported " + "until v{}", + field.name(), field.initial_default()->get(), + TableMetadata::kMinFormatVersionDefaultValues); + } + if (field.initial_default().has_value() || field.write_default().has_value()) { + ICEBERG_RETURN_UNEXPECTED(field.Validate()); + } } return {}; diff --git a/src/iceberg/schema_field.cc b/src/iceberg/schema_field.cc index 206915ec2..8c3fb9be6 100644 --- a/src/iceberg/schema_field.cc +++ b/src/iceberg/schema_field.cc @@ -21,19 +21,26 @@ #include #include +#include +#include "iceberg/expression/literal.h" #include "iceberg/type.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep +#include "iceberg/util/macros.h" namespace iceberg { SchemaField::SchemaField(int32_t field_id, std::string_view name, - std::shared_ptr type, bool optional, std::string_view doc) + std::shared_ptr type, bool optional, std::string_view doc, + std::shared_ptr initial_default, + std::shared_ptr write_default) : field_id_(field_id), name_(name), type_(std::move(type)), optional_(optional), - doc_(doc) {} + doc_(doc), + initial_default_(std::move(initial_default)), + write_default_(std::move(write_default)) {} SchemaField SchemaField::MakeOptional(int32_t field_id, std::string_view name, std::shared_ptr type, std::string_view doc) { @@ -55,6 +62,51 @@ bool SchemaField::optional() const { return optional_; } std::string_view SchemaField::doc() const { return doc_; } +std::optional> SchemaField::initial_default() + const { + if (initial_default_ == nullptr) { + return std::nullopt; + } + return std::cref(*initial_default_); +} + +std::optional> SchemaField::write_default() const { + if (write_default_ == nullptr) { + return std::nullopt; + } + return std::cref(*write_default_); +} + +const std::shared_ptr& SchemaField::initial_default_ptr() const { + return initial_default_; +} + +const std::shared_ptr& SchemaField::write_default_ptr() const { + return write_default_; +} + +namespace { + +Status ValidateDefault(const SchemaField& field, const Literal& value, + std::string_view kind) { + if (value.IsNull() || value.IsAboveMax() || value.IsBelowMin()) { + return InvalidSchema("Invalid {} value for {}: must be a non-null value", kind, + field.name()); + } + if (field.type() == nullptr || !field.type()->is_primitive()) { + return InvalidSchema( + "Invalid {} value for {}: default values are only supported for primitive types", + kind, field.name()); + } + if (*value.type() != *field.type()) { + return InvalidSchema("{} of field {} has type {} but expected {}", kind, field.name(), + *value.type(), *field.type()); + } + return {}; +} + +} // namespace + Status SchemaField::Validate() const { if (name_.empty()) [[unlikely]] { return InvalidSchema("SchemaField cannot have empty name"); @@ -62,6 +114,13 @@ Status SchemaField::Validate() const { if (type_ == nullptr) [[unlikely]] { return InvalidSchema("SchemaField cannot have null type"); } + if (initial_default_ != nullptr) { + ICEBERG_RETURN_UNEXPECTED( + ValidateDefault(*this, *initial_default_, "initial-default")); + } + if (write_default_ != nullptr) { + ICEBERG_RETURN_UNEXPECTED(ValidateDefault(*this, *write_default_, "write-default")); + } return {}; } @@ -72,9 +131,23 @@ std::string SchemaField::ToString() const { return result; } +namespace { + +bool DefaultEquals(const std::shared_ptr& lhs, + const std::shared_ptr& rhs) { + if (lhs == nullptr || rhs == nullptr) { + return lhs == rhs; + } + return *lhs == *rhs; +} + +} // namespace + bool SchemaField::Equals(const SchemaField& other) const { return field_id_ == other.field_id_ && name_ == other.name_ && *type_ == *other.type_ && - optional_ == other.optional_; + optional_ == other.optional_ && + DefaultEquals(initial_default_, other.initial_default_) && + DefaultEquals(write_default_, other.write_default_); } } // namespace iceberg diff --git a/src/iceberg/schema_field.h b/src/iceberg/schema_field.h index fd20226a5..e90425ca7 100644 --- a/src/iceberg/schema_field.h +++ b/src/iceberg/schema_field.h @@ -24,7 +24,9 @@ /// type (e.g. a struct). #include +#include #include +#include #include #include @@ -46,8 +48,14 @@ class ICEBERG_EXPORT SchemaField : public iceberg::util::Formattable { /// \param[in] type The field type. /// \param[in] optional Whether values of this field are required or nullable. /// \param[in] doc Optional documentation string for the field. + /// \param[in] initial_default The v3 `initial-default` value, or null if absent. The + /// field shares ownership of the (immutable) value. + /// \param[in] write_default The v3 `write-default` value, or null if absent. The field + /// shares ownership of the (immutable) value. SchemaField(int32_t field_id, std::string_view name, std::shared_ptr type, - bool optional, std::string_view doc = {}); + bool optional, std::string_view doc = {}, + std::shared_ptr initial_default = nullptr, + std::shared_ptr write_default = nullptr); /// \brief Construct an optional (nullable) field. static SchemaField MakeOptional(int32_t field_id, std::string_view name, @@ -71,6 +79,32 @@ class ICEBERG_EXPORT SchemaField : public iceberg::util::Formattable { /// \brief Get the field documentation. std::string_view doc() const; + /// \brief Get the default value for this field used when reading rows written + /// before the field existed (v3 `initial-default`). Empty if absent. + /// + /// The returned reference is a non-owning view into a value owned by this field; + /// it remains valid for the lifetime of this SchemaField. + [[nodiscard]] std::optional> initial_default() + const; + + /// \brief Get the default value for this field used when a writer does not + /// supply a value (v3 `write-default`). Empty if absent. + /// + /// The returned reference is a non-owning view into a value owned by this field; + /// it remains valid for the lifetime of this SchemaField. + [[nodiscard]] std::optional> write_default() + const; + + /// \brief Get the shared owning pointer to the `initial-default` value, or null if + /// absent. Prefer initial_default() for reading; this exists so a rebuilt field can + /// share the (immutable) value rather than copy it. + [[nodiscard]] const std::shared_ptr& initial_default_ptr() const; + + /// \brief Get the shared owning pointer to the `write-default` value, or null if + /// absent. Prefer write_default() for reading; this exists so a rebuilt field can + /// share the (immutable) value rather than copy it. + [[nodiscard]] const std::shared_ptr& write_default_ptr() const; + [[nodiscard]] std::string ToString() const override; Status Validate() const; @@ -100,6 +134,11 @@ class ICEBERG_EXPORT SchemaField : public iceberg::util::Formattable { std::shared_ptr type_; bool optional_; std::string doc_; + // Default values are owned by this field and never mutated after being set; copies + // of the field share the same payload (reference-counted) instead of deep-copying, + // like `type_` above. Sharing is unobservable because the payload is immutable. + std::shared_ptr initial_default_; + std::shared_ptr write_default_; }; } // namespace iceberg diff --git a/src/iceberg/schema_util.cc b/src/iceberg/schema_util.cc index 4ff678fc6..5ff828258 100644 --- a/src/iceberg/schema_util.cc +++ b/src/iceberg/schema_util.cc @@ -172,10 +172,14 @@ Result ProjectNested(const Type& expected_type, const Type& sou iter->second.local_index, prune_source)); } else if (MetadataColumns::IsMetadataColumn(field_id)) { child_projection.kind = FieldProjection::Kind::kMetadata; + } else if (expected_field.initial_default().has_value()) { + // Rows written before the field existed assume its `initial-default` value. + child_projection.kind = FieldProjection::Kind::kDefault; + child_projection.from = expected_field.initial_default()->get(); } else if (expected_field.optional()) { child_projection.kind = FieldProjection::Kind::kNull; } else { - // TODO(gangwu): support default value for v3 and constant value + // TODO(gangwu): support constant value return InvalidSchema("Missing required field: {}", expected_field.ToString()); } result.children.emplace_back(std::move(child_projection)); diff --git a/src/iceberg/test/resources/TableMetadataV3Valid.json b/src/iceberg/test/resources/TableMetadataV3Valid.json new file mode 100644 index 000000000..712d4b5bd --- /dev/null +++ b/src/iceberg/test/resources/TableMetadataV3Valid.json @@ -0,0 +1,123 @@ +{ + "format-version": 3, + "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", + "location": "s3://bucket/test/location", + "last-sequence-number": 34, + "next-row-id": 0, + "last-updated-ms": 1602638573590, + "last-column-id": 3, + "current-schema-id": 1, + "schemas": [ + { + "type": "struct", + "schema-id": 0, + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + } + ] + }, + { + "type": "struct", + "schema-id": 1, + "identifier-field-ids": [ + 1, + 2 + ], + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + }, + { + "id": 2, + "name": "y", + "required": true, + "type": "long", + "doc": "comment" + }, + { + "id": 3, + "name": "z", + "required": true, + "type": "long" + } + ] + } + ], + "default-spec-id": 0, + "partition-specs": [ + { + "spec-id": 0, + "fields": [ + { + "name": "x", + "transform": "identity", + "source-id": 1, + "field-id": 1000 + } + ] + } + ], + "last-partition-id": 1000, + "default-sort-order-id": 3, + "sort-orders": [ + { + "order-id": 3, + "fields": [ + { + "transform": "identity", + "source-id": 2, + "direction": "asc", + "null-order": "nulls-first" + }, + { + "transform": "bucket[4]", + "source-id": 3, + "direction": "desc", + "null-order": "nulls-last" + } + ] + } + ], + "properties": {}, + "current-snapshot-id": 3055729675574597004, + "snapshots": [ + { + "snapshot-id": 3051729675574597004, + "timestamp-ms": 1515100955770, + "sequence-number": 0, + "summary": { + "operation": "append" + }, + "manifest-list": "s3://a/b/1.avro" + }, + { + "snapshot-id": 3055729675574597004, + "parent-snapshot-id": 3051729675574597004, + "timestamp-ms": 1555100955770, + "sequence-number": 1, + "summary": { + "operation": "append" + }, + "manifest-list": "s3://a/b/2.avro", + "schema-id": 1 + } + ], + "snapshot-log": [ + { + "snapshot-id": 3051729675574597004, + "timestamp-ms": 1515100955770 + }, + { + "snapshot-id": 3055729675574597004, + "timestamp-ms": 1555100955770 + } + ], + "metadata-log": [] +} diff --git a/src/iceberg/test/schema_json_test.cc b/src/iceberg/test/schema_json_test.cc index 08275a45c..71fc474f4 100644 --- a/src/iceberg/test/schema_json_test.cc +++ b/src/iceberg/test/schema_json_test.cc @@ -24,6 +24,7 @@ #include #include +#include "iceberg/expression/literal.h" #include "iceberg/json_serde_internal.h" #include "iceberg/schema.h" #include "iceberg/schema_field.h" @@ -137,6 +138,50 @@ TEST(SchemaJsonTest, RoundTrip) { ASSERT_EQ(dumped_json, json); } +TEST(SchemaJsonTest, FieldWithDefaultValuesRoundTrip) { + constexpr std::string_view json = + R"({"fields":[{"id":1,"initial-default":42,"name":"id","required":true,"type":"int","write-default":7},{"id":2,"initial-default":"n/a","name":"name","required":false,"type":"string"}],"schema-id":1,"type":"struct"})"; + + ICEBERG_UNWRAP_OR_FAIL(auto schema, SchemaFromJson(nlohmann::json::parse(json))); + ASSERT_EQ(schema->fields().size(), 2); + + const auto& field1 = schema->fields()[0]; + ASSERT_TRUE(field1.initial_default().has_value()); + ASSERT_EQ(field1.initial_default()->get(), Literal::Int(42)); + ASSERT_TRUE(field1.write_default().has_value()); + ASSERT_EQ(field1.write_default()->get(), Literal::Int(7)); + + const auto& field2 = schema->fields()[1]; + ASSERT_TRUE(field2.initial_default().has_value()); + ASSERT_EQ(field2.initial_default()->get(), Literal::String("n/a")); + ASSERT_FALSE(field2.write_default().has_value()); + + ASSERT_EQ(ToJson(*schema).dump(), json); +} + +TEST(SchemaJsonTest, FieldWithMismatchedDefaultValueFails) { + constexpr std::string_view json = + R"({"fields":[{"id":1,"initial-default":"oops","name":"id","required":true,"type":"int"}],"schema-id":1,"type":"struct"})"; + + auto result = SchemaFromJson(nlohmann::json::parse(json)); + ASSERT_FALSE(result.has_value()); +} + +TEST(SchemaJsonTest, NestedFieldWithDefaultValuesRoundTrip) { + constexpr std::string_view json = + R"({"fields":[{"id":1,"name":"person","required":true,"type":{"fields":[{"id":2,"initial-default":18,"name":"age","required":true,"type":"int","write-default":21}],"type":"struct"}}],"schema-id":1,"type":"struct"})"; + + ICEBERG_UNWRAP_OR_FAIL(auto schema, SchemaFromJson(nlohmann::json::parse(json))); + const auto& person = schema->fields()[0]; + const auto& nested = dynamic_cast(*person.type()).fields()[0]; + ASSERT_TRUE(nested.initial_default().has_value()); + ASSERT_EQ(nested.initial_default()->get(), Literal::Int(18)); + ASSERT_TRUE(nested.write_default().has_value()); + ASSERT_EQ(nested.write_default()->get(), Literal::Int(21)); + + ASSERT_EQ(ToJson(*schema).dump(), json); +} + TEST(SchemaJsonTest, UnknownFieldRoundTrip) { constexpr std::string_view json = R"({"fields":[{"id":1,"name":"mystery","required":false,"type":"unknown"}],"schema-id":1,"type":"struct"})"; diff --git a/src/iceberg/test/schema_test.cc b/src/iceberg/test/schema_test.cc index 8f1b20035..28f7eb9a0 100644 --- a/src/iceberg/test/schema_test.cc +++ b/src/iceberg/test/schema_test.cc @@ -25,6 +25,7 @@ #include #include +#include "iceberg/expression/literal.h" #include "iceberg/result.h" #include "iceberg/schema_field.h" #include "iceberg/table_metadata.h" @@ -133,6 +134,61 @@ TEST(SchemaTest, ValidateRejectsV3TypesBeforeFormatV3) { iceberg::IsOk()); } +TEST(SchemaTest, ValidateRejectsInitialDefaultBeforeFormatV3) { + iceberg::Schema schema({iceberg::SchemaField( + 1, "id", iceberg::int32(), false, /*doc=*/{}, + std::make_shared(iceberg::Literal::Int(42)))}); + + auto status = schema.Validate(2); + ASSERT_THAT(status, iceberg::IsError(iceberg::ErrorKind::kInvalidSchema)); + EXPECT_THAT(status, iceberg::HasErrorMessage("is not supported until v3")); + + EXPECT_THAT(schema.Validate(iceberg::TableMetadata::kSupportedTableFormatVersion), + iceberg::IsOk()); +} + +TEST(SchemaTest, ValidateDoesNotVersionGateWriteDefault) { + // A write-default does not reinterpret existing data, so it is not gated on + // format version: a write-default alone is accepted below v3. + iceberg::Schema schema({iceberg::SchemaField( + 1, "id", iceberg::int32(), false, /*doc=*/{}, /*initial_default=*/nullptr, + std::make_shared(iceberg::Literal::Int(7)))}); + + EXPECT_THAT(schema.Validate(2), iceberg::IsOk()); +} + +TEST(SchemaTest, ValidateRejectsMismatchedDefaultValue) { + iceberg::Schema schema({iceberg::SchemaField( + 1, "id", iceberg::int32(), false, /*doc=*/{}, /*initial_default=*/nullptr, + std::make_shared(iceberg::Literal::String("oops")))}); + + auto status = schema.Validate(iceberg::TableMetadata::kSupportedTableFormatVersion); + ASSERT_THAT(status, iceberg::IsError(iceberg::ErrorKind::kInvalidSchema)); + EXPECT_THAT(status, iceberg::HasErrorMessage("write-default")); +} + +TEST(SchemaTest, ReassignIdsPreservesDefaultValues) { + // Reassigning field IDs rebuilds each SchemaField, so the rebuild must carry the + // default values over to the field with the new ID. + std::vector fields; + fields.push_back(iceberg::SchemaField( + 1, "id", iceberg::int32(), false, /*doc=*/{}, + std::make_shared(iceberg::Literal::Int(42)), + std::make_shared(iceberg::Literal::Int(7)))); + auto reassign_id = [](int32_t old_id) { return old_id + 1000; }; + + iceberg::Schema schema(std::move(fields), iceberg::Schema::kInitialSchemaId, + reassign_id); + + ASSERT_EQ(schema.fields().size(), 1); + const iceberg::SchemaField& field = schema.fields()[0]; + EXPECT_EQ(field.field_id(), 1001); + ASSERT_TRUE(field.initial_default().has_value()); + EXPECT_EQ(field.initial_default()->get(), iceberg::Literal::Int(42)); + ASSERT_TRUE(field.write_default().has_value()); + EXPECT_EQ(field.write_default()->get(), iceberg::Literal::Int(7)); +} + TEST(SchemaTest, ValidateRejectsInvalidUnknownFields) { iceberg::Schema required_unknown_schema( {iceberg::SchemaField(1, "mystery", iceberg::unknown(), false)}); diff --git a/src/iceberg/test/schema_util_test.cc b/src/iceberg/test/schema_util_test.cc index ee075006f..9a3eff887 100644 --- a/src/iceberg/test/schema_util_test.cc +++ b/src/iceberg/test/schema_util_test.cc @@ -24,6 +24,7 @@ #include #include +#include "iceberg/expression/literal.h" #include "iceberg/metadata_columns.h" #include "iceberg/schema.h" #include "iceberg/schema_field.h" @@ -179,6 +180,58 @@ TEST(SchemaUtilTest, ProjectMissingRequiredField) { ASSERT_THAT(projection_result, HasErrorMessage("Missing required field")); } +TEST(SchemaUtilTest, ProjectMissingRequiredFieldWithInitialDefault) { + Schema source_schema = CreateFlatSchema(); + Schema expected_schema({ + SchemaField::MakeRequired(/*field_id=*/1, "id", iceberg::int64()), + SchemaField(/*field_id=*/10, "extra", iceberg::int32(), /*optional=*/false, + /*doc=*/{}, std::make_shared(Literal::Int(42))), + }); + + auto projection_result = + Project(expected_schema, source_schema, /*prune_source=*/false); + ASSERT_THAT(projection_result, IsOk()); + + const auto& projection = *projection_result; + ASSERT_EQ(projection.fields.size(), 2); + AssertProjectedField(projection.fields[0], 0); + ASSERT_EQ(projection.fields[1].kind, FieldProjection::Kind::kDefault); + ASSERT_EQ(std::get(projection.fields[1].from), Literal::Int(42)); +} + +TEST(SchemaUtilTest, ProjectMissingOptionalFieldWithInitialDefault) { + // An optional field with an initial-default reads the default, not null. + Schema source_schema = CreateFlatSchema(); + Schema expected_schema({ + SchemaField::MakeRequired(/*field_id=*/1, "id", iceberg::int64()), + SchemaField(/*field_id=*/10, "extra", iceberg::string(), /*optional=*/true, + /*doc=*/{}, std::make_shared(Literal::String("n/a"))), + }); + + auto projection_result = + Project(expected_schema, source_schema, /*prune_source=*/false); + ASSERT_THAT(projection_result, IsOk()); + + const auto& projection = *projection_result; + ASSERT_EQ(projection.fields.size(), 2); + ASSERT_EQ(projection.fields[1].kind, FieldProjection::Kind::kDefault); + ASSERT_EQ(std::get(projection.fields[1].from), Literal::String("n/a")); +} + +TEST(SchemaUtilTest, ProjectPresentFieldIgnoresInitialDefault) { + // initial-default only applies when the field is missing from the data file. + Schema source_schema = CreateFlatSchema(); + Schema expected_schema({ + SchemaField(/*field_id=*/1, "id", iceberg::int64(), /*optional=*/false, + /*doc=*/{}, std::make_shared(Literal::Long(-1))), + }); + + auto projection_result = + Project(expected_schema, source_schema, /*prune_source=*/false); + ASSERT_THAT(projection_result, IsOk()); + AssertProjectedField(projection_result->fields[0], 0); +} + TEST(SchemaUtilTest, ProjectMetadataColumn) { Schema source_schema = CreateFlatSchema(); Schema expected_schema({ From 511d9c8e619494e57955ad06325b058527c1b2de Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Sat, 20 Jun 2026 10:20:11 -0700 Subject: [PATCH 02/12] test: remove unused TableMetadataV3Valid.json resource This file was added by this PR but is referenced by no test (table-metadata resources are loaded by explicit name via ReadTableMetadataFromResource) and carries no column defaults, so it is dead. Remove it. --- .../test/resources/TableMetadataV3Valid.json | 123 ------------------ 1 file changed, 123 deletions(-) delete mode 100644 src/iceberg/test/resources/TableMetadataV3Valid.json diff --git a/src/iceberg/test/resources/TableMetadataV3Valid.json b/src/iceberg/test/resources/TableMetadataV3Valid.json deleted file mode 100644 index 712d4b5bd..000000000 --- a/src/iceberg/test/resources/TableMetadataV3Valid.json +++ /dev/null @@ -1,123 +0,0 @@ -{ - "format-version": 3, - "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", - "location": "s3://bucket/test/location", - "last-sequence-number": 34, - "next-row-id": 0, - "last-updated-ms": 1602638573590, - "last-column-id": 3, - "current-schema-id": 1, - "schemas": [ - { - "type": "struct", - "schema-id": 0, - "fields": [ - { - "id": 1, - "name": "x", - "required": true, - "type": "long" - } - ] - }, - { - "type": "struct", - "schema-id": 1, - "identifier-field-ids": [ - 1, - 2 - ], - "fields": [ - { - "id": 1, - "name": "x", - "required": true, - "type": "long" - }, - { - "id": 2, - "name": "y", - "required": true, - "type": "long", - "doc": "comment" - }, - { - "id": 3, - "name": "z", - "required": true, - "type": "long" - } - ] - } - ], - "default-spec-id": 0, - "partition-specs": [ - { - "spec-id": 0, - "fields": [ - { - "name": "x", - "transform": "identity", - "source-id": 1, - "field-id": 1000 - } - ] - } - ], - "last-partition-id": 1000, - "default-sort-order-id": 3, - "sort-orders": [ - { - "order-id": 3, - "fields": [ - { - "transform": "identity", - "source-id": 2, - "direction": "asc", - "null-order": "nulls-first" - }, - { - "transform": "bucket[4]", - "source-id": 3, - "direction": "desc", - "null-order": "nulls-last" - } - ] - } - ], - "properties": {}, - "current-snapshot-id": 3055729675574597004, - "snapshots": [ - { - "snapshot-id": 3051729675574597004, - "timestamp-ms": 1515100955770, - "sequence-number": 0, - "summary": { - "operation": "append" - }, - "manifest-list": "s3://a/b/1.avro" - }, - { - "snapshot-id": 3055729675574597004, - "parent-snapshot-id": 3051729675574597004, - "timestamp-ms": 1555100955770, - "sequence-number": 1, - "summary": { - "operation": "append" - }, - "manifest-list": "s3://a/b/2.avro", - "schema-id": 1 - } - ], - "snapshot-log": [ - { - "snapshot-id": 3051729675574597004, - "timestamp-ms": 1515100955770 - }, - { - "snapshot-id": 3055729675574597004, - "timestamp-ms": 1555100955770 - } - ], - "metadata-log": [] -} From 9690529a959c40b4f10eacca24c2eb66cc83e2f0 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Sat, 20 Jun 2026 10:35:28 -0700 Subject: [PATCH 03/12] test(json_serde): cover default-value serde in json_serde_test Add SchemaField default-value coverage directly to json_serde_test (the binary that exercises json_serde.cc): one test pins the initial-default / write-default wire format (incl. their absence when unset), and one round-trips a field with both defaults through ToJson -> FieldFromJson for every primitive type, exercising the non-trivial date/timestamp/decimal/uuid/binary encodings the default path reuses. --- src/iceberg/test/json_serde_test.cc | 69 +++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/src/iceberg/test/json_serde_test.cc b/src/iceberg/test/json_serde_test.cc index 0dac226e9..3a57375dc 100644 --- a/src/iceberg/test/json_serde_test.cc +++ b/src/iceberg/test/json_serde_test.cc @@ -23,10 +23,12 @@ #include #include +#include "iceberg/expression/literal.h" #include "iceberg/json_serde_internal.h" #include "iceberg/name_mapping.h" #include "iceberg/partition_spec.h" #include "iceberg/schema.h" +#include "iceberg/schema_field.h" #include "iceberg/snapshot.h" #include "iceberg/sort_field.h" #include "iceberg/sort_order.h" @@ -35,9 +37,11 @@ #include "iceberg/table_update.h" #include "iceberg/test/matchers.h" #include "iceberg/transform.h" +#include "iceberg/type.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep #include "iceberg/util/macros.h" // IWYU pragma: keep #include "iceberg/util/timepoint.h" +#include "iceberg/util/uuid.h" namespace iceberg { @@ -71,6 +75,11 @@ Result> FromJsonHelper(const nlohmann::json& json) return NameMappingFromJson(json); } +template <> +Result> FromJsonHelper(const nlohmann::json& json) { + return FieldFromJson(json); +} + // Helper function to reduce duplication in testing template void TestJsonConversion(const T& obj, const nlohmann::json& expected_json) { @@ -85,6 +94,66 @@ void TestJsonConversion(const T& obj, const nlohmann::json& expected_json) { } // namespace +// Pins the wire format produced by ToJson(SchemaField) / FieldFromJson for +// `initial-default` and `write-default`, including the absence of the keys when a +// field carries no defaults. +TEST(JsonInternalTest, SchemaFieldDefaultValues) { + // Both defaults present. + SchemaField with_both(/*field_id=*/1, "id", int32(), /*optional=*/false, /*doc=*/{}, + std::make_shared(Literal::Int(42)), + std::make_shared(Literal::Int(7))); + TestJsonConversion( + with_both, + R"({"id":1,"name":"id","required":true,"type":"int","initial-default":42,"write-default":7})"_json); + + // Only an initial-default; write-default must not appear in the JSON. + SchemaField initial_only(/*field_id=*/2, "name", string(), /*optional=*/true, + /*doc=*/{}, + std::make_shared(Literal::String("n/a")), + /*write_default=*/nullptr); + TestJsonConversion( + initial_only, + R"({"id":2,"name":"name","required":false,"type":"string","initial-default":"n/a"})"_json); + + // No defaults; neither key may appear. + SchemaField no_defaults(/*field_id=*/3, "plain", int32(), /*optional=*/false); + TestJsonConversion(no_defaults, + R"({"id":3,"name":"plain","required":true,"type":"int"})"_json); +} + +// Round-trips a field carrying both defaults through ToJson -> FieldFromJson for +// every primitive type, exercising the per-type single-value serialization the +// default path reuses (date/timestamp/decimal/uuid/binary have non-trivial wire +// encodings). +TEST(JsonInternalTest, SchemaFieldDefaultValuesRoundTripAllTypes) { + ICEBERG_UNWRAP_OR_FAIL(auto uuid_value, + Uuid::FromString("f79c3e09-677c-4bbd-a479-3f349cb785e7")); + std::vector, Literal>> cases; + cases.emplace_back(boolean(), Literal::Boolean(true)); + cases.emplace_back(int32(), Literal::Int(-7)); + cases.emplace_back(int64(), Literal::Long(1234567890123LL)); + cases.emplace_back(float32(), Literal::Float(1.5f)); + cases.emplace_back(float64(), Literal::Double(2.5)); + cases.emplace_back(date(), Literal::Date(19738)); + cases.emplace_back(timestamp(), Literal::Timestamp(1719446400000000LL)); + cases.emplace_back(string(), Literal::String("hello")); + cases.emplace_back(decimal(9, 2), Literal::Decimal(12345, 9, 2)); + cases.emplace_back(fixed(3), Literal::Fixed({0x01, 0x02, 0x03})); + cases.emplace_back(binary(), Literal::Binary({0xDE, 0xAD, 0xBE, 0xEF})); + cases.emplace_back(uuid(), Literal::UUID(uuid_value)); + + int32_t field_id = 1; + for (const auto& [type, literal] : cases) { + SchemaField field(field_id++, "f", type, /*optional=*/false, /*doc=*/{}, + std::make_shared(literal), + std::make_shared(literal)); + auto json = ToJson(field); + ICEBERG_UNWRAP_OR_FAIL(auto parsed, FieldFromJson(json)); + EXPECT_EQ(field, *parsed) + << "round-trip mismatch for type " << type->ToString() << ", json=" << json.dump(); + } +} + TEST(JsonInternalTest, SortField) { auto identity_transform = Transform::Identity(); From d15a0feefd0cf213805427966eadbb1c1b960dc7 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Sat, 20 Jun 2026 11:45:58 -0700 Subject: [PATCH 04/12] test(json_serde): fix clang-format in default-value tests --- src/iceberg/test/json_serde_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/iceberg/test/json_serde_test.cc b/src/iceberg/test/json_serde_test.cc index f8023535c..416d0e520 100644 --- a/src/iceberg/test/json_serde_test.cc +++ b/src/iceberg/test/json_serde_test.cc @@ -150,8 +150,8 @@ TEST(JsonInternalTest, SchemaFieldDefaultValuesRoundTripAllTypes) { std::make_shared(literal)); auto json = ToJson(field); ICEBERG_UNWRAP_OR_FAIL(auto parsed, FieldFromJson(json)); - EXPECT_EQ(field, *parsed) - << "round-trip mismatch for type " << type->ToString() << ", json=" << json.dump(); + EXPECT_EQ(field, *parsed) << "round-trip mismatch for type " << type->ToString() + << ", json=" << json.dump(); } } From 180a9f969b985cfdb6ba160bab711bfe2a057d1c Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Mon, 22 Jun 2026 23:19:34 -0700 Subject: [PATCH 05/12] fix(schema): align v3 default-value validation/serde with spec and Java Addresses wgtmac's review feedback: - Enforce UTC offset for timestamptz / timestamptz_ns default values in FieldFromJson. The shared timestamp parser accepts any offset and silently normalizes to UTC, so C++ could accept default metadata Java rejects and rewrite the offset on write. Added TemporalUtils::IsUtcOffset, which reuses the existing timezone-suffix parser. - ValidateDefault now casts the default literal to the field type (Literal::CastTo) instead of requiring an exact type match, matching Java's Types.NestedField (e.g. an int default on a long field). Uncastable or out-of-range defaults are still rejected. - Documented that non-primitive (struct/list/map) defaults remain unsupported, matching Java's current model. - Extended the json_serde round-trip tests with time/timestamptz/timestamp_ns/ timestamptz_ns and a negative case for non-UTC timestamptz defaults; added a schema test for the cast-to-field-type behavior. --- build-rest-tests.sh | 35 +++++++++++++++++++++++++++++ src/iceberg/json_serde.cc | 34 ++++++++++++++++++++++++++++ src/iceberg/schema_field.cc | 17 +++++++++++--- src/iceberg/test/json_serde_test.cc | 21 +++++++++++++++++ src/iceberg/test/schema_test.cc | 17 ++++++++++++++ src/iceberg/util/temporal_util.cc | 5 +++++ src/iceberg/util/temporal_util.h | 12 ++++++++++ 7 files changed, 138 insertions(+), 3 deletions(-) create mode 100755 build-rest-tests.sh diff --git a/build-rest-tests.sh b/build-rest-tests.sh new file mode 100755 index 000000000..40ca233f4 --- /dev/null +++ b/build-rest-tests.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# +# Build and run the REST catalog unit tests (target: rest_catalog_test, which +# includes rest_json_serde_test.cc and rest_file_io_test.cc). +# +# Fast path: uses Homebrew LLVM (C++23 + libc++) and Homebrew apache-arrow, so +# Arrow is found via find_package instead of being compiled from source. +# +# One-time install: +# brew install cmake ninja llvm apache-arrow +# +set -euo pipefail +cd "$(dirname "$0")" + +LLVM_PREFIX="$(brew --prefix llvm)" +export SDKROOT="${SDKROOT:-$(xcrun --show-sdk-path)}" # help LLVM clang find the macOS SDK + +# NOTE: RelWithDebInfo (not Debug) on purpose. The vendored nanoarrow applies +# `-Werror -Wpedantic` only in Debug ($<$:...>), and Homebrew's +# Clang 22 flags nanoarrow's use of `__COUNTER__` as a C2y extension, which then +# becomes a fatal error. Non-Debug configs drop those flags. +cmake -S . -B build -G Ninja \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DCMAKE_CXX_COMPILER="${LLVM_PREFIX}/bin/clang++" \ + -DCMAKE_C_COMPILER="${LLVM_PREFIX}/bin/clang" \ + -DCMAKE_PREFIX_PATH="$(brew --prefix apache-arrow);$(brew --prefix)" \ + -DICEBERG_BUILD_REST=ON \ + -DICEBERG_BUILD_TESTS=ON + # If the Homebrew Arrow version mismatches and fails to compile, force the + # vendored Arrow 24.0.0 build instead by adding: + # -DFETCHCONTENT_TRY_FIND_PACKAGE_MODE=NEVER + +cmake --build build --target rest_catalog_test -j + +ctest --test-dir build -R rest_catalog_test --output-on-failure diff --git a/src/iceberg/json_serde.cc b/src/iceberg/json_serde.cc index 45da0bd27..24fb591e1 100644 --- a/src/iceberg/json_serde.cc +++ b/src/iceberg/json_serde.cc @@ -51,6 +51,7 @@ #include "iceberg/util/json_util_internal.h" #include "iceberg/util/macros.h" #include "iceberg/util/string_util.h" +#include "iceberg/util/temporal_util.h" #include "iceberg/util/timepoint.h" namespace iceberg { @@ -564,6 +565,35 @@ Result> TypeFromJson(const nlohmann::json& json) { } } +namespace { + +// The spec's JSON single-value form for `timestamptz` / `timestamptz_ns` default +// values requires a UTC offset. The shared timestamp parser accepts any offset and +// silently normalizes to UTC, which would let C++ accept default metadata that Java +// rejects and then rewrite the offset on serialization. Enforce UTC for these +// defaults at parse time, where the original offset is still visible. +Status ValidateTimestamptzDefaultIsUtc(const Type& type, const nlohmann::json& value) { + const auto type_id = type.type_id(); + if (type_id != TypeId::kTimestampTz && type_id != TypeId::kTimestampTzNs) { + return {}; + } + if (!value.is_string()) { + // Let LiteralFromJson report the type mismatch. + return {}; + } + const auto str = value.get(); + ICEBERG_ASSIGN_OR_RAISE(bool is_utc, TemporalUtils::IsUtcOffset(str)); + if (!is_utc) { + return JsonParseError( + "Invalid timestamptz default '{}' for {}: default values must use UTC " + "(offset 'Z' or '+00:00')", + str, type.ToString()); + } + return {}; +} + +} // namespace + Result> FieldFromJson(const nlohmann::json& json) { ICEBERG_ASSIGN_OR_RAISE( auto type, GetJsonValue(json, kType).and_then(TypeFromJson)); @@ -578,12 +608,16 @@ Result> FieldFromJson(const nlohmann::json& json) { std::shared_ptr initial_default; if (initial_default_json.has_value()) { + ICEBERG_RETURN_UNEXPECTED( + ValidateTimestamptzDefaultIsUtc(*type, *initial_default_json)); ICEBERG_ASSIGN_OR_RAISE(Literal literal, LiteralFromJson(*initial_default_json, type.get())); initial_default = std::make_shared(std::move(literal)); } std::shared_ptr write_default; if (write_default_json.has_value()) { + ICEBERG_RETURN_UNEXPECTED( + ValidateTimestamptzDefaultIsUtc(*type, *write_default_json)); ICEBERG_ASSIGN_OR_RAISE(Literal literal, LiteralFromJson(*write_default_json, type.get())); write_default = std::make_shared(std::move(literal)); diff --git a/src/iceberg/schema_field.cc b/src/iceberg/schema_field.cc index 8c3fb9be6..d2a7a8e4f 100644 --- a/src/iceberg/schema_field.cc +++ b/src/iceberg/schema_field.cc @@ -93,14 +93,25 @@ Status ValidateDefault(const SchemaField& field, const Literal& value, return InvalidSchema("Invalid {} value for {}: must be a non-null value", kind, field.name()); } + // Defaults are only supported on primitive fields. The spec also permits JSON + // single-value defaults for struct/list/map (e.g. an empty struct `{}` whose + // sub-field defaults live in field metadata); that matches the current Java model's + // gap and is left as a follow-up. if (field.type() == nullptr || !field.type()->is_primitive()) { return InvalidSchema( "Invalid {} value for {}: default values are only supported for primitive types", kind, field.name()); } - if (*value.type() != *field.type()) { - return InvalidSchema("{} of field {} has type {} but expected {}", kind, field.name(), - *value.type(), *field.type()); + // Match Java (Types.NestedField), which casts the default literal to the field type + // instead of requiring an exact type match (e.g. an int default on a long field, or + // a string default on a date/timestamp/uuid field). Reject only defaults that cannot + // be cast to the field type or fall outside its range (CastTo signals out-of-range as + // an above-max/below-min sentinel). + auto field_type = std::static_pointer_cast(field.type()); + auto cast = value.CastTo(field_type); + if (!cast.has_value() || cast->IsAboveMax() || cast->IsBelowMin()) { + return InvalidSchema("{} of field {} has type {} that cannot be cast to {}", kind, + field.name(), *value.type(), *field.type()); } return {}; } diff --git a/src/iceberg/test/json_serde_test.cc b/src/iceberg/test/json_serde_test.cc index 416d0e520..3951a806c 100644 --- a/src/iceberg/test/json_serde_test.cc +++ b/src/iceberg/test/json_serde_test.cc @@ -136,7 +136,11 @@ TEST(JsonInternalTest, SchemaFieldDefaultValuesRoundTripAllTypes) { cases.emplace_back(float32(), Literal::Float(1.5f)); cases.emplace_back(float64(), Literal::Double(2.5)); cases.emplace_back(date(), Literal::Date(19738)); + cases.emplace_back(time(), Literal::Time(43200000000LL)); cases.emplace_back(timestamp(), Literal::Timestamp(1719446400000000LL)); + cases.emplace_back(timestamp_tz(), Literal::TimestampTz(1719446400000000LL)); + cases.emplace_back(timestamp_ns(), Literal::TimestampNs(1719446400000000123LL)); + cases.emplace_back(timestamptz_ns(), Literal::TimestampTzNs(1719446400000000123LL)); cases.emplace_back(string(), Literal::String("hello")); cases.emplace_back(decimal(9, 2), Literal::Decimal(12345, 9, 2)); cases.emplace_back(fixed(3), Literal::Fixed({0x01, 0x02, 0x03})); @@ -155,6 +159,23 @@ TEST(JsonInternalTest, SchemaFieldDefaultValuesRoundTripAllTypes) { } } +// The spec only permits UTC offsets for timestamptz / timestamptz_ns default values. +// A non-UTC offset (which the shared parser would silently normalize) must be rejected, +// while the UTC form is accepted. +TEST(JsonInternalTest, SchemaFieldRejectsNonUtcTimestamptzDefault) { + auto non_utc = nlohmann::json::parse( + R"({"id":1,"name":"ts","required":true,"type":"timestamptz","initial-default":"2024-06-27T05:00:00+05:00"})"); + EXPECT_FALSE(FieldFromJson(non_utc).has_value()); + + auto non_utc_ns = nlohmann::json::parse( + R"({"id":1,"name":"ts","required":true,"type":"timestamptz_ns","write-default":"2024-06-27T05:00:00-08:00"})"); + EXPECT_FALSE(FieldFromJson(non_utc_ns).has_value()); + + auto utc = nlohmann::json::parse( + R"({"id":1,"name":"ts","required":true,"type":"timestamptz","initial-default":"2024-06-27T00:00:00+00:00"})"); + EXPECT_TRUE(FieldFromJson(utc).has_value()); +} + TEST(JsonInternalTest, SortField) { auto identity_transform = Transform::Identity(); diff --git a/src/iceberg/test/schema_test.cc b/src/iceberg/test/schema_test.cc index 28f7eb9a0..d07cf71a8 100644 --- a/src/iceberg/test/schema_test.cc +++ b/src/iceberg/test/schema_test.cc @@ -167,6 +167,23 @@ TEST(SchemaTest, ValidateRejectsMismatchedDefaultValue) { EXPECT_THAT(status, iceberg::HasErrorMessage("write-default")); } +TEST(SchemaTest, ValidateCastsDefaultToFieldType) { + // Matching Java, the default literal is cast to the field type rather than required to + // match exactly: an int default on a long field is accepted (int -> long). + iceberg::Schema widened({iceberg::SchemaField( + 1, "id", iceberg::int64(), false, /*doc=*/{}, + std::make_shared(iceberg::Literal::Int(34)))}); + EXPECT_THAT(widened.Validate(iceberg::TableMetadata::kSupportedTableFormatVersion), + iceberg::IsOk()); + + // A default whose type cannot be cast to the field type is still rejected. + iceberg::Schema uncastable({iceberg::SchemaField( + 1, "id", iceberg::int32(), false, /*doc=*/{}, + std::make_shared(iceberg::Literal::String("oops")))}); + EXPECT_THAT(uncastable.Validate(iceberg::TableMetadata::kSupportedTableFormatVersion), + iceberg::IsError(iceberg::ErrorKind::kInvalidSchema)); +} + TEST(SchemaTest, ReassignIdsPreservesDefaultValues) { // Reassigning field IDs rebuilds each SchemaField, so the rebuild must carry the // default values over to the field with the new ID. diff --git a/src/iceberg/util/temporal_util.cc b/src/iceberg/util/temporal_util.cc index e00ee7cf5..9e6a93cfd 100644 --- a/src/iceberg/util/temporal_util.cc +++ b/src/iceberg/util/temporal_util.cc @@ -444,6 +444,11 @@ Result TemporalUtils::ParseTimestampNsWithZone(std::string_view str) { /*units_per_micro=*/internal::kNanosPerMicro); } +Result TemporalUtils::IsUtcOffset(std::string_view str) { + ICEBERG_ASSIGN_OR_RAISE(auto timestamp_with_offset, ParseTimestampWithZoneSuffix(str)); + return timestamp_with_offset.second == 0; +} + #define DISPATCH_EXTRACT_YEAR(type_id) \ case type_id: \ return ExtractYearImpl(literal); diff --git a/src/iceberg/util/temporal_util.h b/src/iceberg/util/temporal_util.h index 2121f565d..cc7a16319 100644 --- a/src/iceberg/util/temporal_util.h +++ b/src/iceberg/util/temporal_util.h @@ -125,6 +125,18 @@ class ICEBERG_EXPORT TemporalUtils { /// \return The number of nanoseconds since epoch (UTC), or an error. static Result ParseTimestampNsWithZone(std::string_view str); + /// \brief Reports whether a timestamp-with-zone string uses a UTC offset. + /// + /// The ParseTimestamp*WithZone parsers accept any offset and silently normalize it + /// to UTC. The spec's JSON single-value form for `timestamptz` / `timestamptz_ns` + /// default values only permits UTC ("Z" or "+00:00"), so callers that must enforce + /// that rule check the offset here before parsing. + /// + /// \param str The timestamp-with-zone string to inspect. + /// \return true if the offset is UTC, false if it is a non-UTC offset, or an error + /// if the timezone suffix cannot be parsed. + static Result IsUtcOffset(std::string_view str); + /// \brief Extract a date or timestamp year, as years from 1970 static Result ExtractYear(const Literal& literal); From 2052e4fbc9e4c7e490968049d9758c5729f281df Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Mon, 22 Jun 2026 23:20:55 -0700 Subject: [PATCH 06/12] chore: drop accidentally committed build-rest-tests.sh --- build-rest-tests.sh | 35 ----------------------------------- 1 file changed, 35 deletions(-) delete mode 100755 build-rest-tests.sh diff --git a/build-rest-tests.sh b/build-rest-tests.sh deleted file mode 100755 index 40ca233f4..000000000 --- a/build-rest-tests.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env bash -# -# Build and run the REST catalog unit tests (target: rest_catalog_test, which -# includes rest_json_serde_test.cc and rest_file_io_test.cc). -# -# Fast path: uses Homebrew LLVM (C++23 + libc++) and Homebrew apache-arrow, so -# Arrow is found via find_package instead of being compiled from source. -# -# One-time install: -# brew install cmake ninja llvm apache-arrow -# -set -euo pipefail -cd "$(dirname "$0")" - -LLVM_PREFIX="$(brew --prefix llvm)" -export SDKROOT="${SDKROOT:-$(xcrun --show-sdk-path)}" # help LLVM clang find the macOS SDK - -# NOTE: RelWithDebInfo (not Debug) on purpose. The vendored nanoarrow applies -# `-Werror -Wpedantic` only in Debug ($<$:...>), and Homebrew's -# Clang 22 flags nanoarrow's use of `__COUNTER__` as a C2y extension, which then -# becomes a fatal error. Non-Debug configs drop those flags. -cmake -S . -B build -G Ninja \ - -DCMAKE_BUILD_TYPE=RelWithDebInfo \ - -DCMAKE_CXX_COMPILER="${LLVM_PREFIX}/bin/clang++" \ - -DCMAKE_C_COMPILER="${LLVM_PREFIX}/bin/clang" \ - -DCMAKE_PREFIX_PATH="$(brew --prefix apache-arrow);$(brew --prefix)" \ - -DICEBERG_BUILD_REST=ON \ - -DICEBERG_BUILD_TESTS=ON - # If the Homebrew Arrow version mismatches and fails to compile, force the - # vendored Arrow 24.0.0 build instead by adding: - # -DFETCHCONTENT_TRY_FIND_PACKAGE_MODE=NEVER - -cmake --build build --target rest_catalog_test -j - -ctest --test-dir build -R rest_catalog_test --output-on-failure From 54d9636223532377e941f48c8181d1b9285a36f2 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Tue, 23 Jun 2026 11:44:24 -0700 Subject: [PATCH 07/12] test(temporal): add unit test for TemporalUtils::IsUtcOffset Directly cover the UTC/non-UTC/unparseable offset cases the timestamptz default-value enforcement relies on. --- src/iceberg/test/temporal_util_test.cc | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/src/iceberg/test/temporal_util_test.cc b/src/iceberg/test/temporal_util_test.cc index 0d4426b0b..791d02c56 100644 --- a/src/iceberg/test/temporal_util_test.cc +++ b/src/iceberg/test/temporal_util_test.cc @@ -53,6 +53,31 @@ TEST(TemporalUtilTest, ParseTimestampNsChecksInt64Bounds) { IsError(ErrorKind::kInvalidArgument)); } +TEST(TemporalUtilTest, IsUtcOffset) { + // UTC offsets: "Z", "+00:00" and "-00:00". + ICEBERG_UNWRAP_OR_FAIL(auto z, TemporalUtils::IsUtcOffset("2024-06-27T00:00:00Z")); + EXPECT_TRUE(z); + ICEBERG_UNWRAP_OR_FAIL(auto plus_zero, + TemporalUtils::IsUtcOffset("2024-06-27T00:00:00+00:00")); + EXPECT_TRUE(plus_zero); + ICEBERG_UNWRAP_OR_FAIL(auto minus_zero, + TemporalUtils::IsUtcOffset("2024-06-27T00:00:00-00:00")); + EXPECT_TRUE(minus_zero); + + // Non-UTC offsets. + ICEBERG_UNWRAP_OR_FAIL(auto plus_five, + TemporalUtils::IsUtcOffset("2024-06-27T05:00:00+05:00")); + EXPECT_FALSE(plus_five); + ICEBERG_UNWRAP_OR_FAIL(auto minus_eight, + TemporalUtils::IsUtcOffset("2024-06-27T00:00:00-08:00")); + EXPECT_FALSE(minus_eight); + + // A missing or unparseable timezone suffix is an error. + EXPECT_THAT(TemporalUtils::IsUtcOffset("2024-06-27T00:00:00"), + IsError(ErrorKind::kInvalidArgument)); + EXPECT_THAT(TemporalUtils::IsUtcOffset(""), IsError(ErrorKind::kInvalidArgument)); +} + TEST(TemporalUtilTest, ParseTimestampNsRejectsMoreThanNineFractionalDigits) { EXPECT_THAT(TemporalUtils::ParseTimestampNs("2026-01-01T00:00:01.0000010011"), IsError(ErrorKind::kInvalidArgument)); From 925611ae59a4a9b1afdd4131a8b487b6556747ee Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Tue, 23 Jun 2026 13:57:26 -0700 Subject: [PATCH 08/12] ci: re-trigger CI The AMD64 Windows job failed in its cleanup step (rm -rf example/build: Device or resource busy) after the example compiled and linked successfully; re-run CI. From 0b4d4524363a69d008e1b499cd7e42c3c29693f6 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Fri, 26 Jun 2026 10:15:07 -0700 Subject: [PATCH 09/12] fix(schema): address review nits on default-value validation/serde - ValidateDefault: use internal::checked_pointer_cast and ICEBERG_ASSIGN_OR_RAISE so an uncastable default surfaces the original CastTo error instead of a generic message; keep the out-of-range (sentinel) case as an invalid-schema error. - json_serde: error out directly when a timestamptz default is not a string, drop the offset-specific wording from the UTC error message (it also accepts -00:00), and use auto for the parsed default JSON. - Update schema tests for the propagated cast error and the out-of-range case. --- src/iceberg/json_serde.cc | 11 +++++------ src/iceberg/schema_field.cc | 11 ++++++----- src/iceberg/test/schema_test.cc | 15 +++++++++------ 3 files changed, 20 insertions(+), 17 deletions(-) diff --git a/src/iceberg/json_serde.cc b/src/iceberg/json_serde.cc index 24fb591e1..6790e9861 100644 --- a/src/iceberg/json_serde.cc +++ b/src/iceberg/json_serde.cc @@ -578,15 +578,14 @@ Status ValidateTimestamptzDefaultIsUtc(const Type& type, const nlohmann::json& v return {}; } if (!value.is_string()) { - // Let LiteralFromJson report the type mismatch. - return {}; + return JsonParseError("Invalid timestamptz default {} for {}: expected a string", + SafeDumpJson(value), type.ToString()); } const auto str = value.get(); ICEBERG_ASSIGN_OR_RAISE(bool is_utc, TemporalUtils::IsUtcOffset(str)); if (!is_utc) { return JsonParseError( - "Invalid timestamptz default '{}' for {}: default values must use UTC " - "(offset 'Z' or '+00:00')", + "Invalid timestamptz default '{}' for {}: default values must use a UTC offset", str, type.ToString()); } return {}; @@ -601,9 +600,9 @@ Result> FieldFromJson(const nlohmann::json& json) { ICEBERG_ASSIGN_OR_RAISE(auto name, GetJsonValue(json, kName)); ICEBERG_ASSIGN_OR_RAISE(auto required, GetJsonValue(json, kRequired)); ICEBERG_ASSIGN_OR_RAISE(auto doc, GetJsonValueOrDefault(json, kDoc)); - ICEBERG_ASSIGN_OR_RAISE(std::optional initial_default_json, + ICEBERG_ASSIGN_OR_RAISE(auto initial_default_json, GetJsonValueOptional(json, kInitialDefault)); - ICEBERG_ASSIGN_OR_RAISE(std::optional write_default_json, + ICEBERG_ASSIGN_OR_RAISE(auto write_default_json, GetJsonValueOptional(json, kWriteDefault)); std::shared_ptr initial_default; diff --git a/src/iceberg/schema_field.cc b/src/iceberg/schema_field.cc index d2a7a8e4f..144f75799 100644 --- a/src/iceberg/schema_field.cc +++ b/src/iceberg/schema_field.cc @@ -25,6 +25,7 @@ #include "iceberg/expression/literal.h" #include "iceberg/type.h" +#include "iceberg/util/checked_cast.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep #include "iceberg/util/macros.h" @@ -107,11 +108,11 @@ Status ValidateDefault(const SchemaField& field, const Literal& value, // a string default on a date/timestamp/uuid field). Reject only defaults that cannot // be cast to the field type or fall outside its range (CastTo signals out-of-range as // an above-max/below-min sentinel). - auto field_type = std::static_pointer_cast(field.type()); - auto cast = value.CastTo(field_type); - if (!cast.has_value() || cast->IsAboveMax() || cast->IsBelowMin()) { - return InvalidSchema("{} of field {} has type {} that cannot be cast to {}", kind, - field.name(), *value.type(), *field.type()); + auto field_type = internal::checked_pointer_cast(field.type()); + ICEBERG_ASSIGN_OR_RAISE(auto cast, value.CastTo(field_type)); + if (cast.IsAboveMax() || cast.IsBelowMin()) { + return InvalidSchema("{} of field {} ({}) is out of range for {}", kind, field.name(), + *value.type(), *field.type()); } return {}; } diff --git a/src/iceberg/test/schema_test.cc b/src/iceberg/test/schema_test.cc index d07cf71a8..2db69a551 100644 --- a/src/iceberg/test/schema_test.cc +++ b/src/iceberg/test/schema_test.cc @@ -162,9 +162,11 @@ TEST(SchemaTest, ValidateRejectsMismatchedDefaultValue) { 1, "id", iceberg::int32(), false, /*doc=*/{}, /*initial_default=*/nullptr, std::make_shared(iceberg::Literal::String("oops")))}); + // The default literal is cast to the field type; an uncastable type surfaces the + // original cast error. auto status = schema.Validate(iceberg::TableMetadata::kSupportedTableFormatVersion); - ASSERT_THAT(status, iceberg::IsError(iceberg::ErrorKind::kInvalidSchema)); - EXPECT_THAT(status, iceberg::HasErrorMessage("write-default")); + ASSERT_THAT(status, iceberg::IsError(iceberg::ErrorKind::kNotSupported)); + EXPECT_THAT(status, iceberg::HasErrorMessage("Cast from String")); } TEST(SchemaTest, ValidateCastsDefaultToFieldType) { @@ -176,11 +178,12 @@ TEST(SchemaTest, ValidateCastsDefaultToFieldType) { EXPECT_THAT(widened.Validate(iceberg::TableMetadata::kSupportedTableFormatVersion), iceberg::IsOk()); - // A default whose type cannot be cast to the field type is still rejected. - iceberg::Schema uncastable({iceberg::SchemaField( + // A default outside the field type's range is rejected (CastTo returns an + // above-max/below-min sentinel). + iceberg::Schema out_of_range({iceberg::SchemaField( 1, "id", iceberg::int32(), false, /*doc=*/{}, - std::make_shared(iceberg::Literal::String("oops")))}); - EXPECT_THAT(uncastable.Validate(iceberg::TableMetadata::kSupportedTableFormatVersion), + std::make_shared(iceberg::Literal::Long(5'000'000'000)))}); + EXPECT_THAT(out_of_range.Validate(iceberg::TableMetadata::kSupportedTableFormatVersion), iceberg::IsError(iceberg::ErrorKind::kInvalidSchema)); } From b252b725e442b8271176542a3615b4239ac460e1 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Fri, 26 Jun 2026 11:40:29 -0700 Subject: [PATCH 10/12] refactor(serde): return Result from ToJson for schema/type/metadata serializers Column default values make single-value (Literal) serialization fallible, so the schema/type serializers must propagate errors instead of throwing. Change ToJson for SchemaField, Type, Schema, TableMetadata and TableUpdate (plus the REST CreateTableRequest/CommitTableRequest/LoadTableResult/CommitTableResponse, which embed them) to return Result, replacing the ICEBERG_ASSIGN_OR_THROW in SchemaField default serialization with ICEBERG_ASSIGN_OR_RAISE. - The shared ToJsonList template (also used by infallible serializers) is left unchanged; TableMetadata serializes its schema list with an explicit loop. - REST adds an ICEBERG_DECLARE_JSON_SERDE_FALLIBLE macro variant for the four models that embed a schema/metadata; the other REST models keep the bare-json ToJson. - Callers bottom out at existing Result boundaries (ToJsonString wrappers, rest_catalog.cc, TableMetadataUtil::Write). - Tests unwrap via ICEBERG_UNWRAP_OR_FAIL. --- src/iceberg/catalog/rest/json_serde.cc | 17 ++-- .../catalog/rest/json_serde_internal.h | 19 +++- src/iceberg/catalog/rest/rest_catalog.cc | 6 +- src/iceberg/json_serde.cc | 49 +++++----- src/iceberg/json_serde_internal.h | 10 +-- src/iceberg/table_metadata.cc | 2 +- src/iceberg/test/json_serde_test.cc | 89 ++++++++++++------- src/iceberg/test/metadata_serde_test.cc | 2 +- src/iceberg/test/schema_json_test.cc | 31 ++++--- 9 files changed, 143 insertions(+), 82 deletions(-) diff --git a/src/iceberg/catalog/rest/json_serde.cc b/src/iceberg/catalog/rest/json_serde.cc index ac80e9f08..49dbcdd4c 100644 --- a/src/iceberg/catalog/rest/json_serde.cc +++ b/src/iceberg/catalog/rest/json_serde.cc @@ -690,10 +690,10 @@ Result RenameTableRequestFromJson(const nlohmann::json& json } // LoadTableResult (used by CreateTableResponse, LoadTableResponse) -nlohmann::json ToJson(const LoadTableResult& result) { +Result ToJson(const LoadTableResult& result) { nlohmann::json json; SetOptionalStringField(json, kMetadataLocation, result.metadata_location); - json[kMetadata] = ToJson(*result.metadata); + ICEBERG_ASSIGN_OR_RAISE(json[kMetadata], ToJson(*result.metadata)); SetContainerField(json, kConfig, result.config); return json; } @@ -820,12 +820,12 @@ Result ListTablesResponseFromJson(const nlohmann::json& json return response; } -nlohmann::json ToJson(const CreateTableRequest& request) { +Result ToJson(const CreateTableRequest& request) { nlohmann::json json; json[kName] = request.name; SetOptionalStringField(json, kLocation, request.location); if (request.schema) { - json[kSchema] = ToJson(*request.schema); + ICEBERG_ASSIGN_OR_RAISE(json[kSchema], ToJson(*request.schema)); } if (request.partition_spec) { json[kPartitionSpec] = ToJson(*request.partition_spec); @@ -872,7 +872,7 @@ Result CreateTableRequestFromJson(const nlohmann::json& json } // CommitTableRequest serialization -nlohmann::json ToJson(const CommitTableRequest& request) { +Result ToJson(const CommitTableRequest& request) { nlohmann::json json; if (!request.identifier.name.empty()) { json[kIdentifier] = ToJson(request.identifier); @@ -886,7 +886,8 @@ nlohmann::json ToJson(const CommitTableRequest& request) { nlohmann::json updates_json = nlohmann::json::array(); for (const auto& update : request.updates) { - updates_json.push_back(ToJson(*update)); + ICEBERG_ASSIGN_OR_RAISE(auto update_json, ToJson(*update)); + updates_json.push_back(std::move(update_json)); } json[kUpdates] = std::move(updates_json); @@ -932,11 +933,11 @@ Result CommitTableRequestFromJson(const nlohmann::json& json } // CommitTableResponse serialization -nlohmann::json ToJson(const CommitTableResponse& response) { +Result ToJson(const CommitTableResponse& response) { nlohmann::json json; json[kMetadataLocation] = response.metadata_location; if (response.metadata) { - json[kMetadata] = ToJson(*response.metadata); + ICEBERG_ASSIGN_OR_RAISE(json[kMetadata], ToJson(*response.metadata)); } return json; } diff --git a/src/iceberg/catalog/rest/json_serde_internal.h b/src/iceberg/catalog/rest/json_serde_internal.h index 61d2eb6e0..ba05704c6 100644 --- a/src/iceberg/catalog/rest/json_serde_internal.h +++ b/src/iceberg/catalog/rest/json_serde_internal.h @@ -46,6 +46,16 @@ Result FromJson(const nlohmann::json& json); \ ICEBERG_REST_EXPORT nlohmann::json ToJson(const Model& model); +// Same as ICEBERG_DECLARE_JSON_SERDE, but ToJson returns Result because the model +// embeds a Schema/TableMetadata whose default-value serialization can fail. +#define ICEBERG_DECLARE_JSON_SERDE_FALLIBLE(Model) \ + ICEBERG_REST_EXPORT Result Model##FromJson(const nlohmann::json& json); \ + \ + template <> \ + ICEBERG_REST_EXPORT Result FromJson(const nlohmann::json& json); \ + \ + ICEBERG_REST_EXPORT Result ToJson(const Model& model); + /// \note Don't forget to add `ICEBERG_DEFINE_FROM_JSON` to the end of /// `json_internal.cc` to define the `FromJson` function for the model. ICEBERG_DECLARE_JSON_SERDE(CatalogConfig) @@ -57,15 +67,16 @@ ICEBERG_DECLARE_JSON_SERDE(GetNamespaceResponse) ICEBERG_DECLARE_JSON_SERDE(UpdateNamespacePropertiesRequest) ICEBERG_DECLARE_JSON_SERDE(UpdateNamespacePropertiesResponse) ICEBERG_DECLARE_JSON_SERDE(ListTablesResponse) -ICEBERG_DECLARE_JSON_SERDE(LoadTableResult) +ICEBERG_DECLARE_JSON_SERDE_FALLIBLE(LoadTableResult) ICEBERG_DECLARE_JSON_SERDE(RegisterTableRequest) ICEBERG_DECLARE_JSON_SERDE(RenameTableRequest) -ICEBERG_DECLARE_JSON_SERDE(CreateTableRequest) -ICEBERG_DECLARE_JSON_SERDE(CommitTableRequest) -ICEBERG_DECLARE_JSON_SERDE(CommitTableResponse) +ICEBERG_DECLARE_JSON_SERDE_FALLIBLE(CreateTableRequest) +ICEBERG_DECLARE_JSON_SERDE_FALLIBLE(CommitTableRequest) +ICEBERG_DECLARE_JSON_SERDE_FALLIBLE(CommitTableResponse) ICEBERG_DECLARE_JSON_SERDE(OAuthTokenResponse) #undef ICEBERG_DECLARE_JSON_SERDE +#undef ICEBERG_DECLARE_JSON_SERDE_FALLIBLE ICEBERG_REST_EXPORT Result PlanTableScanResponseFromJson( const nlohmann::json& json, diff --git a/src/iceberg/catalog/rest/rest_catalog.cc b/src/iceberg/catalog/rest/rest_catalog.cc index 7c79b94d1..2f8035477 100644 --- a/src/iceberg/catalog/rest/rest_catalog.cc +++ b/src/iceberg/catalog/rest/rest_catalog.cc @@ -666,7 +666,8 @@ Result RestCatalog::CreateTableInternal( .properties = properties, }; - ICEBERG_ASSIGN_OR_RAISE(auto json_request, ToJsonString(ToJson(request))); + ICEBERG_ASSIGN_OR_RAISE(auto request_json, ToJson(request)); + ICEBERG_ASSIGN_OR_RAISE(auto json_request, ToJsonString(request_json)); ICEBERG_ASSIGN_OR_RAISE(const auto response, client_->Post(path, json_request, /*headers=*/{}, *TableErrorHandler::Instance(), session)); @@ -709,7 +710,8 @@ Result RestCatalog::UpdateTableInternal( request.updates.push_back(update->Clone()); } - ICEBERG_ASSIGN_OR_RAISE(auto json_request, ToJsonString(ToJson(request))); + ICEBERG_ASSIGN_OR_RAISE(auto request_json, ToJson(request)); + ICEBERG_ASSIGN_OR_RAISE(auto json_request, ToJsonString(request_json)); ICEBERG_ASSIGN_OR_RAISE(const auto response, client_->Post(path, json_request, /*headers=*/{}, *TableErrorHandler::Instance(), session)); diff --git a/src/iceberg/json_serde.cc b/src/iceberg/json_serde.cc index 6790e9861..4deb45330 100644 --- a/src/iceberg/json_serde.cc +++ b/src/iceberg/json_serde.cc @@ -301,28 +301,26 @@ Result> SortOrderFromJson(const nlohmann::json& json) return SortOrder::Make(order_id, std::move(sort_fields)); } -nlohmann::json ToJson(const SchemaField& field) { +Result ToJson(const SchemaField& field) { nlohmann::json json; json[kId] = field.field_id(); json[kName] = field.name(); json[kRequired] = !field.optional(); - json[kType] = ToJson(*field.type()); + ICEBERG_ASSIGN_OR_RAISE(json[kType], ToJson(*field.type())); if (!field.doc().empty()) { json[kDoc] = field.doc(); } - // Defaults are validated to be primitive literals matching the field type, so - // single-value serialization cannot fail here. if (field.initial_default().has_value()) { - ICEBERG_ASSIGN_OR_THROW(json[kInitialDefault], + ICEBERG_ASSIGN_OR_RAISE(json[kInitialDefault], ToJson(field.initial_default()->get())); } if (field.write_default().has_value()) { - ICEBERG_ASSIGN_OR_THROW(json[kWriteDefault], ToJson(field.write_default()->get())); + ICEBERG_ASSIGN_OR_RAISE(json[kWriteDefault], ToJson(field.write_default()->get())); } return json; } -nlohmann::json ToJson(const Type& type) { +Result ToJson(const Type& type) { switch (type.type_id()) { case TypeId::kStruct: { const auto& struct_type = internal::checked_cast(type); @@ -330,7 +328,8 @@ nlohmann::json ToJson(const Type& type) { json[kType] = kStruct; nlohmann::json fields_json = nlohmann::json::array(); for (const auto& field : struct_type.fields()) { - fields_json.push_back(ToJson(field)); + ICEBERG_ASSIGN_OR_RAISE(auto field_json, ToJson(field)); + fields_json.push_back(std::move(field_json)); } json[kFields] = fields_json; return json; @@ -343,7 +342,7 @@ nlohmann::json ToJson(const Type& type) { const auto& element_field = list_type.fields().front(); json[kElementId] = element_field.field_id(); json[kElementRequired] = !element_field.optional(); - json[kElement] = ToJson(*element_field.type()); + ICEBERG_ASSIGN_OR_RAISE(json[kElement], ToJson(*element_field.type())); return json; } case TypeId::kMap: { @@ -353,12 +352,12 @@ nlohmann::json ToJson(const Type& type) { const auto& key_field = map_type.key(); json[kKeyId] = key_field.field_id(); - json[kKey] = ToJson(*key_field.type()); + ICEBERG_ASSIGN_OR_RAISE(json[kKey], ToJson(*key_field.type())); const auto& value_field = map_type.value(); json[kValueId] = value_field.field_id(); json[kValueRequired] = !value_field.optional(); - json[kValue] = ToJson(*value_field.type()); + ICEBERG_ASSIGN_OR_RAISE(json[kValue], ToJson(*value_field.type())); return json; } case TypeId::kBoolean: @@ -404,8 +403,9 @@ nlohmann::json ToJson(const Type& type) { std::unreachable(); } -nlohmann::json ToJson(const Schema& schema) { - nlohmann::json json = ToJson(internal::checked_cast(schema)); +Result ToJson(const Schema& schema) { + ICEBERG_ASSIGN_OR_RAISE(nlohmann::json json, + ToJson(internal::checked_cast(schema))); json[kSchemaId] = schema.schema_id(); if (!schema.IdentifierFieldIds().empty()) { json[kIdentifierFieldIds] = schema.IdentifierFieldIds(); @@ -414,7 +414,8 @@ nlohmann::json ToJson(const Schema& schema) { } Result ToJsonString(const Schema& schema) { - return ToJsonString(ToJson(schema)); + ICEBERG_ASSIGN_OR_RAISE(auto json, ToJson(schema)); + return ToJsonString(json); } nlohmann::json ToJson(const SnapshotRef& ref) { @@ -956,7 +957,7 @@ Result EncryptedKeyFromJson(const nlohmann::json& json) { }; } -nlohmann::json ToJson(const TableMetadata& table_metadata) { +Result ToJson(const TableMetadata& table_metadata) { nlohmann::json json; json[kFormatVersion] = table_metadata.format_version; @@ -974,7 +975,7 @@ nlohmann::json ToJson(const TableMetadata& table_metadata) { if (table_metadata.format_version == 1) { for (const auto& schema : table_metadata.schemas) { if (schema->schema_id() == table_metadata.current_schema_id) { - json[kSchema] = ToJson(*schema); + ICEBERG_ASSIGN_OR_RAISE(json[kSchema], ToJson(*schema)); break; } } @@ -982,7 +983,14 @@ nlohmann::json ToJson(const TableMetadata& table_metadata) { // write the current schema ID and schema list json[kCurrentSchemaId] = table_metadata.current_schema_id; - json[kSchemas] = ToJsonList(table_metadata.schemas); + // Schemas can carry fallible default-value serialization, so the shared ToJsonList + // helper (which assumes infallible ToJson) is not used here. + nlohmann::json schemas_json = nlohmann::json::array(); + for (const auto& schema : table_metadata.schemas) { + ICEBERG_ASSIGN_OR_RAISE(auto schema_json, ToJson(*schema)); + schemas_json.push_back(std::move(schema_json)); + } + json[kSchemas] = std::move(schemas_json); // for older readers, continue writing the default spec as "partition-spec" if (table_metadata.format_version == 1) { @@ -1032,7 +1040,8 @@ nlohmann::json ToJson(const TableMetadata& table_metadata) { } Result ToJsonString(const TableMetadata& table_metadata) { - return ToJsonString(ToJson(table_metadata)); + ICEBERG_ASSIGN_OR_RAISE(auto json, ToJson(table_metadata)); + return ToJsonString(json); } namespace { @@ -1435,7 +1444,7 @@ Result NamespaceFromJson(const nlohmann::json& json) { return ns; } -nlohmann::json ToJson(const TableUpdate& update) { +Result ToJson(const TableUpdate& update) { nlohmann::json json; switch (update.kind()) { case TableUpdate::Kind::kAssignUUID: { @@ -1454,7 +1463,7 @@ nlohmann::json ToJson(const TableUpdate& update) { const auto& u = internal::checked_cast(update); json[kAction] = kActionAddSchema; if (u.schema()) { - json[kSchema] = ToJson(*u.schema()); + ICEBERG_ASSIGN_OR_RAISE(json[kSchema], ToJson(*u.schema())); } else { json[kSchema] = nlohmann::json::value_t::null; } diff --git a/src/iceberg/json_serde_internal.h b/src/iceberg/json_serde_internal.h index 351b41086..1a30e8e8f 100644 --- a/src/iceberg/json_serde_internal.h +++ b/src/iceberg/json_serde_internal.h @@ -93,7 +93,7 @@ ICEBERG_EXPORT Result> SortOrderFromJson( /// /// \param schema The Iceberg schema to convert. /// \return The JSON representation of the schema. -ICEBERG_EXPORT nlohmann::json ToJson(const Schema& schema); +ICEBERG_EXPORT Result ToJson(const Schema& schema); /// \brief Convert an Iceberg Schema to JSON. /// @@ -111,7 +111,7 @@ ICEBERG_EXPORT Result> SchemaFromJson(const nlohmann::js /// /// \param type The Iceberg type to convert. /// \return The JSON representation of the type. -ICEBERG_EXPORT nlohmann::json ToJson(const Type& type); +ICEBERG_EXPORT Result ToJson(const Type& type); /// \brief Convert JSON to an Iceberg Type. /// @@ -123,7 +123,7 @@ ICEBERG_EXPORT Result> TypeFromJson(const nlohmann::json& /// /// \param field The Iceberg field to convert. /// \return The JSON representation of the field. -ICEBERG_EXPORT nlohmann::json ToJson(const SchemaField& field); +ICEBERG_EXPORT Result ToJson(const SchemaField& field); /// \brief Convert JSON to an Iceberg SchemaField. /// @@ -300,7 +300,7 @@ ICEBERG_EXPORT Result EncryptedKeyFromJson(const nlohmann::json& j /// /// \param table_metadata The `TableMetadata` object to be serialized. /// \return A JSON object representing the `TableMetadata`. -ICEBERG_EXPORT nlohmann::json ToJson(const TableMetadata& table_metadata); +ICEBERG_EXPORT Result ToJson(const TableMetadata& table_metadata); /// \brief Serializes a `TableMetadata` object to JSON. /// @@ -404,7 +404,7 @@ ICEBERG_EXPORT Result NamespaceFromJson(const nlohmann::json& json); /// /// \param update The `TableUpdate` object to be serialized. /// \return A JSON object representing the `TableUpdate`. -ICEBERG_EXPORT nlohmann::json ToJson(const TableUpdate& update); +ICEBERG_EXPORT Result ToJson(const TableUpdate& update); /// \brief Deserializes a JSON object into a `TableUpdate` object. /// diff --git a/src/iceberg/table_metadata.cc b/src/iceberg/table_metadata.cc index 53e3c231d..ef2f1bf38 100644 --- a/src/iceberg/table_metadata.cc +++ b/src/iceberg/table_metadata.cc @@ -489,7 +489,7 @@ Result TableMetadataUtil::Write(FileIO& io, const TableMetadata* ba Status TableMetadataUtil::Write(FileIO& io, const std::string& location, const TableMetadata& metadata) { - auto json = ToJson(metadata); + ICEBERG_ASSIGN_OR_RAISE(auto json, ToJson(metadata)); ICEBERG_ASSIGN_OR_RAISE(auto json_string, ToJsonString(json)); return io.WriteFile(location, json_string); } diff --git a/src/iceberg/test/json_serde_test.cc b/src/iceberg/test/json_serde_test.cc index 3951a806c..c97ed64a6 100644 --- a/src/iceberg/test/json_serde_test.cc +++ b/src/iceberg/test/json_serde_test.cc @@ -93,6 +93,19 @@ void TestJsonConversion(const T& obj, const nlohmann::json& expected_json) { EXPECT_EQ(obj, *obj_ex.value()) << "Deserialized object mismatch."; } +// ToJson(SchemaField) returns Result, so it cannot use the shared +// TestJsonConversion helper. Unwrap the serialized json before comparing and +// round-tripping. +void TestSchemaFieldJsonConversion(const SchemaField& field, + const nlohmann::json& expected_json) { + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(field)); + EXPECT_EQ(expected_json, json) << "JSON conversion mismatch."; + + auto obj_ex = FieldFromJson(expected_json); + EXPECT_TRUE(obj_ex.has_value()) << "Failed to deserialize JSON."; + EXPECT_EQ(field, *obj_ex.value()) << "Deserialized object mismatch."; +} + } // namespace // Pins the wire format produced by ToJson(SchemaField) / FieldFromJson for @@ -103,7 +116,7 @@ TEST(JsonInternalTest, SchemaFieldDefaultValues) { SchemaField with_both(/*field_id=*/1, "id", int32(), /*optional=*/false, /*doc=*/{}, std::make_shared(Literal::Int(42)), std::make_shared(Literal::Int(7))); - TestJsonConversion( + TestSchemaFieldJsonConversion( with_both, R"({"id":1,"name":"id","required":true,"type":"int","initial-default":42,"write-default":7})"_json); @@ -112,14 +125,14 @@ TEST(JsonInternalTest, SchemaFieldDefaultValues) { /*doc=*/{}, std::make_shared(Literal::String("n/a")), /*write_default=*/nullptr); - TestJsonConversion( + TestSchemaFieldJsonConversion( initial_only, R"({"id":2,"name":"name","required":false,"type":"string","initial-default":"n/a"})"_json); // No defaults; neither key may appear. SchemaField no_defaults(/*field_id=*/3, "plain", int32(), /*optional=*/false); - TestJsonConversion(no_defaults, - R"({"id":3,"name":"plain","required":true,"type":"int"})"_json); + TestSchemaFieldJsonConversion( + no_defaults, R"({"id":3,"name":"plain","required":true,"type":"int"})"_json); } // Round-trips a field carrying both defaults through ToJson -> FieldFromJson for @@ -152,7 +165,7 @@ TEST(JsonInternalTest, SchemaFieldDefaultValuesRoundTripAllTypes) { SchemaField field(field_id++, "f", type, /*optional=*/false, /*doc=*/{}, std::make_shared(literal), std::make_shared(literal)); - auto json = ToJson(field); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(field)); ICEBERG_UNWRAP_OR_FAIL(auto parsed, FieldFromJson(json)); EXPECT_EQ(field, *parsed) << "round-trip mismatch for type " << type->ToString() << ", json=" << json.dump(); @@ -415,7 +428,8 @@ TEST(JsonInternalTest, TableUpdateAssignUUID) { nlohmann::json expected = R"({"action":"assign-uuid","uuid":"550e8400-e29b-41d4-a716-446655440000"})"_json; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ(*internal::checked_cast(parsed.value().get()), update); @@ -426,7 +440,8 @@ TEST(JsonInternalTest, TableUpdateUpgradeFormatVersion) { nlohmann::json expected = R"({"action":"upgrade-format-version","format-version":2})"_json; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ(*internal::checked_cast(parsed.value().get()), @@ -440,7 +455,7 @@ TEST(JsonInternalTest, TableUpdateAddSchema) { /*schema_id=*/1); table::AddSchema update(schema, 2); - auto json = ToJson(update); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); EXPECT_EQ(json["action"], "add-schema"); EXPECT_EQ(json["last-column-id"], 2); EXPECT_TRUE(json.contains("schema")); @@ -457,9 +472,10 @@ TEST(JsonInternalTest, TableUpdateAddSchemaWithoutDeprecatedLastColumnId) { std::vector{SchemaField(1, "id", int64(), false), SchemaField(3, "name", string(), true)}, /*schema_id=*/1); + ICEBERG_UNWRAP_OR_FAIL(auto schema_json, ToJson(*schema)); nlohmann::json json = { {"action", "add-schema"}, - {"schema", ToJson(*schema)}, + {"schema", schema_json}, }; auto parsed = TableUpdateFromJson(json); @@ -473,7 +489,8 @@ TEST(JsonInternalTest, TableUpdateSetCurrentSchema) { table::SetCurrentSchema update(1); nlohmann::json expected = R"({"action":"set-current-schema","schema-id":1})"_json; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ(*internal::checked_cast(parsed.value().get()), @@ -487,7 +504,7 @@ TEST(JsonInternalTest, TableUpdateAddPartitionSpec) { PartitionSpec::Make(1, {PartitionField(3, 101, "region", identity_transform)})); table::AddPartitionSpec update(std::move(spec)); - auto json = ToJson(update); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); EXPECT_EQ(json["action"], "add-spec"); EXPECT_TRUE(json.contains("spec")); @@ -501,7 +518,8 @@ TEST(JsonInternalTest, TableUpdateSetDefaultPartitionSpec) { table::SetDefaultPartitionSpec update(2); nlohmann::json expected = R"({"action":"set-default-spec","spec-id":2})"_json; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ( @@ -514,7 +532,8 @@ TEST(JsonInternalTest, TableUpdateRemovePartitionSpecs) { nlohmann::json expected = R"({"action":"remove-partition-specs","spec-ids":[1,2,3]})"_json; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ(*internal::checked_cast(parsed.value().get()), @@ -524,7 +543,7 @@ TEST(JsonInternalTest, TableUpdateRemovePartitionSpecs) { TEST(JsonInternalTest, TableUpdateRemoveSchemas) { table::RemoveSchemas update({1, 2}); - auto json = ToJson(update); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); EXPECT_EQ(json["action"], "remove-schemas"); EXPECT_THAT(json["schema-ids"].get>(), testing::UnorderedElementsAre(1, 2)); @@ -540,7 +559,7 @@ TEST(JsonInternalTest, TableUpdateAddSortOrder) { ICEBERG_UNWRAP_OR_FAIL(auto sort_order, SortOrder::Make(1, {st})); table::AddSortOrder update(std::move(sort_order)); - auto json = ToJson(update); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); EXPECT_EQ(json["action"], "add-sort-order"); EXPECT_TRUE(json.contains("sort-order")); @@ -555,7 +574,8 @@ TEST(JsonInternalTest, TableUpdateSetDefaultSortOrder) { nlohmann::json expected = R"({"action":"set-default-sort-order","sort-order-id":1})"_json; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ(*internal::checked_cast(parsed.value().get()), @@ -573,7 +593,7 @@ TEST(JsonInternalTest, TableUpdateAddSnapshot) { .schema_id = 1}); table::AddSnapshot update(snapshot); - auto json = ToJson(update); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); EXPECT_EQ(json["action"], "add-snapshot"); EXPECT_TRUE(json.contains("snapshot")); @@ -588,7 +608,8 @@ TEST(JsonInternalTest, TableUpdateRemoveSnapshots) { nlohmann::json expected = R"({"action":"remove-snapshots","snapshot-ids":[111,222,333]})"_json; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ(*internal::checked_cast(parsed.value().get()), @@ -600,7 +621,8 @@ TEST(JsonInternalTest, TableUpdateRemoveSnapshotRef) { nlohmann::json expected = R"({"action":"remove-snapshot-ref","ref-name":"my-branch"})"_json; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ(*internal::checked_cast(parsed.value().get()), @@ -611,7 +633,7 @@ TEST(JsonInternalTest, TableUpdateSetSnapshotRefBranch) { table::SetSnapshotRef update("main", 123456789, SnapshotRefType::kBranch, 5, 86400000, 604800000); - auto json = ToJson(update); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); EXPECT_EQ(json["action"], "set-snapshot-ref"); EXPECT_EQ(json["ref-name"], "main"); EXPECT_EQ(json["snapshot-id"], 123456789); @@ -626,7 +648,7 @@ TEST(JsonInternalTest, TableUpdateSetSnapshotRefBranch) { TEST(JsonInternalTest, TableUpdateSetSnapshotRefTag) { table::SetSnapshotRef update("release-1.0", 987654321, SnapshotRefType::kTag); - auto json = ToJson(update); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); EXPECT_EQ(json["action"], "set-snapshot-ref"); EXPECT_EQ(json["type"], "tag"); @@ -639,7 +661,7 @@ TEST(JsonInternalTest, TableUpdateSetSnapshotRefTag) { TEST(JsonInternalTest, TableUpdateSetProperties) { table::SetProperties update({{"key1", "value1"}, {"key2", "value2"}}); - auto json = ToJson(update); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); EXPECT_EQ(json["action"], "set-properties"); EXPECT_TRUE(json.contains("updates")); @@ -668,7 +690,7 @@ TEST(JsonInternalTest, TableUpdateSetPropertiesMissingCanonicalField) { TEST(JsonInternalTest, TableUpdateRemoveProperties) { table::RemoveProperties update({"key1", "key2"}); - auto json = ToJson(update); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); EXPECT_EQ(json["action"], "remove-properties"); EXPECT_TRUE(json.contains("removals")); @@ -700,7 +722,8 @@ TEST(JsonInternalTest, TableUpdateSetLocation) { nlohmann::json expected = R"({"action":"set-location","location":"s3://bucket/warehouse/table"})"_json; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ(*internal::checked_cast(parsed.value().get()), update); @@ -736,7 +759,8 @@ TEST(JsonInternalTest, TableUpdateSetStatistics) { } })"_json; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ(*internal::checked_cast(parsed.value().get()), update); @@ -747,7 +771,8 @@ TEST(JsonInternalTest, TableUpdateRemoveStatistics) { nlohmann::json expected = R"({"action":"remove-statistics","snapshot-id":123456789})"_json; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ(*internal::checked_cast(parsed.value().get()), @@ -771,7 +796,8 @@ TEST(JsonInternalTest, TableUpdateSetPartitionStatistics) { } })"_json; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ(*internal::checked_cast(parsed.value().get()), @@ -783,7 +809,8 @@ TEST(JsonInternalTest, TableUpdateRemovePartitionStatistics) { nlohmann::json expected = R"({"action":"remove-partition-statistics","snapshot-id":123456789})"_json; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ( @@ -809,7 +836,8 @@ TEST(JsonInternalTest, TableUpdateAddEncryptionKey) { {"properties", {{"scope", "table"}}}}}, }; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ(*internal::checked_cast(parsed.value().get()), @@ -828,7 +856,8 @@ TEST(JsonInternalTest, TableUpdateRemoveEncryptionKey) { table::RemoveEncryptionKey update("key-1"); nlohmann::json expected = R"({"action":"remove-encryption-key","key-id":"key-1"})"_json; - EXPECT_EQ(ToJson(update), expected); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(update)); + EXPECT_EQ(json, expected); auto parsed = TableUpdateFromJson(expected); ASSERT_THAT(parsed, IsOk()); EXPECT_EQ(*internal::checked_cast(parsed.value().get()), diff --git a/src/iceberg/test/metadata_serde_test.cc b/src/iceberg/test/metadata_serde_test.cc index 48f88f2bc..acda5e068 100644 --- a/src/iceberg/test/metadata_serde_test.cc +++ b/src/iceberg/test/metadata_serde_test.cc @@ -534,7 +534,7 @@ TEST(MetadataSerdeTest, EncryptionKeysRoundTrip) { EXPECT_EQ(metadata.value()->encryption_keys[0].encrypted_key_metadata, "secret-key-metadata"); - auto serialized = ToJson(*metadata.value()); + ICEBERG_UNWRAP_OR_FAIL(auto serialized, ToJson(*metadata.value())); ASSERT_TRUE(serialized.contains("encryption-keys")); EXPECT_EQ(serialized["encryption-keys"], metadata_json["encryption-keys"]); } diff --git a/src/iceberg/test/schema_json_test.cc b/src/iceberg/test/schema_json_test.cc index 71fc474f4..cf7bdb647 100644 --- a/src/iceberg/test/schema_json_test.cc +++ b/src/iceberg/test/schema_json_test.cc @@ -43,8 +43,8 @@ class TypeJsonTest : public ::testing::TestWithParam {}; TEST_P(TypeJsonTest, SingleTypeRoundTrip) { // To Json const auto& param = GetParam(); - auto json = ToJson(*param.type).dump(); - ASSERT_EQ(param.json, json); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(*param.type)); + ASSERT_EQ(param.json, json.dump()); // From Json auto type_result = TypeFromJson(nlohmann::json::parse(param.json)); @@ -134,8 +134,8 @@ TEST(SchemaJsonTest, RoundTrip) { ASSERT_EQ(field2.type()->type_id(), TypeId::kString); ASSERT_TRUE(field2.optional()); - auto dumped_json = ToJson(*schema).dump(); - ASSERT_EQ(dumped_json, json); + ICEBERG_UNWRAP_OR_FAIL(auto schema_json, ToJson(*schema)); + ASSERT_EQ(schema_json.dump(), json); } TEST(SchemaJsonTest, FieldWithDefaultValuesRoundTrip) { @@ -156,7 +156,8 @@ TEST(SchemaJsonTest, FieldWithDefaultValuesRoundTrip) { ASSERT_EQ(field2.initial_default()->get(), Literal::String("n/a")); ASSERT_FALSE(field2.write_default().has_value()); - ASSERT_EQ(ToJson(*schema).dump(), json); + ICEBERG_UNWRAP_OR_FAIL(auto schema_json, ToJson(*schema)); + ASSERT_EQ(schema_json.dump(), json); } TEST(SchemaJsonTest, FieldWithMismatchedDefaultValueFails) { @@ -179,7 +180,8 @@ TEST(SchemaJsonTest, NestedFieldWithDefaultValuesRoundTrip) { ASSERT_TRUE(nested.write_default().has_value()); ASSERT_EQ(nested.write_default()->get(), Literal::Int(21)); - ASSERT_EQ(ToJson(*schema).dump(), json); + ICEBERG_UNWRAP_OR_FAIL(auto schema_json, ToJson(*schema)); + ASSERT_EQ(schema_json.dump(), json); } TEST(SchemaJsonTest, UnknownFieldRoundTrip) { @@ -194,7 +196,8 @@ TEST(SchemaJsonTest, UnknownFieldRoundTrip) { ASSERT_EQ(field.name(), "mystery"); ASSERT_EQ(field.type()->type_id(), TypeId::kUnknown); ASSERT_TRUE(field.optional()); - ASSERT_EQ(ToJson(*schema).dump(), json); + ICEBERG_UNWRAP_OR_FAIL(auto schema_json, ToJson(*schema)); + ASSERT_EQ(schema_json.dump(), json); } TEST(SchemaJsonTest, NestedUnknownFieldsRoundTrip) { @@ -261,7 +264,8 @@ TEST(SchemaJsonTest, NestedUnknownFieldsRoundTrip) { ASSERT_EQ(properties->value().type()->type_id(), TypeId::kUnknown); ASSERT_TRUE(properties->value().optional()); - ASSERT_EQ(ToJson(*schema), parsed_json); + ICEBERG_UNWRAP_OR_FAIL(auto schema_json, ToJson(*schema)); + ASSERT_EQ(schema_json, parsed_json); } TEST(SchemaJsonTest, IdentifierFieldIds) { @@ -280,7 +284,8 @@ TEST(SchemaJsonTest, IdentifierFieldIds) { ASSERT_EQ(schema_with_identifers->schema_id(), 1); ASSERT_EQ(schema_with_identifers->IdentifierFieldIds().size(), 1); ASSERT_EQ(schema_with_identifers->IdentifierFieldIds()[0], 1); - ASSERT_EQ(ToJson(*schema_with_identifers), json_with_identifiers); + ICEBERG_UNWRAP_OR_FAIL(auto json_with_identifiers_out, ToJson(*schema_with_identifers)); + ASSERT_EQ(json_with_identifiers_out, json_with_identifiers); // Test schema without identifier-field-ids constexpr std::string_view json_without_identifiers_str = @@ -293,7 +298,9 @@ TEST(SchemaJsonTest, IdentifierFieldIds) { ICEBERG_UNWRAP_OR_FAIL(auto schema_without_identifiers, SchemaFromJson(json_without_identifiers)); ASSERT_TRUE(schema_without_identifiers->IdentifierFieldIds().empty()); - ASSERT_EQ(ToJson(*schema_without_identifiers), json_without_identifiers); + ICEBERG_UNWRAP_OR_FAIL(auto json_without_identifiers_out, + ToJson(*schema_without_identifiers)); + ASSERT_EQ(json_without_identifiers_out, json_without_identifiers); // Test schema with multiple identifier fields constexpr std::string_view json_multi_identifiers_str = @@ -309,7 +316,9 @@ TEST(SchemaJsonTest, IdentifierFieldIds) { ASSERT_EQ(schema_multi_identifiers->IdentifierFieldIds().size(), 2); ASSERT_EQ(schema_multi_identifiers->IdentifierFieldIds()[0], 1); ASSERT_EQ(schema_multi_identifiers->IdentifierFieldIds()[1], 2); - ASSERT_EQ(ToJson(*schema_multi_identifiers), json_multi_identifiers); + ICEBERG_UNWRAP_OR_FAIL(auto json_multi_identifiers_out, + ToJson(*schema_multi_identifiers)); + ASSERT_EQ(json_multi_identifiers_out, json_multi_identifiers); } } // namespace iceberg From 36a76ca87eab6db670133c65d5cabd4a59b6748b Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Fri, 26 Jun 2026 11:55:05 -0700 Subject: [PATCH 11/12] refactor(serde): avoid duplicate macro for fallible REST ToJson Extract the FromJson declarations into ICEBERG_DECLARE_FROM_JSON so the four schema/metadata-bearing models declare a Result-returning ToJson without duplicating the whole serde macro. --- .../catalog/rest/json_serde_internal.h | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/src/iceberg/catalog/rest/json_serde_internal.h b/src/iceberg/catalog/rest/json_serde_internal.h index ba05704c6..5b7c90fa0 100644 --- a/src/iceberg/catalog/rest/json_serde_internal.h +++ b/src/iceberg/catalog/rest/json_serde_internal.h @@ -38,23 +38,16 @@ namespace iceberg::rest { template Result FromJson(const nlohmann::json& json); -#define ICEBERG_DECLARE_JSON_SERDE(Model) \ +// Declares the two FromJson entry points for a model. +#define ICEBERG_DECLARE_FROM_JSON(Model) \ ICEBERG_REST_EXPORT Result Model##FromJson(const nlohmann::json& json); \ \ template <> \ - ICEBERG_REST_EXPORT Result FromJson(const nlohmann::json& json); \ - \ - ICEBERG_REST_EXPORT nlohmann::json ToJson(const Model& model); + ICEBERG_REST_EXPORT Result FromJson(const nlohmann::json& json); -// Same as ICEBERG_DECLARE_JSON_SERDE, but ToJson returns Result because the model -// embeds a Schema/TableMetadata whose default-value serialization can fail. -#define ICEBERG_DECLARE_JSON_SERDE_FALLIBLE(Model) \ - ICEBERG_REST_EXPORT Result Model##FromJson(const nlohmann::json& json); \ - \ - template <> \ - ICEBERG_REST_EXPORT Result FromJson(const nlohmann::json& json); \ - \ - ICEBERG_REST_EXPORT Result ToJson(const Model& model); +#define ICEBERG_DECLARE_JSON_SERDE(Model) \ + ICEBERG_DECLARE_FROM_JSON(Model) \ + ICEBERG_REST_EXPORT nlohmann::json ToJson(const Model& model); /// \note Don't forget to add `ICEBERG_DEFINE_FROM_JSON` to the end of /// `json_internal.cc` to define the `FromJson` function for the model. @@ -67,16 +60,23 @@ ICEBERG_DECLARE_JSON_SERDE(GetNamespaceResponse) ICEBERG_DECLARE_JSON_SERDE(UpdateNamespacePropertiesRequest) ICEBERG_DECLARE_JSON_SERDE(UpdateNamespacePropertiesResponse) ICEBERG_DECLARE_JSON_SERDE(ListTablesResponse) -ICEBERG_DECLARE_JSON_SERDE_FALLIBLE(LoadTableResult) ICEBERG_DECLARE_JSON_SERDE(RegisterTableRequest) ICEBERG_DECLARE_JSON_SERDE(RenameTableRequest) -ICEBERG_DECLARE_JSON_SERDE_FALLIBLE(CreateTableRequest) -ICEBERG_DECLARE_JSON_SERDE_FALLIBLE(CommitTableRequest) -ICEBERG_DECLARE_JSON_SERDE_FALLIBLE(CommitTableResponse) ICEBERG_DECLARE_JSON_SERDE(OAuthTokenResponse) +// These models embed a Schema/TableMetadata whose default-value serialization can +// fail, so their ToJson returns Result; FromJson is declared like the others. +ICEBERG_DECLARE_FROM_JSON(LoadTableResult) +ICEBERG_REST_EXPORT Result ToJson(const LoadTableResult& model); +ICEBERG_DECLARE_FROM_JSON(CreateTableRequest) +ICEBERG_REST_EXPORT Result ToJson(const CreateTableRequest& model); +ICEBERG_DECLARE_FROM_JSON(CommitTableRequest) +ICEBERG_REST_EXPORT Result ToJson(const CommitTableRequest& model); +ICEBERG_DECLARE_FROM_JSON(CommitTableResponse) +ICEBERG_REST_EXPORT Result ToJson(const CommitTableResponse& model); + +#undef ICEBERG_DECLARE_FROM_JSON #undef ICEBERG_DECLARE_JSON_SERDE -#undef ICEBERG_DECLARE_JSON_SERDE_FALLIBLE ICEBERG_REST_EXPORT Result PlanTableScanResponseFromJson( const nlohmann::json& json, From 7f49ebc743250e2e3e3d96abd9ca92dec479e45b Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Fri, 26 Jun 2026 12:52:08 -0700 Subject: [PATCH 12/12] refactor(serde): declare fallible REST ToJson inline Keep ICEBERG_DECLARE_JSON_SERDE identical to upstream and declare the four schema/metadata-bearing models (LoadTableResult, CreateTableRequest, CommitTableRequest, CommitTableResponse) explicitly, since only their ToJson return type (Result) differs. --- .../catalog/rest/json_serde_internal.h | 41 ++++++++++++------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/src/iceberg/catalog/rest/json_serde_internal.h b/src/iceberg/catalog/rest/json_serde_internal.h index 5b7c90fa0..efe81a4cf 100644 --- a/src/iceberg/catalog/rest/json_serde_internal.h +++ b/src/iceberg/catalog/rest/json_serde_internal.h @@ -38,15 +38,12 @@ namespace iceberg::rest { template Result FromJson(const nlohmann::json& json); -// Declares the two FromJson entry points for a model. -#define ICEBERG_DECLARE_FROM_JSON(Model) \ +#define ICEBERG_DECLARE_JSON_SERDE(Model) \ ICEBERG_REST_EXPORT Result Model##FromJson(const nlohmann::json& json); \ \ template <> \ - ICEBERG_REST_EXPORT Result FromJson(const nlohmann::json& json); - -#define ICEBERG_DECLARE_JSON_SERDE(Model) \ - ICEBERG_DECLARE_FROM_JSON(Model) \ + ICEBERG_REST_EXPORT Result FromJson(const nlohmann::json& json); \ + \ ICEBERG_REST_EXPORT nlohmann::json ToJson(const Model& model); /// \note Don't forget to add `ICEBERG_DEFINE_FROM_JSON` to the end of @@ -64,19 +61,33 @@ ICEBERG_DECLARE_JSON_SERDE(RegisterTableRequest) ICEBERG_DECLARE_JSON_SERDE(RenameTableRequest) ICEBERG_DECLARE_JSON_SERDE(OAuthTokenResponse) -// These models embed a Schema/TableMetadata whose default-value serialization can -// fail, so their ToJson returns Result; FromJson is declared like the others. -ICEBERG_DECLARE_FROM_JSON(LoadTableResult) +#undef ICEBERG_DECLARE_JSON_SERDE + +// These models embed a Schema/TableMetadata whose default-value serialization can fail, +// so their ToJson returns Result. FromJson is declared like the macro-based models above. +ICEBERG_REST_EXPORT Result LoadTableResultFromJson( + const nlohmann::json& json); +template <> +ICEBERG_REST_EXPORT Result FromJson(const nlohmann::json& json); ICEBERG_REST_EXPORT Result ToJson(const LoadTableResult& model); -ICEBERG_DECLARE_FROM_JSON(CreateTableRequest) + +ICEBERG_REST_EXPORT Result CreateTableRequestFromJson( + const nlohmann::json& json); +template <> +ICEBERG_REST_EXPORT Result FromJson(const nlohmann::json& json); ICEBERG_REST_EXPORT Result ToJson(const CreateTableRequest& model); -ICEBERG_DECLARE_FROM_JSON(CommitTableRequest) + +ICEBERG_REST_EXPORT Result CommitTableRequestFromJson( + const nlohmann::json& json); +template <> +ICEBERG_REST_EXPORT Result FromJson(const nlohmann::json& json); ICEBERG_REST_EXPORT Result ToJson(const CommitTableRequest& model); -ICEBERG_DECLARE_FROM_JSON(CommitTableResponse) -ICEBERG_REST_EXPORT Result ToJson(const CommitTableResponse& model); -#undef ICEBERG_DECLARE_FROM_JSON -#undef ICEBERG_DECLARE_JSON_SERDE +ICEBERG_REST_EXPORT Result CommitTableResponseFromJson( + const nlohmann::json& json); +template <> +ICEBERG_REST_EXPORT Result FromJson(const nlohmann::json& json); +ICEBERG_REST_EXPORT Result ToJson(const CommitTableResponse& model); ICEBERG_REST_EXPORT Result PlanTableScanResponseFromJson( const nlohmann::json& json,