diff --git a/src/iceberg/avro/avro_schema_util.cc b/src/iceberg/avro/avro_schema_util.cc index c26e2bbc9..14b464cee 100644 --- a/src/iceberg/avro/avro_schema_util.cc +++ b/src/iceberg/avro/avro_schema_util.cc @@ -248,6 +248,18 @@ Status ToAvroNodeVisitor::Visit(const UnknownType&, ::avro::NodePtr* node) { return {}; } +Status ToAvroNodeVisitor::Visit(const VariantType&, ::avro::NodePtr*) { + return NotSupported("Writing Iceberg variant type to Avro is not supported"); +} + +Status ToAvroNodeVisitor::Visit(const GeometryType&, ::avro::NodePtr*) { + return NotSupported("Writing Iceberg geometry type to Avro is not supported"); +} + +Status ToAvroNodeVisitor::Visit(const GeographyType&, ::avro::NodePtr*) { + return NotSupported("Writing Iceberg geography type to Avro is not supported"); +} + Status ToAvroNodeVisitor::Visit(const StructType& type, ::avro::NodePtr* node) { *node = std::make_shared<::avro::NodeRecord>(); @@ -631,6 +643,11 @@ Status ValidateAvroSchemaEvolution(const Type& expected_type, break; case TypeId::kUnknown: return {}; + case TypeId::kVariant: + case TypeId::kGeometry: + case TypeId::kGeography: + return NotSupported("Reading Iceberg type {} from Avro is not supported", + expected_type); default: break; } diff --git a/src/iceberg/avro/avro_schema_util_internal.h b/src/iceberg/avro/avro_schema_util_internal.h index a5bfb989e..342b119a5 100644 --- a/src/iceberg/avro/avro_schema_util_internal.h +++ b/src/iceberg/avro/avro_schema_util_internal.h @@ -59,6 +59,9 @@ class ToAvroNodeVisitor { Status Visit(const FixedType& type, ::avro::NodePtr* node); Status Visit(const BinaryType& type, ::avro::NodePtr* node); Status Visit(const UnknownType&, ::avro::NodePtr*); + Status Visit(const VariantType&, ::avro::NodePtr*); + Status Visit(const GeometryType&, ::avro::NodePtr*); + Status Visit(const GeographyType&, ::avro::NodePtr*); Status Visit(const StructType& type, ::avro::NodePtr* node); Status Visit(const ListType& type, ::avro::NodePtr* node); Status Visit(const MapType& type, ::avro::NodePtr* node); diff --git a/src/iceberg/delete_file_index.cc b/src/iceberg/delete_file_index.cc index 7c8c35032..8c58e861b 100644 --- a/src/iceberg/delete_file_index.cc +++ b/src/iceberg/delete_file_index.cc @@ -56,7 +56,7 @@ Status EqualityDeleteFile::ConvertBoundsIfNeeded() const { } const auto& schema_field = field.value().get(); - if (schema_field.type()->is_nested()) { + if (!schema_field.type()->is_primitive()) { continue; } @@ -103,7 +103,7 @@ Result CanContainEqDeletesForFile(const DataFile& data_file, } const auto& field = found_field.value().get(); - if (field.type()->is_nested()) { + if (!field.type()->is_primitive()) { continue; } diff --git a/src/iceberg/json_serde.cc b/src/iceberg/json_serde.cc index 1f2b8f45c..e137aed1d 100644 --- a/src/iceberg/json_serde.cc +++ b/src/iceberg/json_serde.cc @@ -389,6 +389,12 @@ nlohmann::json ToJson(const Type& type) { return "uuid"; case TypeId::kUnknown: return "unknown"; + case TypeId::kVariant: + return "variant"; + case TypeId::kGeometry: + return type.ToString(); + case TypeId::kGeography: + return type.ToString(); } std::unreachable(); } @@ -459,9 +465,10 @@ Result> ListTypeFromJson(const nlohmann::json& json) { ICEBERG_ASSIGN_OR_RAISE(auto element_required, GetJsonValue(json, kElementRequired)); - return std::make_unique( - SchemaField(element_id, std::string(ListType::kElementName), - std::move(element_type), !element_required)); + ICEBERG_ASSIGN_OR_RAISE(auto type, ListType::Make(SchemaField( + element_id, std::string(ListType::kElementName), + std::move(element_type), !element_required))); + return std::unique_ptr(std::move(type)); } Result> MapTypeFromJson(const nlohmann::json& json) { @@ -478,79 +485,126 @@ Result> MapTypeFromJson(const nlohmann::json& json) { /*optional=*/false); SchemaField value_field(value_id, std::string(MapType::kValueName), std::move(value_type), !value_required); - return std::make_unique(std::move(key_field), std::move(value_field)); + ICEBERG_ASSIGN_OR_RAISE(auto type, + MapType::Make(std::move(key_field), std::move(value_field))); + return std::unique_ptr(std::move(type)); } } // namespace Result> TypeFromJson(const nlohmann::json& json) { if (json.is_string()) { - std::string type_str = json.get(); - if (type_str == "boolean") { + const auto type_name = json.get(); + const auto normalized_type_name = StringUtils::ToLower(type_name); + if (normalized_type_name == "boolean") { return std::make_unique(); - } else if (type_str == "int") { + } else if (normalized_type_name == "int") { return std::make_unique(); - } else if (type_str == "long") { + } else if (normalized_type_name == "long") { return std::make_unique(); - } else if (type_str == "float") { + } else if (normalized_type_name == "float") { return std::make_unique(); - } else if (type_str == "double") { + } else if (normalized_type_name == "double") { return std::make_unique(); - } else if (type_str == "date") { + } else if (normalized_type_name == "date") { return std::make_unique(); - } else if (type_str == "time") { + } else if (normalized_type_name == "time") { return std::make_unique(); - } else if (type_str == "timestamp") { + } else if (normalized_type_name == "timestamp") { return std::make_unique(); - } else if (type_str == "timestamptz") { + } else if (normalized_type_name == "timestamptz") { return std::make_unique(); - } else if (type_str == "timestamp_ns") { + } else if (normalized_type_name == "timestamp_ns") { return std::make_unique(); - } else if (type_str == "timestamptz_ns") { + } else if (normalized_type_name == "timestamptz_ns") { return std::make_unique(); - } else if (type_str == "string") { + } else if (normalized_type_name == "string") { return std::make_unique(); - } else if (type_str == "binary") { + } else if (normalized_type_name == "binary") { return std::make_unique(); - } else if (type_str == "uuid") { + } else if (normalized_type_name == "uuid") { return std::make_unique(); - } else if (type_str == "unknown") { + } else if (normalized_type_name == "unknown") { return std::make_unique(); - } else if (type_str.starts_with("fixed")) { - std::regex fixed_regex(R"(fixed\[\s*(\d+)\s*\])"); + } else if (normalized_type_name == "variant") { + return std::make_unique(); + } else if (normalized_type_name.starts_with("fixed")) { + static const std::regex kFixedRegex(R"(fixed\[\s*(\d+)\s*\])"); std::smatch match; - if (std::regex_match(type_str, match, fixed_regex)) { + if (std::regex_match(normalized_type_name, match, kFixedRegex)) { ICEBERG_ASSIGN_OR_RAISE(auto length, StringUtils::ParseNumber(match[1].str())); return std::make_unique(length); } - return JsonParseError("Invalid fixed type: {}", type_str); - } else if (type_str.starts_with("decimal")) { - std::regex decimal_regex(R"(decimal\(\s*(\d+)\s*,\s*(\d+)\s*\))"); + return JsonParseError("Invalid fixed type: {}", type_name); + } else if (normalized_type_name.starts_with("decimal")) { + static const std::regex kDecimalRegex(R"(decimal\(\s*(\d+)\s*,\s*(\d+)\s*\))"); std::smatch match; - if (std::regex_match(type_str, match, decimal_regex)) { + if (std::regex_match(normalized_type_name, match, kDecimalRegex)) { ICEBERG_ASSIGN_OR_RAISE(auto precision, StringUtils::ParseNumber(match[1].str())); ICEBERG_ASSIGN_OR_RAISE(auto scale, StringUtils::ParseNumber(match[2].str())); return std::make_unique(precision, scale); } - return JsonParseError("Invalid decimal type: {}", type_str); + return JsonParseError("Invalid decimal type: {}", type_name); + } else if (normalized_type_name.starts_with("geometry")) { + static const std::regex kGeometryRegex(R"(geometry\s*(?:\(\s*([^)]*?)\s*\))?)", + std::regex_constants::icase); + std::smatch match; + if (std::regex_match(type_name, match, kGeometryRegex)) { + if (match[1].matched) { + auto crs = match[1].str(); + if (crs.empty()) { + return JsonParseError("Invalid geometry type: {}", type_name); + } + ICEBERG_ASSIGN_OR_RAISE(auto type, GeometryType::Make(std::move(crs))); + return std::unique_ptr(std::move(type)); + } + ICEBERG_ASSIGN_OR_RAISE(auto type, GeometryType::Make()); + return std::unique_ptr(std::move(type)); + } + return JsonParseError("Invalid geometry type: {}", type_name); + } else if (normalized_type_name.starts_with("geography")) { + static const std::regex kGeographyRegex( + R"(geography\s*(?:\(\s*([^,]*?)\s*(?:,\s*(\w*)\s*)?\))?)", + std::regex_constants::icase); + std::smatch match; + if (std::regex_match(type_name, match, kGeographyRegex)) { + auto crs = match[1].str(); + if (match[1].matched && crs.empty()) { + return JsonParseError("Invalid geography type: {}", type_name); + } + if (match[2].matched) { + ICEBERG_ASSIGN_OR_RAISE(auto algorithm, + EdgeAlgorithmFromString(match[2].str())); + ICEBERG_ASSIGN_OR_RAISE(auto type, + GeographyType::Make(std::move(crs), algorithm)); + return std::unique_ptr(std::move(type)); + } + if (match[1].matched) { + ICEBERG_ASSIGN_OR_RAISE(auto type, GeographyType::Make(std::move(crs))); + return std::unique_ptr(std::move(type)); + } + ICEBERG_ASSIGN_OR_RAISE(auto type, GeographyType::Make()); + return std::unique_ptr(std::move(type)); + } + return JsonParseError("Invalid geography type: {}", type_name); } else { - return JsonParseError("Unknown primitive type: {}", type_str); + return JsonParseError("Cannot parse type string: {}", type_name); } } // For complex types like struct, list, and map - ICEBERG_ASSIGN_OR_RAISE(auto type_str, GetJsonValue(json, kType)); - if (type_str == kStruct) { + ICEBERG_ASSIGN_OR_RAISE(auto complex_type_name, GetJsonValue(json, kType)); + if (complex_type_name == kStruct) { return StructTypeFromJson(json); - } else if (type_str == kList) { + } else if (complex_type_name == kList) { return ListTypeFromJson(json); - } else if (type_str == kMap) { + } else if (complex_type_name == kMap) { return MapTypeFromJson(json); } else { - return JsonParseError("Unknown complex type: {}", type_str); + return JsonParseError("Unknown complex type: {}", complex_type_name); } } diff --git a/src/iceberg/metrics_config.cc b/src/iceberg/metrics_config.cc index 95d1a1fe1..f5e30ace9 100644 --- a/src/iceberg/metrics_config.cc +++ b/src/iceberg/metrics_config.cc @@ -197,6 +197,8 @@ Result> MetricsConfig::LimitFieldIds(const Schema& s Status Visit(const Type& type) { if (type.is_nested()) { return VisitNested(internal::checked_cast(type)); + } else if (type.is_variant()) { + return {}; } else { return VisitPrimitive(internal::checked_cast(type)); } @@ -207,8 +209,7 @@ Result> MetricsConfig::LimitFieldIds(const Schema& s if (!ShouldContinue()) { break; } - // TODO(zhuo.wang): variant type should also be handled here - if (field.type()->is_primitive()) { + if (!field.type()->is_nested()) { ids_.insert(field.field_id()); } } diff --git a/src/iceberg/parquet/parquet_metrics.cc b/src/iceberg/parquet/parquet_metrics.cc index 32dc9fec8..dac9ea6a8 100644 --- a/src/iceberg/parquet/parquet_metrics.cc +++ b/src/iceberg/parquet/parquet_metrics.cc @@ -423,6 +423,10 @@ class CollectMetricsVisitor { Status VisitMap(const MapType& /*type*/, const std::string& /*prefix*/) { return {}; } + Status VisitVariant(const VariantType& /*type*/, const std::string& /*prefix*/) { + return {}; + } + Status VisitPrimitive(const PrimitiveType& /*type*/, const std::string& /*prefix*/) { return {}; } diff --git a/src/iceberg/parquet/parquet_schema_util.cc b/src/iceberg/parquet/parquet_schema_util.cc index 39e321d9f..a5629198f 100644 --- a/src/iceberg/parquet/parquet_schema_util.cc +++ b/src/iceberg/parquet/parquet_schema_util.cc @@ -239,6 +239,11 @@ Status ValidateParquetSchemaEvolution( break; case TypeId::kUnknown: return {}; + case TypeId::kVariant: + case TypeId::kGeometry: + case TypeId::kGeography: + return NotSupported("Reading Iceberg type {} from Parquet is not supported", + expected_type); case TypeId::kStruct: if (arrow_type->id() == ::arrow::Type::STRUCT) { return {}; diff --git a/src/iceberg/parquet/parquet_writer.cc b/src/iceberg/parquet/parquet_writer.cc index c50fb26b1..fea7cd834 100644 --- a/src/iceberg/parquet/parquet_writer.cc +++ b/src/iceberg/parquet/parquet_writer.cc @@ -178,6 +178,10 @@ class FieldMetricsCollector { Status VisitMap(const MapType& /*type*/, const ::arrow::Array& /*array*/) { return {}; } + Status VisitVariant(const VariantType& /*type*/, const ::arrow::Array& /*array*/) { + return {}; + } + Status VisitPrimitive(const PrimitiveType& type, const ::arrow::Array& array) { switch (type.type_id()) { case TypeId::kFloat: diff --git a/src/iceberg/schema_internal.cc b/src/iceberg/schema_internal.cc index c32ceb2a6..792341adf 100644 --- a/src/iceberg/schema_internal.cc +++ b/src/iceberg/schema_internal.cc @@ -19,6 +19,7 @@ #include "iceberg/schema_internal.h" +#include #include #include #include @@ -39,6 +40,33 @@ constexpr const char* kArrowExtensionMetadata = "ARROW:extension:metadata"; constexpr const char* kArrowUuidExtensionName = "arrow.uuid"; constexpr int32_t kUnknownFieldId = -1; +Status CheckArrowCompatible(const Type& type) { + switch (type.type_id()) { + case TypeId::kVariant: + case TypeId::kGeometry: + case TypeId::kGeography: + return NotSupported("Iceberg type {} is not supported by Arrow conversion", + type.ToString()); + case TypeId::kStruct: + for (const auto& field : static_cast(type).fields()) { + ICEBERG_RETURN_UNEXPECTED(CheckArrowCompatible(*field.type())); + } + break; + case TypeId::kList: + ICEBERG_RETURN_UNEXPECTED( + CheckArrowCompatible(*static_cast(type).element().type())); + break; + case TypeId::kMap: { + const auto& map_type = static_cast(type); + ICEBERG_RETURN_UNEXPECTED(CheckArrowCompatible(*map_type.key().type())); + ICEBERG_RETURN_UNEXPECTED(CheckArrowCompatible(*map_type.value().type())); + } break; + default: + break; + } + return {}; +} + // Convert an Iceberg type to Arrow schema. Return value is Nanoarrow error code. ArrowErrorCode ToArrowSchema(const Type& type, bool optional, std::string_view name, std::optional field_id, ArrowSchema* schema) { @@ -153,6 +181,11 @@ ArrowErrorCode ToArrowSchema(const Type& type, bool optional, std::string_view n case TypeId::kUnknown: NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(schema, NANOARROW_TYPE_NA)); break; + case TypeId::kVariant: + case TypeId::kGeometry: + case TypeId::kGeography: + ArrowBufferReset(&metadata_buffer); + return EINVAL; } if (!name.empty()) { @@ -179,6 +212,8 @@ Status ToArrowSchema(const Schema& schema, ArrowSchema* out) { return InvalidArgument("Output Arrow schema cannot be null"); } + ICEBERG_RETURN_UNEXPECTED(CheckArrowCompatible(schema)); + ArrowSchemaInit(out); if (ArrowErrorCode errorCode = ToArrowSchema(schema, /*optional=*/false, /*name=*/"", diff --git a/src/iceberg/table_metadata.h b/src/iceberg/table_metadata.h index 06d636ef5..fd2c27199 100644 --- a/src/iceberg/table_metadata.h +++ b/src/iceberg/table_metadata.h @@ -79,9 +79,8 @@ struct ICEBERG_EXPORT TableMetadata { static constexpr int64_t kInitialRowId = 0; static inline const std::unordered_map kMinFormatVersions = { - {TypeId::kTimestampNs, 3}, - {TypeId::kTimestampTzNs, 3}, - {TypeId::kUnknown, 3}, + {TypeId::kTimestampNs, 3}, {TypeId::kTimestampTzNs, 3}, {TypeId::kUnknown, 3}, + {TypeId::kVariant, 3}, {TypeId::kGeometry, 3}, {TypeId::kGeography, 3}, }; /// An integer version number for the format diff --git a/src/iceberg/test/arrow_test.cc b/src/iceberg/test/arrow_test.cc index 9f8ce86f5..2a7242e71 100644 --- a/src/iceberg/test/arrow_test.cc +++ b/src/iceberg/test/arrow_test.cc @@ -122,6 +122,20 @@ INSTANTIATE_TEST_SUITE_P( ToArrowSchemaParam{.iceberg_type = iceberg::unknown(), .arrow_type = ::arrow::null()})); +TEST(ToArrowSchemaTest, UnsupportedV3Types) { + const std::vector> unsupported_types = { + iceberg::variant(), iceberg::geometry(), iceberg::geography()}; + + for (const auto& unsupported_type : unsupported_types) { + Schema schema( + {SchemaField::MakeOptional(/*field_id=*/1, "unsupported", unsupported_type)}, + /*schema_id=*/0); + ArrowSchema arrow_schema; + ASSERT_THAT(ToArrowSchema(schema, &arrow_schema), + HasErrorMessage("is not supported by Arrow conversion")); + } +} + namespace { void CheckArrowField(const ::arrow::Field& field, ::arrow::Type::type type_id, diff --git a/src/iceberg/test/rest_json_serde_test.cc b/src/iceberg/test/rest_json_serde_test.cc index 7304831c6..50507dd3a 100644 --- a/src/iceberg/test/rest_json_serde_test.cc +++ b/src/iceberg/test/rest_json_serde_test.cc @@ -1078,7 +1078,7 @@ INSTANTIATE_TEST_SUITE_P( CreateTableRequestInvalidParam{ .test_name = "WrongSchemaType", .invalid_json_str = R"({"name":"my_table","schema":"invalid"})", - .expected_error_message = "Unknown primitive type: invalid"}), + .expected_error_message = "Cannot parse type string: invalid"}), [](const ::testing::TestParamInfo& info) { return info.param.test_name; }); diff --git a/src/iceberg/test/schema_json_test.cc b/src/iceberg/test/schema_json_test.cc index 08275a45c..520a1eb70 100644 --- a/src/iceberg/test/schema_json_test.cc +++ b/src/iceberg/test/schema_json_test.cc @@ -65,6 +65,21 @@ INSTANTIATE_TEST_SUITE_P( SchemaJsonParam{.json = "\"binary\"", .type = iceberg::binary()}, SchemaJsonParam{.json = "\"uuid\"", .type = iceberg::uuid()}, SchemaJsonParam{.json = "\"unknown\"", .type = iceberg::unknown()}, + SchemaJsonParam{.json = "\"variant\"", .type = iceberg::variant()}, + SchemaJsonParam{.json = "\"geometry\"", .type = iceberg::geometry()}, + SchemaJsonParam{.json = "\"geometry(srid:4326)\"", + .type = iceberg::geometry("srid:4326")}, + SchemaJsonParam{.json = "\"geography\"", .type = iceberg::geography()}, + SchemaJsonParam{.json = "\"geography(srid:4326)\"", + .type = iceberg::geography("srid:4326")}, + SchemaJsonParam{ + .json = "\"geography(srid:4326, spherical)\"", + .type = iceberg::geography("srid:4326", EdgeAlgorithm::kSpherical)}, + SchemaJsonParam{ + .json = "\"geography(OGC:CRS84, spherical)\"", + .type = iceberg::geography("OGC:CRS84", EdgeAlgorithm::kSpherical)}, + SchemaJsonParam{.json = "\"geography(srid:4326, karney)\"", + .type = iceberg::geography("srid:4326", EdgeAlgorithm::kKarney)}, SchemaJsonParam{.json = "\"fixed[8]\"", .type = iceberg::fixed(8)}, SchemaJsonParam{.json = "\"decimal(10,2)\"", .type = iceberg::decimal(10, 2)}, SchemaJsonParam{.json = "\"date\"", .type = iceberg::date()}, @@ -111,6 +126,48 @@ TEST(TypeJsonTest, FromJsonWithSpaces) { ASSERT_EQ(decimal->scale(), 2); } +TEST(TypeJsonTest, FromJsonV3TypesWithSpacesAndCase) { + auto variant_result = TypeFromJson(nlohmann::json::parse("\"Variant\"")); + ASSERT_TRUE(variant_result.has_value()); + ASSERT_EQ(*variant_result.value(), *iceberg::variant()); + + auto geometry_result = + TypeFromJson(nlohmann::json::parse("\"GEOMETRY( srid: 3857 )\"")); + ASSERT_TRUE(geometry_result.has_value()); + ASSERT_EQ(*geometry_result.value(), *iceberg::geometry("srid: 3857")); + + auto geography_result = + TypeFromJson(nlohmann::json::parse("\"geography(srid:4269,karney)\"")); + ASSERT_TRUE(geography_result.has_value()); + ASSERT_EQ(*geography_result.value(), + *iceberg::geography("srid:4269", EdgeAlgorithm::kKarney)); +} + +TEST(TypeJsonTest, InvalidV3Types) { + auto invalid_geometry = TypeFromJson(nlohmann::json::parse("\"geometry()\"")); + ASSERT_THAT(invalid_geometry, HasErrorMessage("Invalid geometry type")); + + auto invalid_geometry_with_spaces = + TypeFromJson(nlohmann::json::parse("\"geometry( )\"")); + ASSERT_THAT(invalid_geometry_with_spaces, HasErrorMessage("Invalid geometry type")); + + auto invalid_geography = TypeFromJson(nlohmann::json::parse("\"geography()\"")); + ASSERT_THAT(invalid_geography, HasErrorMessage("Invalid geography type")); + + auto invalid_geography_with_algorithm = + TypeFromJson(nlohmann::json::parse("\"geography( , spherical)\"")); + ASSERT_THAT(invalid_geography_with_algorithm, + HasErrorMessage("Invalid geography type")); + + auto invalid_geography_algorithm = + TypeFromJson(nlohmann::json::parse("\"geography(srid:4269, BadAlgorithm)\"")); + ASSERT_THAT(invalid_geography_algorithm, + HasErrorMessage("Invalid edge interpolation algorithm")); + + auto unknown_type = TypeFromJson(nlohmann::json::parse("\"nonsense\"")); + ASSERT_THAT(unknown_type, HasErrorMessage("Cannot parse type string")); +} + TEST(SchemaJsonTest, RoundTrip) { constexpr std::string_view json = R"({"fields":[{"id":1,"name":"id","required":true,"type":"int"},{"id":2,"name":"name","required":false,"type":"string"}],"schema-id":1,"type":"struct"})"; diff --git a/src/iceberg/test/schema_test.cc b/src/iceberg/test/schema_test.cc index 8f1b20035..db99eb02b 100644 --- a/src/iceberg/test/schema_test.cc +++ b/src/iceberg/test/schema_test.cc @@ -671,6 +671,7 @@ iceberg::SchemaField Id() { return {1, "id", iceberg::int32(), true}; } iceberg::SchemaField Name() { return {2, "name", iceberg::string(), false}; } iceberg::SchemaField Age() { return {3, "age", iceberg::int32(), true}; } iceberg::SchemaField Email() { return {4, "email", iceberg::string(), true}; } +iceberg::SchemaField Payload() { return {5, "payload", iceberg::variant(), true}; } iceberg::SchemaField Street() { return {11, "street", iceberg::string(), true}; } iceberg::SchemaField City() { return {12, "city", iceberg::string(), true}; } iceberg::SchemaField Zip() { return {13, "zip", iceberg::int32(), true}; } @@ -683,6 +684,10 @@ static std::unique_ptr BasicSchema() { return MakeSchema(Id(), Name(), Age(), Email()); } +static std::unique_ptr VariantSchema() { + return MakeSchema(Id(), Payload()); +} + static std::unique_ptr AddressSchema() { auto address_type = MakeStructType(Street(), City(), Zip()); auto address_field = iceberg::SchemaField{14, "address", std::move(address_type), true}; @@ -932,30 +937,36 @@ TEST_P(ProjectParamTest, ProjectFields) { INSTANTIATE_TEST_SUITE_P( ProjectTestCases, ProjectParamTest, - ::testing::Values(ProjectTestParam{.test_name = "ProjectAllFields", - .create_schema = []() { return BasicSchema(); }, - .selected_ids = {1, 2, 3, 4}, - .expected_schema = []() { return BasicSchema(); }, - .should_succeed = true}, - - ProjectTestParam{ - .test_name = "ProjectSingleField", - .create_schema = []() { return BasicSchema(); }, - .selected_ids = {2}, - .expected_schema = []() { return MakeSchema(Name()); }, - .should_succeed = true}, + ::testing::Values( + ProjectTestParam{.test_name = "ProjectAllFields", + .create_schema = []() { return BasicSchema(); }, + .selected_ids = {1, 2, 3, 4}, + .expected_schema = []() { return BasicSchema(); }, + .should_succeed = true}, + + ProjectTestParam{.test_name = "ProjectSingleField", + .create_schema = []() { return BasicSchema(); }, + .selected_ids = {2}, + .expected_schema = []() { return MakeSchema(Name()); }, + .should_succeed = true}, - ProjectTestParam{.test_name = "ProjectNonExistentFieldId", - .create_schema = []() { return BasicSchema(); }, - .selected_ids = {999}, - .expected_schema = []() { return MakeSchema(); }, - .should_succeed = true}, - - ProjectTestParam{.test_name = "ProjectEmptySelection", - .create_schema = []() { return BasicSchema(); }, - .selected_ids = {}, - .expected_schema = []() { return MakeSchema(); }, - .should_succeed = true})); + ProjectTestParam{.test_name = "ProjectVariantField", + .create_schema = []() { return VariantSchema(); }, + .selected_ids = {5}, + .expected_schema = []() { return MakeSchema(Payload()); }, + .should_succeed = true}, + + ProjectTestParam{.test_name = "ProjectNonExistentFieldId", + .create_schema = []() { return BasicSchema(); }, + .selected_ids = {999}, + .expected_schema = []() { return MakeSchema(); }, + .should_succeed = true}, + + ProjectTestParam{.test_name = "ProjectEmptySelection", + .create_schema = []() { return BasicSchema(); }, + .selected_ids = {}, + .expected_schema = []() { return MakeSchema(); }, + .should_succeed = true})); INSTANTIATE_TEST_SUITE_P(ProjectNestedTestCases, ProjectParamTest, ::testing::Values(ProjectTestParam{ diff --git a/src/iceberg/test/transform_test.cc b/src/iceberg/test/transform_test.cc index 8d7cd880d..d3eae5971 100644 --- a/src/iceberg/test/transform_test.cc +++ b/src/iceberg/test/transform_test.cc @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -50,6 +51,18 @@ TEST(TransformTest, Transform) { ASSERT_TRUE(identity_transform); } +TEST(TransformTest, IdentityDoesNotSupportV3Types) { + const auto transform = Transform::Identity(); + const std::vector> unsupported_types = { + iceberg::variant(), iceberg::geometry(), iceberg::geography()}; + + for (const auto& type : unsupported_types) { + EXPECT_FALSE(transform->CanTransform(*type)); + EXPECT_THAT(transform->Bind(type), + HasErrorMessage("is not a valid input type for identity transform")); + } +} + TEST(TransformFunctionTest, CreateBucketTransform) { constexpr int32_t bucket_count = 8; auto transform = Transform::Bucket(bucket_count); diff --git a/src/iceberg/test/type_test.cc b/src/iceberg/test/type_test.cc index d405cccc1..ac188f229 100644 --- a/src/iceberg/test/type_test.cc +++ b/src/iceberg/test/type_test.cc @@ -38,6 +38,7 @@ struct TypeTestCase { std::shared_ptr type; iceberg::TypeId type_id; bool primitive; + bool nested = false; std::string repr; }; @@ -61,20 +62,31 @@ TEST_P(TypeTest, IsPrimitive) { const auto* primitive = dynamic_cast(test_case.type.get()); ASSERT_NE(nullptr, primitive); + } else { + ASSERT_FALSE(test_case.type->is_primitive()); } } TEST_P(TypeTest, IsNested) { const auto& test_case = GetParam(); - if (!test_case.primitive) { - ASSERT_FALSE(test_case.type->is_primitive()); + if (test_case.nested) { ASSERT_TRUE(test_case.type->is_nested()); const auto* nested = dynamic_cast(test_case.type.get()); ASSERT_NE(nullptr, nested); + } else { + ASSERT_FALSE(test_case.type->is_nested()); } } +TEST_P(TypeTest, TypeKindPredicates) { + const auto& test_case = GetParam(); + ASSERT_EQ(test_case.type_id == iceberg::TypeId::kStruct, test_case.type->is_struct()); + ASSERT_EQ(test_case.type_id == iceberg::TypeId::kList, test_case.type->is_list()); + ASSERT_EQ(test_case.type_id == iceberg::TypeId::kMap, test_case.type->is_map()); + ASSERT_EQ(test_case.type_id == iceberg::TypeId::kVariant, test_case.type->is_variant()); +} + TEST_P(TypeTest, ReflexiveEquality) { const auto& test_case = GetParam(); ASSERT_EQ(*test_case.type, *test_case.type); @@ -90,7 +102,7 @@ TEST_P(TypeTest, StdFormat) { ASSERT_EQ(test_case.repr, std::format("{}", *test_case.type)); } -const static std::array kPrimitiveTypes = {{ +const static std::array kPrimitiveTypes = {{ { .name = "boolean", .type = iceberg::boolean(), @@ -224,14 +236,37 @@ const static std::array kPrimitiveTypes = {{ .primitive = true, .repr = "unknown", }, + { + .name = "geometry", + .type = iceberg::geometry(), + .type_id = iceberg::TypeId::kGeometry, + .primitive = true, + .repr = "geometry", + }, + { + .name = "geography", + .type = iceberg::geography(), + .type_id = iceberg::TypeId::kGeography, + .primitive = true, + .repr = "geography", + }, }}; +const static TypeTestCase kVariantType = { + .name = "variant", + .type = iceberg::variant(), + .type_id = iceberg::TypeId::kVariant, + .primitive = false, + .repr = "variant", +}; + const static std::array kNestedTypes = {{ { .name = "list_int", .type = std::make_shared(1, iceberg::int32(), true), .type_id = iceberg::TypeId::kList, .primitive = false, + .nested = true, .repr = "list", }, { @@ -240,6 +275,7 @@ const static std::array kNestedTypes = {{ 1, std::make_shared(2, iceberg::int32(), true), false), .type_id = iceberg::TypeId::kList, .primitive = false, + .nested = true, .repr = "list (required)>", }, { @@ -249,6 +285,7 @@ const static std::array kNestedTypes = {{ iceberg::SchemaField::MakeRequired(2, "value", iceberg::string())), .type_id = iceberg::TypeId::kMap, .primitive = false, + .nested = true, .repr = "map", }, { @@ -259,6 +296,7 @@ const static std::array kNestedTypes = {{ }), .type_id = iceberg::TypeId::kStruct, .primitive = false, + .nested = true, .repr = R"(struct< foo (1): long (required) bar (2): string (optional) @@ -269,6 +307,9 @@ const static std::array kNestedTypes = {{ INSTANTIATE_TEST_SUITE_P(Primitive, TypeTest, ::testing::ValuesIn(kPrimitiveTypes), TypeTestCaseToString); +INSTANTIATE_TEST_SUITE_P(Variant, TypeTest, ::testing::Values(kVariantType), + TypeTestCaseToString); + INSTANTIATE_TEST_SUITE_P(Nested, TypeTest, ::testing::ValuesIn(kNestedTypes), TypeTestCaseToString); @@ -277,6 +318,7 @@ TEST(TypeTest, Equality) { for (const auto& test_case : kPrimitiveTypes) { alltypes.push_back(test_case.type); } + alltypes.push_back(kVariantType.type); for (const auto& test_case : kNestedTypes) { alltypes.push_back(test_case.type); } @@ -294,6 +336,33 @@ TEST(TypeTest, Equality) { } } +TEST(TypeTest, GeographyExplicitDefaultAlgorithm) { + ASSERT_NE(*iceberg::geography("srid:4326"), + *iceberg::geography("srid:4326", iceberg::EdgeAlgorithm::kSpherical)); + ASSERT_NE(*iceberg::geography(), + *iceberg::geography("OGC:CRS84", iceberg::EdgeAlgorithm::kSpherical)); + ASSERT_EQ( + "geography(srid:4326, spherical)", + iceberg::geography("srid:4326", iceberg::EdgeAlgorithm::kSpherical)->ToString()); + ASSERT_EQ( + "geography(OGC:CRS84, spherical)", + iceberg::geography("OGC:CRS84", iceberg::EdgeAlgorithm::kSpherical)->ToString()); + ASSERT_NE(*iceberg::geography("srid:4326"), + *iceberg::geography("srid:4326", iceberg::EdgeAlgorithm::kKarney)); +} + +TEST(TypeTest, GeometryMakeRejectsEmptyCrs) { + auto result = iceberg::GeometryType::Make(""); + ASSERT_THAT(result, IsError(iceberg::ErrorKind::kInvalidArgument)); + ASSERT_THAT(result, iceberg::HasErrorMessage("GeometryType: CRS cannot be empty")); +} + +TEST(TypeTest, GeographyMakeRejectsEmptyCrs) { + auto result = iceberg::GeographyType::Make(""); + ASSERT_THAT(result, IsError(iceberg::ErrorKind::kInvalidArgument)); + ASSERT_THAT(result, iceberg::HasErrorMessage("GeographyType: CRS cannot be empty")); +} + TEST(TypeTest, Decimal) { { iceberg::DecimalType decimal(38, 2); @@ -359,11 +428,17 @@ TEST(TypeTest, List) { } ASSERT_THAT( []() { - iceberg::ListType list( - iceberg::SchemaField(1, "wrongname", iceberg::boolean(), true)); + iceberg::list(iceberg::SchemaField(1, "wrongname", iceberg::boolean(), true)); }, ::testing::ThrowsMessage( ::testing::HasSubstr("child field name should be 'element', was 'wrongname'"))); + + auto make_result = iceberg::ListType::Make( + iceberg::SchemaField(1, "wrongname", iceberg::boolean(), true)); + ASSERT_THAT(make_result, IsError(iceberg::ErrorKind::kInvalidArgument)); + ASSERT_THAT(make_result, + iceberg::HasErrorMessage( + "ListType: child field name should be 'element', was 'wrongname'")); } TEST(TypeTest, Map) { @@ -397,7 +472,7 @@ TEST(TypeTest, Map) { []() { iceberg::SchemaField key(5, "notkey", iceberg::int32(), true); iceberg::SchemaField value(7, "value", iceberg::string(), true); - iceberg::MapType map(key, value); + iceberg::map(key, value); }, ::testing::ThrowsMessage( ::testing::HasSubstr("key field name should be 'key', was 'notkey'"))); @@ -405,10 +480,26 @@ TEST(TypeTest, Map) { []() { iceberg::SchemaField key(5, "key", iceberg::int32(), true); iceberg::SchemaField value(7, "notvalue", iceberg::string(), true); - iceberg::MapType map(key, value); + iceberg::map(key, value); }, ::testing::ThrowsMessage( ::testing::HasSubstr("value field name should be 'value', was 'notvalue'"))); + + auto invalid_key_result = + iceberg::MapType::Make(iceberg::SchemaField(5, "notkey", iceberg::int32(), true), + iceberg::SchemaField(7, "value", iceberg::string(), true)); + ASSERT_THAT(invalid_key_result, IsError(iceberg::ErrorKind::kInvalidArgument)); + ASSERT_THAT( + invalid_key_result, + iceberg::HasErrorMessage("MapType: key field name should be 'key', was 'notkey'")); + + auto invalid_value_result = iceberg::MapType::Make( + iceberg::SchemaField(5, "key", iceberg::int32(), true), + iceberg::SchemaField(7, "notvalue", iceberg::string(), true)); + ASSERT_THAT(invalid_value_result, IsError(iceberg::ErrorKind::kInvalidArgument)); + ASSERT_THAT(invalid_value_result, + iceberg::HasErrorMessage( + "MapType: value field name should be 'value', was 'notvalue'")); } TEST(TypeTest, Struct) { diff --git a/src/iceberg/test/visit_type_test.cc b/src/iceberg/test/visit_type_test.cc index f038f906f..a6bd9f8c6 100644 --- a/src/iceberg/test/visit_type_test.cc +++ b/src/iceberg/test/visit_type_test.cc @@ -46,6 +46,7 @@ struct TypeTestCase { std::shared_ptr type; iceberg::TypeId type_id; bool primitive; + bool nested = false; std::string repr; }; @@ -53,7 +54,7 @@ std::string TypeTestCaseToString(const ::testing::TestParamInfo& i return info.param.name; } -const static std::array kPrimitiveTypes = {{ +const static std::array kPrimitiveTypes = {{ { .name = "boolean", .type = iceberg::boolean(), @@ -187,14 +188,37 @@ const static std::array kPrimitiveTypes = {{ .primitive = true, .repr = "unknown", }, + { + .name = "geometry", + .type = iceberg::geometry(), + .type_id = iceberg::TypeId::kGeometry, + .primitive = true, + .repr = "geometry", + }, + { + .name = "geography", + .type = iceberg::geography(), + .type_id = iceberg::TypeId::kGeography, + .primitive = true, + .repr = "geography", + }, }}; +const static TypeTestCase kVariantType = { + .name = "variant", + .type = iceberg::variant(), + .type_id = iceberg::TypeId::kVariant, + .primitive = false, + .repr = "variant", +}; + const static std::array kNestedTypes = {{ { .name = "list_int", .type = std::make_shared(1, iceberg::int32(), true), .type_id = iceberg::TypeId::kList, .primitive = false, + .nested = true, .repr = "list", }, { @@ -203,6 +227,7 @@ const static std::array kNestedTypes = {{ 1, std::make_shared(2, iceberg::int32(), true), false), .type_id = iceberg::TypeId::kList, .primitive = false, + .nested = true, .repr = "list (required)>", }, { @@ -212,6 +237,7 @@ const static std::array kNestedTypes = {{ iceberg::SchemaField::MakeRequired(2, "value", iceberg::string())), .type_id = iceberg::TypeId::kMap, .primitive = false, + .nested = true, .repr = "map", }, { @@ -222,6 +248,7 @@ const static std::array kNestedTypes = {{ }), .type_id = iceberg::TypeId::kStruct, .primitive = false, + .nested = true, .repr = R"(struct< foo (1): long (required) bar (2): string (optional) @@ -236,6 +263,9 @@ class VisitTypeTest : public ::testing::TestWithParam {}; INSTANTIATE_TEST_SUITE_P(Primitive, VisitTypeTest, ::testing::ValuesIn(kPrimitiveTypes), TypeTestCaseToString); +INSTANTIATE_TEST_SUITE_P(Variant, VisitTypeTest, ::testing::Values(kVariantType), + TypeTestCaseToString); + INSTANTIATE_TEST_SUITE_P(Nested, VisitTypeTest, ::testing::ValuesIn(kNestedTypes), TypeTestCaseToString); @@ -261,12 +291,12 @@ TEST_P(VisitTypeTest, VisitTypeReturnNestedTypeId) { const auto& test_case = GetParam(); auto result = VisitType(*test_case.type, visitor); - if (test_case.primitive) { - ASSERT_THAT(result, IsError(ErrorKind::kNotImplemented)); - ASSERT_THAT(result, HasErrorMessage("Type is not a nested type")); - } else { + if (test_case.nested) { ASSERT_THAT(result, IsOk()); ASSERT_EQ(result.value(), test_case.type_id); + } else { + ASSERT_THAT(result, IsError(ErrorKind::kNotImplemented)); + ASSERT_THAT(result, HasErrorMessage("Type is not a nested type")); } } diff --git a/src/iceberg/transform.cc b/src/iceberg/transform.cc index 453941c95..f6d5c0c20 100644 --- a/src/iceberg/transform.cc +++ b/src/iceberg/transform.cc @@ -158,7 +158,9 @@ std::shared_ptr Transform::ResultType( bool Transform::CanTransform(const Type& source_type) const { switch (transform_type_) { case TransformType::kIdentity: - if (!source_type.is_primitive()) [[unlikely]] { + if (source_type.is_variant() || source_type.type_id() == TypeId::kGeometry || + source_type.type_id() == TypeId::kGeography || !source_type.is_primitive()) + [[unlikely]] { return false; } return true; diff --git a/src/iceberg/transform_function.cc b/src/iceberg/transform_function.cc index 4325c53d1..1e8e30bb5 100644 --- a/src/iceberg/transform_function.cc +++ b/src/iceberg/transform_function.cc @@ -40,7 +40,9 @@ std::shared_ptr IdentityTransform::ResultType() const { return source_type Result> IdentityTransform::Make( std::shared_ptr const& source_type) { - if (!source_type || !source_type->is_primitive()) { + if (!source_type || source_type->is_variant() || + source_type->type_id() == TypeId::kGeometry || + source_type->type_id() == TypeId::kGeography || !source_type->is_primitive()) { return NotSupported("{} is not a valid input type for identity transform", source_type ? source_type->ToString() : "null"); } diff --git a/src/iceberg/type.cc b/src/iceberg/type.cc index 057dcf513..fe48e6a99 100644 --- a/src/iceberg/type.cc +++ b/src/iceberg/type.cc @@ -23,6 +23,7 @@ #include #include #include +#include #include #include "iceberg/exception.h" @@ -142,12 +143,16 @@ StructType::InitFieldByLowerCaseName(const StructType& self) { return field_by_lowercase_name; } -ListType::ListType(SchemaField element) : element_(std::move(element)) { - ICEBERG_CHECK_OR_DIE(element_.name() == kElementName, - "ListType: child field name should be '{}', was '{}'", - kElementName, element_.name()); +Result> ListType::Make(SchemaField element) { + if (element.name() != kElementName) { + return InvalidArgument("ListType: child field name should be '{}', was '{}'", + kElementName, element.name()); + } + return std::make_unique(std::move(element)); } +ListType::ListType(SchemaField element) : element_(std::move(element)) {} + ListType::ListType(int32_t field_id, std::shared_ptr type, bool optional) : element_(field_id, std::string(kElementName), std::move(type), optional) {} @@ -198,16 +203,21 @@ bool ListType::Equals(const Type& other) const { return element_ == list.element_; } -MapType::MapType(SchemaField key, SchemaField value) - : fields_{std::move(key), std::move(value)} { - ICEBERG_CHECK_OR_DIE(this->key().name() == kKeyName, - "MapType: key field name should be '{}', was '{}'", kKeyName, - this->key().name()); - ICEBERG_CHECK_OR_DIE(this->value().name() == kValueName, - "MapType: value field name should be '{}', was '{}'", kValueName, - this->value().name()); +Result> MapType::Make(SchemaField key, SchemaField value) { + if (key.name() != kKeyName) { + return InvalidArgument("MapType: key field name should be '{}', was '{}'", kKeyName, + key.name()); + } + if (value.name() != kValueName) { + return InvalidArgument("MapType: value field name should be '{}', was '{}'", + kValueName, value.name()); + } + return std::make_unique(std::move(key), std::move(value)); } +MapType::MapType(SchemaField key, SchemaField value) + : fields_{std::move(key), std::move(value)} {} + const SchemaField& MapType::key() const { return fields_[0]; } const SchemaField& MapType::value() const { return fields_[1]; } TypeId MapType::type_id() const { return kTypeId; } @@ -264,6 +274,10 @@ bool MapType::Equals(const Type& other) const { return fields_ == map.fields_; } +TypeId VariantType::type_id() const { return kTypeId; } +std::string VariantType::ToString() const { return "variant"; } +bool VariantType::Equals(const Type& other) const { return other.type_id() == kTypeId; } + TypeId BooleanType::type_id() const { return kTypeId; } std::string BooleanType::ToString() const { return "boolean"; } bool BooleanType::Equals(const Type& other) const { return other.type_id() == kTypeId; } @@ -354,6 +368,97 @@ TypeId UnknownType::type_id() const { return kTypeId; } std::string UnknownType::ToString() const { return "unknown"; } bool UnknownType::Equals(const Type& other) const { return other.type_id() == kTypeId; } +Result> GeometryType::Make() { + return std::unique_ptr(new GeometryType()); +} + +Result> GeometryType::Make(std::string crs) { + if (crs.empty()) { + return InvalidArgument("GeometryType: CRS cannot be empty"); + } + return std::unique_ptr(new GeometryType(std::move(crs))); +} + +GeometryType::GeometryType(std::string crs) { + if (StringUtils::ToLower(crs) != StringUtils::ToLower(kDefaultCrs)) { + crs_ = std::move(crs); + } +} + +std::string_view GeometryType::crs() const { + return crs_.empty() ? kDefaultCrs : std::string_view(crs_); +} +TypeId GeometryType::type_id() const { return kTypeId; } +std::string GeometryType::ToString() const { + if (crs_.empty()) { + return "geometry"; + } + return std::format("geometry({})", crs_); +} +bool GeometryType::Equals(const Type& other) const { + if (other.type_id() != kTypeId) { + return false; + } + const auto& geometry = static_cast(other); + return crs_ == geometry.crs_; +} + +Result> GeographyType::Make() { + return std::unique_ptr(new GeographyType()); +} + +Result> GeographyType::Make(std::string crs) { + if (crs.empty()) { + return InvalidArgument("GeographyType: CRS cannot be empty"); + } + return std::unique_ptr(new GeographyType(std::move(crs))); +} + +Result> GeographyType::Make(std::string crs, + EdgeAlgorithm algorithm) { + if (crs.empty()) { + return InvalidArgument("GeographyType: CRS cannot be empty"); + } + return std::unique_ptr(new GeographyType(std::move(crs), algorithm)); +} + +GeographyType::GeographyType(std::string crs) { + if (StringUtils::ToLower(crs) != StringUtils::ToLower(kDefaultCrs)) { + crs_ = std::move(crs); + } +} + +GeographyType::GeographyType(std::string crs, EdgeAlgorithm algorithm) + : algorithm_(algorithm) { + if (StringUtils::ToLower(crs) != StringUtils::ToLower(kDefaultCrs)) { + crs_ = std::move(crs); + } +} + +std::string_view GeographyType::crs() const { + return crs_.empty() ? kDefaultCrs : std::string_view(crs_); +} +EdgeAlgorithm GeographyType::algorithm() const { + return algorithm_.value_or(kDefaultAlgorithm); +} +TypeId GeographyType::type_id() const { return kTypeId; } +std::string GeographyType::ToString() const { + if (algorithm_.has_value()) { + return std::format("geography({}, {})", crs(), iceberg::ToString(*algorithm_)); + } + if (!crs_.empty()) { + return std::format("geography({})", crs_); + } + return "geography"; +} +bool GeographyType::Equals(const Type& other) const { + if (other.type_id() != kTypeId) { + return false; + } + const auto& geography = static_cast(other); + return crs_ == geography.crs_ && algorithm_ == geography.algorithm_; +} + FixedType::FixedType(int32_t length) : length_(length) { ICEBERG_CHECK_OR_DIE(length >= 0, "FixedType: length must be >= 0, was {}", length); } @@ -374,7 +479,7 @@ std::string BinaryType::ToString() const { return "binary"; } bool BinaryType::Equals(const Type& other) const { return other.type_id() == kTypeId; } // ---------------------------------------------------------------------- -// Factory functions for creating primitive data types +// Factory functions for creating data types #define TYPE_FACTORY(NAME, KLASS) \ const std::shared_ptr& NAME() { \ @@ -397,9 +502,28 @@ TYPE_FACTORY(binary, BinaryType) TYPE_FACTORY(string, StringType) TYPE_FACTORY(uuid, UuidType) TYPE_FACTORY(unknown, UnknownType) +TYPE_FACTORY(variant, VariantType) #undef TYPE_FACTORY +const std::shared_ptr& geometry() { + static const std::shared_ptr result = [] { + auto type = GeometryType::Make(); + ICEBERG_CHECK_OR_DIE(type.has_value(), "Failed to create default geometry type"); + return std::shared_ptr(std::move(type.value())); + }(); + return result; +} + +const std::shared_ptr& geography() { + static const std::shared_ptr result = [] { + auto type = GeographyType::Make(); + ICEBERG_CHECK_OR_DIE(type.has_value(), "Failed to create default geography type"); + return std::shared_ptr(std::move(type.value())); + }(); + return result; +} + std::shared_ptr decimal(int32_t precision, int32_t scale) { return std::make_shared(precision, scale); } @@ -408,12 +532,44 @@ std::shared_ptr fixed(int32_t length) { return std::make_shared(length); } +std::shared_ptr geometry(std::string crs) { + auto type = GeometryType::Make(std::move(crs)); + if (!type.has_value()) { + throw IcebergError(type.error().message); + } + return std::move(type.value()); +} + +std::shared_ptr geography(std::string crs) { + auto type = GeographyType::Make(std::move(crs)); + if (!type.has_value()) { + throw IcebergError(type.error().message); + } + return std::move(type.value()); +} + +std::shared_ptr geography(std::string crs, EdgeAlgorithm algorithm) { + auto type = GeographyType::Make(std::move(crs), algorithm); + if (!type.has_value()) { + throw IcebergError(type.error().message); + } + return std::move(type.value()); +} + std::shared_ptr map(SchemaField key, SchemaField value) { - return std::make_shared(key, value); + auto type = MapType::Make(std::move(key), std::move(value)); + if (!type.has_value()) { + throw IcebergError(type.error().message); + } + return std::move(type.value()); } std::shared_ptr list(SchemaField element) { - return std::make_shared(std::move(element)); + auto type = ListType::Make(std::move(element)); + if (!type.has_value()) { + throw IcebergError(type.error().message); + } + return std::move(type.value()); } std::shared_ptr struct_(std::vector fields) { @@ -462,9 +618,52 @@ std::string_view ToString(TypeId id) { return "binary"; case TypeId::kUnknown: return "unknown"; + case TypeId::kVariant: + return "variant"; + case TypeId::kGeometry: + return "geometry"; + case TypeId::kGeography: + return "geography"; + } + + std::unreachable(); +} + +std::string_view ToString(EdgeAlgorithm algorithm) { + switch (algorithm) { + case EdgeAlgorithm::kSpherical: + return "spherical"; + case EdgeAlgorithm::kVincenty: + return "vincenty"; + case EdgeAlgorithm::kThomas: + return "thomas"; + case EdgeAlgorithm::kAndoyer: + return "andoyer"; + case EdgeAlgorithm::kKarney: + return "karney"; } std::unreachable(); } +Result EdgeAlgorithmFromString(std::string_view name) { + auto lower_name = StringUtils::ToLower(name); + if (lower_name == "spherical") { + return EdgeAlgorithm::kSpherical; + } + if (lower_name == "vincenty") { + return EdgeAlgorithm::kVincenty; + } + if (lower_name == "thomas") { + return EdgeAlgorithm::kThomas; + } + if (lower_name == "andoyer") { + return EdgeAlgorithm::kAndoyer; + } + if (lower_name == "karney") { + return EdgeAlgorithm::kKarney; + } + return InvalidArgument("Invalid edge interpolation algorithm: {}", name); +} + } // namespace iceberg diff --git a/src/iceberg/type.h b/src/iceberg/type.h index c0966759e..41484333d 100644 --- a/src/iceberg/type.h +++ b/src/iceberg/type.h @@ -46,20 +46,32 @@ class ICEBERG_EXPORT Type : public iceberg::util::Formattable { ~Type() override = default; /// \brief Get the type ID. - [[nodiscard]] virtual TypeId type_id() const = 0; + virtual TypeId type_id() const = 0; /// \brief Is this a primitive type (may not have child fields)? - [[nodiscard]] virtual bool is_primitive() const = 0; + virtual bool is_primitive() const = 0; /// \brief Is this a nested type (may have child fields)? - [[nodiscard]] virtual bool is_nested() const = 0; + virtual bool is_nested() const = 0; + + /// \brief Is this a struct type? + bool is_struct() const { return type_id() == TypeId::kStruct; } + + /// \brief Is this a list type? + bool is_list() const { return type_id() == TypeId::kList; } + + /// \brief Is this a map type? + bool is_map() const { return type_id() == TypeId::kMap; } + + /// \brief Is this a variant type? + bool is_variant() const { return type_id() == TypeId::kVariant; } /// \brief Compare two types for equality. friend bool operator==(const Type& lhs, const Type& rhs) { return lhs.Equals(rhs); } protected: /// \brief Compare two types for equality. - [[nodiscard]] virtual bool Equals(const Type& other) const = 0; + virtual bool Equals(const Type& other) const = 0; }; /// \brief A data type that does not have child fields. @@ -76,28 +88,27 @@ class ICEBERG_EXPORT NestedType : public Type { bool is_nested() const override { return true; } /// \brief Get a view of the child fields. - [[nodiscard]] virtual std::span fields() const = 0; + virtual std::span fields() const = 0; using SchemaFieldConstRef = std::reference_wrapper; /// \brief Get a field by field ID. /// /// \note This is O(1) complexity. - [[nodiscard]] virtual Result> GetFieldById( + virtual Result> GetFieldById( int32_t field_id) const = 0; /// \brief Get a field by index. /// /// \note This is O(1) complexity. - [[nodiscard]] virtual Result> GetFieldByIndex( + virtual Result> GetFieldByIndex( int32_t index) const = 0; /// \brief Get a field by name. Return an error Status if /// the field name is not unique; prefer GetFieldById or GetFieldByIndex /// when possible. /// /// \note This is O(1) complexity. - [[nodiscard]] virtual Result> GetFieldByName( + virtual Result> GetFieldByName( std::string_view name, bool case_sensitive) const = 0; /// \brief Get a field by name (case-sensitive). - [[nodiscard]] Result> GetFieldByName( - std::string_view name) const; + Result> GetFieldByName(std::string_view name) const; }; /// \defgroup type-nested Nested Types @@ -147,8 +158,11 @@ class ICEBERG_EXPORT ListType : public NestedType { constexpr static const TypeId kTypeId = TypeId::kList; constexpr static const std::string_view kElementName = "element"; - /// \brief Construct a list of the given element. The name of the child - /// field should be "element". + static Result> Make(SchemaField element); + + /// \brief Construct a list of the given element. + /// + /// Use Make or list to validate that the element field name is "element". explicit ListType(SchemaField element); /// \brief Construct a list of the given element type. ListType(int32_t field_id, std::shared_ptr type, bool optional); @@ -180,8 +194,11 @@ class ICEBERG_EXPORT MapType : public NestedType { constexpr static const std::string_view kKeyName = "key"; constexpr static const std::string_view kValueName = "value"; - /// \brief Construct a map of the given key/value fields. The field names - /// should be "key" and "value", respectively. + static Result> Make(SchemaField key, SchemaField value); + + /// \brief Construct a map of the given key/value fields. + /// + /// Use Make or map to validate that the field names are "key" and "value". explicit MapType(SchemaField key, SchemaField value); ~MapType() override = default; @@ -208,6 +225,30 @@ class ICEBERG_EXPORT MapType : public NestedType { /// @} +/// \defgroup type-semi-structured Semi-structured Types +/// Semi-structured types may contain values whose structure varies across rows. +/// @{ + +/// \brief A semi-structured type whose structure may vary across rows. +class ICEBERG_EXPORT VariantType : public Type { + public: + constexpr static const TypeId kTypeId = TypeId::kVariant; + + VariantType() = default; + ~VariantType() override = default; + + bool is_primitive() const override { return false; } + bool is_nested() const override { return false; } + + TypeId type_id() const override; + std::string ToString() const override; + + protected: + bool Equals(const Type& other) const override; +}; + +/// @} + /// \defgroup type-primitive Primitive Types /// Primitive types do not have nested fields. /// @{ @@ -296,14 +337,15 @@ class ICEBERG_EXPORT DecimalType : public PrimitiveType { constexpr static const int32_t kMaxPrecision = 38; /// \brief Construct a decimal type with the given precision and scale. + /// \throws IcebergError if precision is outside the supported range. DecimalType(int32_t precision, int32_t scale); ~DecimalType() override = default; /// \brief Get the precision (the number of decimal digits). - [[nodiscard]] int32_t precision() const; + int32_t precision() const; /// \brief Get the scale (essentially, the number of decimal digits after /// the decimal point; precisely, the value is scaled by $$10^{-s}$$.). - [[nodiscard]] int32_t scale() const; + int32_t scale() const; TypeId type_id() const override; std::string ToString() const override; @@ -353,9 +395,9 @@ class ICEBERG_EXPORT TimeType : public PrimitiveType { class ICEBERG_EXPORT TimestampBase : public PrimitiveType { public: /// \brief Is this type zoned or naive? - [[nodiscard]] virtual bool is_zoned() const = 0; + virtual bool is_zoned() const = 0; /// \brief The time resolution. - [[nodiscard]] virtual TimeUnit time_unit() const = 0; + virtual TimeUnit time_unit() const = 0; }; /// \brief A data type representing a timestamp in microseconds without @@ -471,11 +513,12 @@ class ICEBERG_EXPORT FixedType : public PrimitiveType { constexpr static const TypeId kTypeId = TypeId::kFixed; /// \brief Construct a fixed type with the given length. + /// \throws IcebergError if length is negative. explicit FixedType(int32_t length); ~FixedType() override = default; /// \brief The length (the number of bytes to store). - [[nodiscard]] int32_t length() const; + int32_t length() const; TypeId type_id() const override; std::string ToString() const override; @@ -518,11 +561,67 @@ class ICEBERG_EXPORT UnknownType : public PrimitiveType { bool Equals(const Type& other) const override; }; +/// \brief A data type representing OGC geometry in WKB format. +class ICEBERG_EXPORT GeometryType : public PrimitiveType { + public: + constexpr static const TypeId kTypeId = TypeId::kGeometry; + constexpr static std::string_view kDefaultCrs = "OGC:CRS84"; + + static Result> Make(); + static Result> Make(std::string crs); + ~GeometryType() override = default; + + std::string_view crs() const; + + TypeId type_id() const override; + std::string ToString() const override; + + protected: + bool Equals(const Type& other) const override; + + private: + GeometryType() = default; + explicit GeometryType(std::string crs); + + std::string crs_; +}; + +/// \brief A data type representing OGC geography in WKB format. +class ICEBERG_EXPORT GeographyType : public PrimitiveType { + public: + constexpr static const TypeId kTypeId = TypeId::kGeography; + constexpr static std::string_view kDefaultCrs = "OGC:CRS84"; + constexpr static EdgeAlgorithm kDefaultAlgorithm = EdgeAlgorithm::kSpherical; + + static Result> Make(); + static Result> Make(std::string crs); + static Result> Make(std::string crs, + EdgeAlgorithm algorithm); + ~GeographyType() override = default; + + std::string_view crs() const; + EdgeAlgorithm algorithm() const; + + TypeId type_id() const override; + std::string ToString() const override; + + protected: + bool Equals(const Type& other) const override; + + private: + GeographyType() = default; + explicit GeographyType(std::string crs); + GeographyType(std::string crs, EdgeAlgorithm algorithm); + + std::string crs_; + std::optional algorithm_; +}; + /// @} -/// \defgroup type-factories Factory functions for creating primitive data types +/// \defgroup type-factories Factory functions for creating data types /// -/// Factory functions for creating primitive data types +/// Factory functions for creating data types /// @{ /// \brief Return a BooleanType instance. @@ -555,18 +654,39 @@ ICEBERG_EXPORT const std::shared_ptr& string(); ICEBERG_EXPORT const std::shared_ptr& uuid(); /// \brief Return an UnknownType instance. ICEBERG_EXPORT const std::shared_ptr& unknown(); +/// \brief Return a VariantType instance. +ICEBERG_EXPORT const std::shared_ptr& variant(); +/// \brief Return the default GeometryType instance. +ICEBERG_EXPORT const std::shared_ptr& geometry(); +/// \brief Return the default GeographyType instance. +ICEBERG_EXPORT const std::shared_ptr& geography(); /// \brief Create a DecimalType with the given precision and scale. /// \param precision The number of decimal digits (max 38). /// \param scale The number of decimal digits after the decimal point. /// \return A shared pointer to the DecimalType instance. +/// \throws IcebergError if precision is outside the supported range. ICEBERG_EXPORT std::shared_ptr decimal(int32_t precision, int32_t scale); /// \brief Create a FixedType with the given length. /// \param length The number of bytes to store (must be >= 0). /// \return A shared pointer to the FixedType instance. +/// \throws IcebergError if length is negative. ICEBERG_EXPORT std::shared_ptr fixed(int32_t length); +/// \brief Create a GeometryType with the given CRS. +/// \throws IcebergError if crs is empty. +ICEBERG_EXPORT std::shared_ptr geometry(std::string crs); + +/// \brief Create a GeographyType with the given CRS. +/// \throws IcebergError if crs is empty. +ICEBERG_EXPORT std::shared_ptr geography(std::string crs); + +/// \brief Create a GeographyType with the given CRS and edge algorithm. +/// \throws IcebergError if crs is empty. +ICEBERG_EXPORT std::shared_ptr geography(std::string crs, + EdgeAlgorithm algorithm); + /// \brief Create a StructType with the given fields. /// \param fields The fields of the struct. /// \return A shared pointer to the StructType instance. @@ -575,12 +695,14 @@ ICEBERG_EXPORT std::shared_ptr struct_(std::vector fiel /// \brief Create a ListType with the given element field. /// \param element The element field of the list. /// \return A shared pointer to the ListType instance. +/// \throws IcebergError if element's name is not "element". ICEBERG_EXPORT std::shared_ptr list(SchemaField element); /// \brief Create a MapType with the given key and value fields. /// \param key The key field of the map. /// \param value The value field of the map. /// \return A shared pointer to the MapType instance. +/// \throws IcebergError if the key or value field has an invalid name. ICEBERG_EXPORT std::shared_ptr map(SchemaField key, SchemaField value); /// @} @@ -594,4 +716,10 @@ ICEBERG_EXPORT std::shared_ptr map(SchemaField key, SchemaField value); /// \return A string_view containing the lowercase type name ICEBERG_EXPORT std::string_view ToString(TypeId id); +/// \brief Get the lowercase string representation of an EdgeAlgorithm. +ICEBERG_EXPORT std::string_view ToString(EdgeAlgorithm algorithm); + +/// \brief Parse a lowercase edge algorithm name. +ICEBERG_EXPORT Result EdgeAlgorithmFromString(std::string_view name); + } // namespace iceberg diff --git a/src/iceberg/type_fwd.h b/src/iceberg/type_fwd.h index 6c34d3a8d..0320f24ea 100644 --- a/src/iceberg/type_fwd.h +++ b/src/iceberg/type_fwd.h @@ -53,6 +53,9 @@ enum class TypeId { kFixed, kBinary, kUnknown, + kVariant, + kGeometry, + kGeography, }; /// \brief The time unit. In Iceberg V3 nanoseconds are also supported. @@ -61,6 +64,15 @@ enum class TimeUnit { kNanosecond, }; +/// \brief The algorithm used to interpolate geography edges. +enum class EdgeAlgorithm { + kSpherical, + kVincenty, + kThomas, + kAndoyer, + kKarney, +}; + /// \brief Data type family. class BinaryType; class BooleanType; @@ -86,6 +98,9 @@ class TimestampTzNsType; class Type; class UnknownType; class UuidType; +class VariantType; +class GeographyType; +class GeometryType; /// \brief Data values. class Decimal; diff --git a/src/iceberg/update/update_schema.cc b/src/iceberg/update/update_schema.cc index 0e7f147b0..5c50ee41d 100644 --- a/src/iceberg/update/update_schema.cc +++ b/src/iceberg/update/update_schema.cc @@ -181,6 +181,12 @@ class ApplyChangesVisitor { return base_type; } + Result> VisitVariant(const VariantType& variant_type, + const std::shared_ptr& base_type, + int32_t parent_id) { + return base_type; + } + private: Result> ProcessField( const SchemaField& field, const std::shared_ptr& field_type_result) { diff --git a/src/iceberg/util/struct_like_set.cc b/src/iceberg/util/struct_like_set.cc index 12648ea5e..35a0f9e28 100644 --- a/src/iceberg/util/struct_like_set.cc +++ b/src/iceberg/util/struct_like_set.cc @@ -342,6 +342,11 @@ Status ValidateScalarAgainstType(const Scalar& scalar, const Type& type) { return ValidateMapLikeAgainstType(*map, internal::checked_cast(type)); } + case TypeId::kVariant: + case TypeId::kGeometry: + case TypeId::kGeography: + return NotSupported("Scalar validation for type {} is not supported", + type.ToString()); } std::unreachable(); diff --git a/src/iceberg/util/type_util.cc b/src/iceberg/util/type_util.cc index cb01be08f..1f1e02747 100644 --- a/src/iceberg/util/type_util.cc +++ b/src/iceberg/util/type_util.cc @@ -37,6 +37,8 @@ IdToFieldVisitor::IdToFieldVisitor( Status IdToFieldVisitor::Visit(const PrimitiveType& type) { return {}; } +Status IdToFieldVisitor::Visit(const VariantType& type) { return {}; } + Status IdToFieldVisitor::Visit(const NestedType& type) { const auto& nested = internal::checked_cast(type); const auto& fields = nested.fields(); @@ -64,7 +66,7 @@ Status NameToIdVisitor::Visit(const ListType& type, const std::string& path, const auto& field = type.fields()[0]; std::string new_path = BuildPath(path, field.name(), case_sensitive_); std::string new_short_path; - if (field.type()->type_id() == TypeId::kStruct) { + if (field.type()->is_struct()) { new_short_path = short_path; } else { new_short_path = BuildPath(short_path, field.name(), case_sensitive_); @@ -86,8 +88,7 @@ Status NameToIdVisitor::Visit(const MapType& type, const std::string& path, const auto& fields = type.fields(); for (const auto& field : fields) { new_path = BuildPath(path, field.name(), case_sensitive_); - if (field.name() == MapType::kValueName && - field.type()->type_id() == TypeId::kStruct) { + if (field.name() == MapType::kValueName && field.type()->is_struct()) { new_short_path = short_path; } else { new_short_path = BuildPath(short_path, field.name(), case_sensitive_); @@ -128,6 +129,11 @@ Status NameToIdVisitor::Visit(const PrimitiveType& type, const std::string& path return {}; } +Status NameToIdVisitor::Visit(const VariantType& type, const std::string& path, + const std::string& short_path) { + return {}; +} + std::string NameToIdVisitor::BuildPath(std::string_view prefix, std::string_view field_name, bool case_sensitive) { std::string quoted_name; @@ -168,6 +174,20 @@ Status PositionPathVisitor::Visit(const PrimitiveType& type) { return {}; } +Status PositionPathVisitor::Visit(const VariantType& type) { + if (current_field_id_ == kUnassignedFieldId) { + return InvalidSchema("Current field id is not assigned, type: {}", type.ToString()); + } + + if (auto ret = position_path_.try_emplace(current_field_id_, current_path_); + !ret.second) { + return InvalidSchema("Duplicate field id found: {}, prev path: {}, curr path: {}", + current_field_id_, ret.first->second, current_path_); + } + + return {}; +} + Status PositionPathVisitor::Visit(const StructType& type) { for (size_t i = 0; i < type.fields().size(); ++i) { const auto& field = type.fields()[i]; @@ -208,8 +228,8 @@ Result> PruneColumnVisitor::Visit( Result> PruneColumnVisitor::Visit(const SchemaField& field) const { if (selected_ids_.contains(field.field_id())) { - return (select_full_types_ || field.type()->is_primitive()) ? field.type() - : Visit(field.type()); + return (select_full_types_ || !field.type()->is_nested()) ? field.type() + : Visit(field.type()); } return Visit(field.type()); } @@ -278,6 +298,8 @@ GetProjectedIdsVisitor::GetProjectedIdsVisitor(bool include_struct_ids) Status GetProjectedIdsVisitor::Visit(const Type& type) { if (type.is_nested()) { return VisitNested(internal::checked_cast(type)); + } else if (type.is_variant()) { + return {}; } else { return VisitPrimitive(internal::checked_cast(type)); } @@ -288,9 +310,8 @@ Status GetProjectedIdsVisitor::VisitNested(const NestedType& type) { ICEBERG_RETURN_UNEXPECTED(Visit(*field.type())); } for (auto& field : type.fields()) { - // TODO(zhuo.wang) or is_variant - if ((include_struct_ids_ && field.type()->type_id() == TypeId::kStruct) || - field.type()->is_primitive()) { + if ((include_struct_ids_ && field.type()->is_struct()) || + !field.type()->is_nested()) { ids_.insert(field.field_id()); } } diff --git a/src/iceberg/util/type_util.h b/src/iceberg/util/type_util.h index 8fd5ef19f..8623ad254 100644 --- a/src/iceberg/util/type_util.h +++ b/src/iceberg/util/type_util.h @@ -45,6 +45,7 @@ class IdToFieldVisitor { std::unordered_map>& id_to_field); Status Visit(const PrimitiveType& type); + Status Visit(const VariantType& type); Status Visit(const NestedType& type); private: @@ -67,6 +68,8 @@ class NameToIdVisitor { const std::string& short_path); Status Visit(const PrimitiveType& type, const std::string& path, const std::string& short_path); + Status Visit(const VariantType& type, const std::string& path, + const std::string& short_path); void Finish(); private: @@ -85,6 +88,7 @@ class NameToIdVisitor { class PositionPathVisitor { public: Status Visit(const PrimitiveType& type); + Status Visit(const VariantType& type); Status Visit(const StructType& type); Status Visit(const ListType& type); Status Visit(const MapType& type); diff --git a/src/iceberg/util/visit_type.h b/src/iceberg/util/visit_type.h index bf52d2e9a..73fbbb5fd 100644 --- a/src/iceberg/util/visit_type.h +++ b/src/iceberg/util/visit_type.h @@ -127,21 +127,24 @@ inline Status VisitTypeIdInline(TypeId id, VISITOR* visitor, ARGS&&... args) { /// \brief Visit a type using a categorical visitor pattern /// /// This function provides a simplified visitor interface that groups Iceberg types into -/// four categories based on their structural properties: +/// five categories based on their structural properties: /// /// - **Struct types**: Complex types with named fields (StructType) /// - **List types**: Sequential container types (ListType) /// - **Map types**: Key-value container types (MapType) -/// - **Primitive types**: All leaf types without nested structure (14 primitive types) +/// - **Variant type**: Semi-structured type that is neither nested nor primitive +/// (VariantType) +/// - **Primitive types**: All leaf types without nested structure (primitive types) /// /// This grouping is useful for algorithms that need to distinguish between container /// types and leaf types, but don't require separate handling for each primitive type /// variant (e.g., Int vs Long vs String). /// -/// \tparam VISITOR Visitor class that must implement four Visit methods: +/// \tparam VISITOR Visitor class that must implement five Visit methods: /// - `VisitStruct(const StructType&, ARGS...)` for struct types /// - `VisitList(const ListType&, ARGS...)` for list types /// - `VisitMap(const MapType&, ARGS...)` for map types +/// - `VisitVariant(const VariantType&, ARGS...)` for the variant type /// - `VisitPrimitive(const PrimitiveType&, ARGS...)` for all primitive types /// \tparam ARGS Additional argument types forwarded to Visit methods /// \param type The type to visit diff --git a/src/iceberg/util/visitor_generate.h b/src/iceberg/util/visitor_generate.h index a5b0c2ced..ad6f5eb2d 100644 --- a/src/iceberg/util/visitor_generate.h +++ b/src/iceberg/util/visitor_generate.h @@ -39,6 +39,9 @@ namespace iceberg { ACTION(Fixed); \ ACTION(Binary); \ ACTION(Unknown); \ + ACTION(Variant); \ + ACTION(Geometry); \ + ACTION(Geography); \ ACTION(Struct); \ ACTION(List); \ ACTION(Map); @@ -49,7 +52,12 @@ namespace iceberg { /// - Struct types -> calls ACTION with Struct /// - List types -> calls ACTION with List /// - Map types -> calls ACTION with Map +/// - Variant type -> calls ACTION with Variant /// - All primitive types (default) -> calls ACTION with Primitive +/// +/// Variant is dispatched explicitly because it is neither a nested nor a primitive +/// type, so it must not be routed into the primitive default (which would cast it to +/// PrimitiveType). #define ICEBERG_TYPE_SWITCH_WITH_PRIMITIVE_DEFAULT(ACTION) \ case ::iceberg::TypeId::kStruct: \ ACTION(Struct) \ @@ -57,6 +65,8 @@ namespace iceberg { ACTION(List) \ case ::iceberg::TypeId::kMap: \ ACTION(Map) \ + case ::iceberg::TypeId::kVariant: \ + ACTION(Variant) \ default: \ ACTION(Primitive)