// avro_schema_util_internal.h (forked from apache/iceberg-cpp)
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#pragma once
#include <stack>
#include <avro/Node.hh>
#include "iceberg/iceberg_bundle_export.h"
#include "iceberg/name_mapping.h"
#include "iceberg/result.h"
#include "iceberg/schema_util.h"
#include "iceberg/type.h"
// Forward declarations of Avro schema classes. This header only refers to them
// by const reference (see HasIdVisitor::Visit), so their full definitions are
// not needed here.
namespace avro {
class Schema;
class ValidSchema;
} // namespace avro
namespace iceberg::avro {
/// \brief Custom Avro logical type whose name is "map".
///
/// NOTE(review): presumably attached to Avro nodes that encode Iceberg maps
/// (see HasMapLogicalType()) - confirm against the implementation file.
struct MapLogicalType : ::avro::CustomLogicalType {
  /// \brief Construct the logical type with the fixed name "map".
  MapLogicalType() : ::avro::CustomLogicalType("map") {}
};
/// \brief Visitor that turns an Iceberg type into an Avro node.
///
/// There is one Visit() overload per supported Iceberg type, plus one for a
/// SchemaField. Each overload receives the Iceberg object to convert and a
/// pointer to an Avro node, and reports the outcome as a Status.
///
/// NOTE(review): `node` is presumably the output slot that receives the
/// converted Avro node - confirm against the implementation file.
class ICEBERG_BUNDLE_EXPORT ToAvroNodeVisitor {
 public:
  // Primitive types.
  Status Visit(const BooleanType& type, ::avro::NodePtr* node);
  Status Visit(const IntType& type, ::avro::NodePtr* node);
  Status Visit(const LongType& type, ::avro::NodePtr* node);
  Status Visit(const FloatType& type, ::avro::NodePtr* node);
  Status Visit(const DoubleType& type, ::avro::NodePtr* node);
  Status Visit(const DecimalType& type, ::avro::NodePtr* node);
  Status Visit(const DateType& type, ::avro::NodePtr* node);
  Status Visit(const TimeType& type, ::avro::NodePtr* node);
  Status Visit(const TimestampType& type, ::avro::NodePtr* node);
  Status Visit(const TimestampTzType& type, ::avro::NodePtr* node);
  Status Visit(const StringType& type, ::avro::NodePtr* node);
  Status Visit(const UuidType& type, ::avro::NodePtr* node);
  Status Visit(const FixedType& type, ::avro::NodePtr* node);
  Status Visit(const BinaryType& type, ::avro::NodePtr* node);

  // Nested types.
  Status Visit(const StructType& type, ::avro::NodePtr* node);
  Status Visit(const ListType& type, ::avro::NodePtr* node);
  Status Visit(const MapType& type, ::avro::NodePtr* node);

  // A single schema field.
  Status Visit(const SchemaField& field, ::avro::NodePtr* node);

 private:
  // Field ids recently accessed along the current visitor path.
  std::stack<int32_t> field_ids_;
};
/// \brief A visitor that checks the presence of field IDs in an Avro schema.
///
/// Call one of the Visit() overloads first, then query AllHaveIds() and
/// HasNoIds(). Both queries are derived from two counters: the total number
/// of fields visited and the number of those fields that carry an ID.
class ICEBERG_BUNDLE_EXPORT HasIdVisitor {
 public:
  HasIdVisitor() = default;

  /// \brief Visit an Avro node to check for field IDs.
  /// \param node The Avro node to visit.
  /// \return Status indicating success or an error if unsupported Avro types are
  /// encountered.
  Status Visit(const ::avro::NodePtr& node);

  /// \brief Visit an Avro schema to check for field IDs.
  /// \param schema The Avro schema to visit.
  /// \return Status indicating success or an error if unsupported Avro types are
  /// encountered.
  Status Visit(const ::avro::ValidSchema& schema);

  /// \brief Visit an Avro schema to check for field IDs.
  /// \param node The Avro schema to visit.
  /// \return Status indicating success or an error if unsupported Avro types are
  /// encountered.
  Status Visit(const ::avro::Schema& node);

  /// \brief Check if all fields in the visited schema have field IDs.
  /// \return True if at least one field was counted and every counted field has
  /// an ID; false otherwise (including when no field has been counted).
  bool AllHaveIds() const {
    return total_fields_ == fields_with_id_ && fields_with_id_ != 0;
  }

  /// \brief Check if none of the fields in the visited schema have field IDs.
  /// \return True if no counted field has an ID (this includes the case where
  /// no field has been counted at all); false if at least one field has an ID.
  // The test must be on fields_with_id_, not total_fields_: a schema that has
  // fields but no IDs still has a non-zero total.
  bool HasNoIds() const { return fields_with_id_ == 0; }

 private:
  /// \brief Visit a record node to check for field IDs.
  /// \param node The record node to visit.
  /// \return Status indicating success or error.
  Status VisitRecord(const ::avro::NodePtr& node);

  /// \brief Visit an array node to check for element IDs.
  /// \param node The array node to visit.
  /// \return Status indicating success or error.
  Status VisitArray(const ::avro::NodePtr& node);

  /// \brief Visit a map node to check for key and value IDs.
  /// \param node The map node to visit.
  /// \return Status indicating success or error.
  Status VisitMap(const ::avro::NodePtr& node);

  /// \brief Visit a union node to check for field IDs in each branch.
  /// \param node The union node to visit.
  /// \return Status indicating success or error.
  Status VisitUnion(const ::avro::NodePtr& node);

 private:
  // Total number of fields visited.
  size_t total_fields_ = 0;
  // Number of fields with IDs.
  size_t fields_with_id_ = 0;
};
/// \brief Compute the projection of an Iceberg Schema onto an Avro node.
///
/// The resulting projection describes how data stored with the given Avro
/// schema node is to be read into the expected Iceberg Schema.
///
/// \param expected_schema The Iceberg Schema defining the structure the reader expects.
/// \param avro_node The Avro node describing the data to read from.
/// \param prune_source Whether the source schema can be pruned.
/// \return The schema projection result.
ICEBERG_BUNDLE_EXPORT Result<SchemaProjection> Project(
    const Schema& expected_schema, const ::avro::NodePtr& avro_node, bool prune_source);
// String conversions for Avro objects.
// NOTE(review): the exact output format is defined in the implementation file;
// presumably intended for diagnostics and error messages - confirm there.

/// \brief Convert an Avro node to a string.
/// \param node The Avro node to convert.
/// \return The string form of the node.
std::string ToString(const ::avro::NodePtr& node);
/// \brief Convert an Avro logical type to a string.
/// \param logical_type The Avro logical type to convert.
/// \return The string form of the logical type.
std::string ToString(const ::avro::LogicalType& logical_type);
/// \brief Convert an Avro logical type enumerator to a string.
/// \param logical_type The Avro logical type enumerator to convert.
/// \return The string form of the enumerator.
std::string ToString(const ::avro::LogicalType::Type& logical_type);
/// \brief Check if an Avro node has a map logical type.
///
/// NOTE(review): presumably this refers to the custom logical type named "map"
/// declared as MapLogicalType in this header - confirm in the implementation.
///
/// \param node The Avro node to check.
/// \return True if the node has a map logical type, false otherwise.
bool HasMapLogicalType(const ::avro::NodePtr& node);
/// \brief Check if a string is a valid Avro name.
///
/// Valid Avro names must:
///   1. Start with a letter or underscore
///   2. Contain only letters, digits, or underscores
///
/// NOTE(review): the result for an empty name is not stated here; check the
/// implementation before relying on it.
///
/// \param name The name to check.
/// \return True if the name is valid, false otherwise.
ICEBERG_BUNDLE_EXPORT bool ValidAvroName(std::string_view name);
/// \brief Build a copy of an Avro node that carries field IDs taken from a name mapping.
///
/// \param original_node The Avro node to copy.
/// \param mapping The name mapping that supplies the field IDs to apply.
/// \return The new Avro node with field IDs applied, or an error.
ICEBERG_BUNDLE_EXPORT Result<::avro::NodePtr> MakeAvroNodeWithFieldIds(
    const ::avro::NodePtr& original_node, const NameMapping& mapping);
/// \brief Sanitize a field name to make it compatible with Avro field name requirements.
///
/// Converts names that are not valid Avro names to valid Avro names. Names that
/// are already valid are returned unchanged (see the last example below).
///
/// Conversion rules:
///   1. If the first character is not a letter or underscore, it is specially handled:
///      - Digits: Prefixed with an underscore (e.g., '3' -> '_3')
///      - Other characters: Converted to '_x' followed by the uppercase hexadecimal
///        representation of the character (e.g., '$' -> '_x24')
///   2. For characters other than the first:
///      - If it's a letter, digit, or underscore, it remains unchanged
///      - Other characters: Converted to '_x' followed by the uppercase hexadecimal
///        representation
///
/// Examples:
///   - "123field" -> "_123field"
///   - "user-name" -> "user_x2Dname"
///   - "$price" -> "_x24price"
///   - "valid_name_123" -> "valid_name_123" (no conversion needed)
///
/// \param field_name The original field name to sanitize.
/// \return A sanitized field name that follows Avro naming conventions.
ICEBERG_BUNDLE_EXPORT std::string SanitizeFieldName(std::string_view field_name);
} // namespace iceberg::avro