forked from apache/iceberg-cpp
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfile_writer.h
More file actions
129 lines (104 loc) · 4.85 KB
/
file_writer.h
File metadata and controls
129 lines (104 loc) · 4.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#pragma once
/// \file iceberg/file_writer.h
/// Writer interface for file formats like Parquet, Avro and ORC.
#include <cstdint>
#include <functional>
#include <memory>
#include <optional>
#include <string>
#include <unordered_map>
#include <vector>

#include "iceberg/arrow_c_data.h"
#include "iceberg/file_format.h"
#include "iceberg/metrics.h"
#include "iceberg/result.h"
#include "iceberg/type_fwd.h"
#include "iceberg/util/config.h"
namespace iceberg {
/// \brief Configuration properties understood by file writers.
///
/// Derives from ConfigBase<WriterProperties> (CRTP) and declares each supported
/// property as a static Entry constant constructed from a key string and a second
/// value.
/// NOTE(review): the second constructor argument is presumably the default value of
/// the property; confirm against ConfigBase::Entry in iceberg/util/config.h.
class ICEBERG_EXPORT WriterProperties : public ConfigBase<WriterProperties> {
public:
/// \brief Const-qualified alias of ConfigBase<WriterProperties>::Entry<T>.
///
/// Because the alias itself is const, every entry declared through it is immutable.
template <typename T>
using Entry = const ConfigBase<WriterProperties>::Entry<T>;
/// \brief The name of the Avro root node schema to write.
/// Key: "write.avro.schema-name"; the associated value is the empty string.
inline static Entry<std::string> kAvroSchemaName{"write.avro.schema-name", ""};
/// \brief The buffer size used by Avro output stream.
/// Key: "write.avro.buffer-size"; the associated value is 1024 * 1024.
/// NOTE(review): unit is presumably bytes — confirm against the Avro writer.
inline static Entry<int64_t> kAvroBufferSize{"write.avro.buffer-size", 1024 * 1024};
/// \brief The sync interval used by Avro writer.
/// Key: "write.avro.sync-interval"; the associated value is 16 * 1024.
/// NOTE(review): unit is presumably bytes — confirm against the Avro writer.
inline static Entry<int64_t> kAvroSyncInterval{"write.avro.sync-interval", 16 * 1024};
// TODO(gangwu): add more properties, like compression codec, compression level, etc.
// (Kept as a plain comment so documentation tools do not merge it into the next
// declaration's description.)

/// \brief Create a default WriterProperties instance.
///
/// \return An owning pointer to the newly created instance.
static std::unique_ptr<WriterProperties> default_properties();
/// \brief Create a WriterProperties instance from a map of key-value pairs.
///
/// \param properties String key-value pairs used to build the instance. How
/// unrecognized keys or unparsable values are treated is decided by the
/// implementation and is not visible in this header.
/// \return An owning pointer to the newly created instance.
static std::unique_ptr<WriterProperties> FromMap(
const std::unordered_map<std::string, std::string>& properties);
};
/// \brief Options for creating a writer.
///
/// Plain aggregate passed by const reference to Writer::Open() and
/// WriterFactoryRegistry::Open().
struct ICEBERG_EXPORT WriterOptions {
/// \brief The path to the file to write.
std::string path;
/// \brief The schema of the data to write.
std::shared_ptr<Schema> schema;
/// \brief FileIO instance to open the file. Writer implementations should downcast it
/// to the specific FileIO implementation. By default, the `iceberg-bundle` library uses
/// `ArrowFileSystemFileIO` as the default implementation.
///
/// The elaborated `class FileIO` specifier names the type without requiring its
/// definition in this header.
std::shared_ptr<class FileIO> io;
/// \brief Metadata to write to the file, as string key-value pairs.
std::unordered_map<std::string, std::string> metadata;
/// \brief Format-specific or implementation-specific properties.
///
/// When no value is supplied, this default member initializer runs and stores the
/// result of WriterProperties::default_properties().
std::shared_ptr<WriterProperties> properties = WriterProperties::default_properties();
};
/// \brief Abstract base class for writers of the supported file formats.
///
/// Every operation is pure virtual, so the class can only be used through a concrete
/// subclass. Copying is explicitly deleted; because the copy operations and the
/// destructor are user-declared, no implicit move operations are generated either.
class ICEBERG_EXPORT Writer {
public:
virtual ~Writer() = default;
Writer() = default;
Writer(const Writer&) = delete;
Writer& operator=(const Writer&) = delete;
/// \brief Open the writer.
///
/// \param options Options describing what to write and where (see WriterOptions).
/// \return Status of the open operation.
virtual Status Open(const WriterOptions& options) = 0;
/// \brief Close the writer.
///
/// metrics(), length() and split_offsets() are documented as valid only after this
/// call.
/// \return Status of the close operation.
virtual Status Close() = 0;
/// \brief Write arrow data to the file.
///
/// \param data Arrow C data interface array holding the data to write.
/// \return Status of write results.
/// \note Ownership of the data is transferred to the writer.
virtual Status Write(ArrowArray* data) = 0;
/// \brief Get the file statistics.
/// Only valid after the file is closed.
///
/// \return The file metrics wrapped in an optional; when the optional is empty is
/// decided by the implementation (presumably when metrics are unavailable).
virtual std::optional<Metrics> metrics() = 0;
/// \brief Get the file length.
/// Only valid after the file is closed.
///
/// \return The file length wrapped in an optional; when the optional is empty is
/// decided by the implementation.
/// NOTE(review): unit is presumably bytes — confirm against implementations.
virtual std::optional<int64_t> length() = 0;
/// \brief Returns a list of recommended split locations, if applicable, empty
/// otherwise. When available, this information is used for planning scan tasks whose
/// boundaries are determined by these offsets. The returned list must be sorted in
/// ascending order. Only valid after the file is closed.
virtual std::vector<int64_t> split_offsets() = 0;
};
/// \brief Factory function to create a writer of a specific file format.
///
/// The callable takes no arguments and returns a Result wrapping an owning pointer to
/// the created Writer. The returned writer is configured afterwards through
/// Writer::Open(), since the factory itself receives no options.
using WriterFactory = std::function<Result<std::unique_ptr<Writer>>()>;
/// \brief Registry of writer factories for different file formats.
///
/// Registration happens through the constructor, while lookup and opening are exposed
/// as static member functions.
struct ICEBERG_EXPORT WriterFactoryRegistry {
/// \brief Register a factory function for a specific file format.
///
/// \param format_type The file format the factory produces writers for.
/// \param factory The factory function to register for that format.
WriterFactoryRegistry(FileFormatType format_type, WriterFactory factory);
/// \brief Get the factory function for a specific file format.
///
/// \param format_type The file format to look up.
/// \return A mutable reference to the factory stored for that format.
/// NOTE(review): what is returned when no factory has been registered for
/// format_type is not visible in this header — confirm in the implementation.
static WriterFactory& GetFactory(FileFormatType format_type);
/// \brief Open a writer for a specific file format.
///
/// \param format_type The file format of the writer to create.
/// \param options Options for the writer (see WriterOptions).
/// \return A Result wrapping an owning pointer to the writer.
static Result<std::unique_ptr<Writer>> Open(FileFormatType format_type,
const WriterOptions& options);
};
} // namespace iceberg