2323#include < string>
2424#include < vector>
2525
26+ #include " arrow/c/abi.h"
2627#include " paimon/cache/cache.h"
2728#include " paimon/predicate/predicate.h"
2829#include " paimon/result.h"
@@ -44,7 +45,7 @@ class FileSystem;
4445class PAIMON_EXPORT ReadContext {
4546 public:
4647 ReadContext (const std::string& path, const std::string& branch,
47- const std::vector<std::string>& read_schema ,
48+ const std::vector<std::string>& read_field_names ,
4849 const std::vector<int32_t >& read_field_ids,
4950 const std::shared_ptr<Predicate>& predicate, bool enable_predicate_filter,
5051 bool enable_prefetch, uint32_t prefetch_batch_count,
@@ -75,8 +76,8 @@ class PAIMON_EXPORT ReadContext {
7576 return options_;
7677 }
7778
78- const std::vector<std::string>& GetReadSchema () const {
79- return read_schema_ ;
79+ const std::vector<std::string>& GetReadFieldNames () const {
80+ return read_field_names_ ;
8081 }
8182
8283 const std::vector<int32_t >& GetReadFieldIds () const {
@@ -130,10 +131,26 @@ class PAIMON_EXPORT ReadContext {
130131 return cache_;
131132 }
132133
134+ // / Whether a read schema (C ArrowSchema) for nested column pruning was provided and its release callback is still non-null (the Arrow C Data Interface marks a released schema with a null release).
135+ bool HasReadSchema () const {
136+ return read_schema_ != nullptr && read_schema_->release != nullptr ;
137+ }
138+
139+ // / Get the read schema as a mutable C ArrowSchema pointer; returns nullptr when no schema is held.
140+ // / ImportSchema will consume (release) the schema content.
141+ ArrowSchema* GetReadSchema () {
142+ return read_schema_.get ();
143+ }
144+
145+ // / Set the read schema from a C ArrowSchema unique_ptr and take ownership of
146+ // / schema resources (released via ArrowSchema::release in destructor).
147+ // / Called internally by ReadContextBuilder.
148+ void SetReadSchema (std::unique_ptr<ArrowSchema> schema);
149+
133150 private:
134151 std::string path_;
135152 std::string branch_;
136- std::vector<std::string> read_schema_ ;
153+ std::vector<std::string> read_field_names_ ;
137154 std::vector<int32_t > read_field_ids_;
138155 std::shared_ptr<Predicate> predicate_;
139156 bool enable_predicate_filter_;
@@ -151,6 +168,8 @@ class PAIMON_EXPORT ReadContext {
151168 PrefetchCacheMode prefetch_cache_mode_;
152169 CacheConfig cache_config_;
153170 std::shared_ptr<Cache> cache_;
171+ // Owns the schema resources; they are released via ArrowSchema::release in the destructor.
172+ std::unique_ptr<ArrowSchema> read_schema_;
154173};
155174
156175// / `ReadContextBuilder` used to build a `ReadContext`, has input validation.
@@ -173,9 +192,9 @@ class PAIMON_EXPORT ReadContextBuilder {
173192 // /
174193 // / @param read_field_names Vector of field names to read from the table.
175194 // / @return Reference to this builder for method chaining.
176- // / @note Currently supports top-level field selection. Future versions may support
177- // / nested field selection using ArrowSchema for more granular projection
178- ReadContextBuilder& SetReadSchema (const std::vector<std::string>& read_field_names);
195+ // / @note Currently supports top-level field selection. For nested field selection
196+ // / use SetReadSchema(std::unique_ptr< ArrowSchema>) instead.
197+ ReadContextBuilder& SetReadFieldNames (const std::vector<std::string>& read_field_names);
179198 // / Set the schema fields to read from the table.
180199 // /
181200 // / If not set, all fields from the table schema will be read. This is useful for
@@ -184,12 +203,51 @@ class PAIMON_EXPORT ReadContextBuilder {
184203 // /
185204 // / @param read_field_ids Vector of field ids to read from the table.
186205 // / @return Reference to this builder for method chaining.
187- // / @note Currently supports top-level field selection. Future versions may support
188- // / nested field selection using ArrowSchema for more granular projection.
189- // / @note SetReadFieldIds() and SetReadSchema() are mutually exclusive.
190- // / Calling both will ignore the read schema set by SetReadSchema().
206+ // / @note Currently supports top-level field selection.
207+ // / @note SetReadFieldIds() and SetReadFieldNames() are mutually exclusive.
208+ // / Calling both will ignore the field names set by SetReadFieldNames().
191209 ReadContextBuilder& SetReadFieldIds (const std::vector<int32_t >& read_field_ids);
192210
211+ // / Set the read Arrow Schema for nested column pruning.
212+ // /
213+ // / The read schema is an Arrow C Data Interface schema where STRUCT types
214+ // / may contain only a subset of the original sub-fields, enabling nested column
215+ // / pruning to reduce I/O. Field matching is based on field name: the system
216+ // / looks up each field by name in the table schema and rebuilds the aligned
217+ // / schema using the table schema's type and metadata. Metadata propagation
218+ // / from the user-provided schema is whitelist-based: currently only
219+ // / "paimon.map.selected-keys" is preserved and merged into the final aligned
220+ // / schema.
221+ // /
222+ // / To prune map entries by key, attach metadata "paimon.map.selected-keys"
223+ // / to the target map field in read schema. The value is a comma-separated
224+ // / key list, for example: "k1,k2". Only map fields with string key type
225+ // / (Arrow utf8) are supported.
226+ // /
227+ // / Example:
228+ // / @code{.cpp}
229+ // / auto map_field = arrow::field("m", arrow::map(arrow::utf8(), arrow::int32()));
230+ // / auto map_meta = arrow::KeyValueMetadata::Make(
231+ // / {"paimon.map.selected-keys"}, {"k1,k2"});
232+ // / auto projected_schema = arrow::schema({
233+ // / arrow::field("id", arrow::int64()),
234+ // / map_field->WithMetadata(map_meta),
235+ // / });
236+ // /
237+ // / auto c_schema = std::make_unique<ArrowSchema>();
238+ // / arrow::ExportSchema(*projected_schema, c_schema.get());
239+ // /
240+ // / ReadContextBuilder builder("/path/to/table");
241+ // / builder.SetReadSchema(std::move(c_schema));
242+ // / @endcode
243+ // /
244+ // / @param read_schema Arrow C Schema. Ownership of schema resources is transferred
245+ // / to the built ReadContext.
246+ // / @return Reference to this builder for method chaining.
247+ // / @note Priority: read_schema > read_field_ids > read_field_names.
248+ // / When set, read_field_ids and read_field_names are ignored.
249+ ReadContextBuilder& SetReadSchema (std::unique_ptr<ArrowSchema> read_schema);
250+
193251 // / Set a configuration options map to set some option entries which are not defined in the
194252 // / table schema or whose values you want to overwrite.
195253 // / @note The options map will clear the options added by `AddOption()` before.
0 commit comments