@@ -172,56 +172,80 @@ std::shared_ptr<Metrics> OrcFileBatchReader::GetReaderMetrics() const {
172172 return metrics_;
173173}
174174
175- Result<std::list<std::string>> OrcFileBatchReader::GetAndCheckIncludedFields (
176- const ::orc::Type* src_type, const ::orc::Type* target_type,
177- std::vector<uint64_t >* target_column_ids) {
178- std::list<std::string> include_fields;
179- std::unordered_map<std::string, const ::orc::Type*> src_type_map;
180- for (uint64_t i = 0 ; i < src_type->getSubtypeCount (); i++) {
181- src_type_map[src_type->getFieldName (i)] = src_type->getSubtype (i);
175+ Status OrcFileBatchReader::CollectTargetColumnIds (const ::orc::Type* src_type,
176+ const ::orc::Type* target_type,
177+ std::vector<uint64_t >* target_column_ids) {
178+ auto src_kind = src_type->getKind ();
179+ auto target_kind = target_type->getKind ();
180+ if (src_kind != target_kind) {
181+ return Status::Invalid (fmt::format (" type kind mismatch: src {} vs target {}" ,
182+ src_type->toString (), target_type->toString ()));
182183 }
183- int64_t prev_target_field_col_id = -1 ;
184- for (uint64_t i = 0 ; i < target_type->getSubtypeCount (); i++) {
185- auto & field_name = target_type->getFieldName (i);
186- auto iter = src_type_map.find (field_name);
187- if (iter == src_type_map.end ()) {
188- return Status::Invalid (
189- fmt::format (" field {} not in file schema {}" , field_name, src_type->toString ()));
184+
185+ switch (src_kind) {
186+ case ::orc::TypeKind::STRUCT : {
187+ std::unordered_map<std::string, const ::orc::Type*> src_field_map;
188+ for (uint64_t i = 0 ; i < src_type->getSubtypeCount (); i++) {
189+ src_field_map[src_type->getFieldName (i)] = src_type->getSubtype (i);
190+ }
191+ for (uint64_t i = 0 ; i < target_type->getSubtypeCount (); i++) {
192+ auto & field_name = target_type->getFieldName (i);
193+ auto iter = src_field_map.find (field_name);
194+ if (iter == src_field_map.end ()) {
195+ return Status::Invalid (fmt::format (" field {} not in file schema {}" , field_name,
196+ src_type->toString ()));
197+ }
198+ PAIMON_RETURN_NOT_OK (CollectTargetColumnIds (
199+ iter->second , target_type->getSubtype (i), target_column_ids));
200+ }
201+ break ;
190202 }
191- // Noted that: do not support recall partial fields in nested type
192- if (iter->second ->toString () != target_type->getSubtype (i)->toString ()) {
193- return Status::Invalid (
194- fmt::format (" target_type {} not match src_type {}, mismatch field name {}" ,
195- target_type->toString (), src_type->toString (), field_name));
203+ case ::orc::TypeKind::LIST : {
204+ if (target_type->getSubtypeCount () != 1 || src_type->getSubtypeCount () != 1 ) {
205+ return Status::Invalid (fmt::format (" invalid list type: src {} vs target {}" ,
206+ src_type->toString (), target_type->toString ()));
207+ }
208+ // list cannot be partially projected
209+ target_column_ids->push_back (src_type->getColumnId ());
210+ PAIMON_RETURN_NOT_OK (CollectTargetColumnIds (
211+ src_type->getSubtype (0 ), target_type->getSubtype (0 ), target_column_ids));
212+ break ;
196213 }
197- int64_t target_field_col_id = iter->second ->getColumnId ();
198- GetSubColumnIds (iter->second , target_column_ids);
199- if (prev_target_field_col_id >= target_field_col_id) {
200- return Status::Invalid (
201- " The column id of the target field should be monotonically increasing in "
202- " format reader" );
214+ case ::orc::TypeKind::MAP : {
215+ if (target_type->getSubtypeCount () != 2 || src_type->getSubtypeCount () != 2 ) {
216+ return Status::Invalid (fmt::format (" invalid map type: src {} vs target {}" ,
217+ src_type->toString (), target_type->toString ()));
218+ }
219+ // map cannot be partially projected
220+ target_column_ids->push_back (src_type->getColumnId ());
221+ PAIMON_RETURN_NOT_OK (CollectTargetColumnIds (
222+ src_type->getSubtype (0 ), target_type->getSubtype (0 ), target_column_ids));
223+ PAIMON_RETURN_NOT_OK (CollectTargetColumnIds (
224+ src_type->getSubtype (1 ), target_type->getSubtype (1 ), target_column_ids));
225+ break ;
203226 }
204- prev_target_field_col_id = target_field_col_id;
205- include_fields.push_back (field_name);
206- }
207- return include_fields;
208- }
209-
210- void OrcFileBatchReader::GetSubColumnIds (const ::orc::Type* type, std::vector<uint64_t >* col_ids) {
211- col_ids->push_back (type->getColumnId ());
212- for (uint64_t i = 0 ; i < type->getSubtypeCount (); i++) {
213- GetSubColumnIds (type->getSubtype (i), col_ids);
227+ default :
228+ target_column_ids->push_back (src_type->getColumnId ());
229+ break ;
214230 }
231+ return Status::OK ();
215232}
216233
217234Result<::orc::RowReaderOptions> OrcFileBatchReader::CreateRowReaderOptions (
218235 const ::orc::Type* src_type, const ::orc::Type* target_type,
219236 std::unique_ptr<::orc::SearchArgument>&& search_arg,
220237 const std::map<std::string, std::string>& options, std::vector<uint64_t >* target_column_ids) {
221- PAIMON_ASSIGN_OR_RAISE (std::list<std::string> include_fields,
222- GetAndCheckIncludedFields (src_type, target_type, target_column_ids));
238+ PAIMON_RETURN_NOT_OK (CollectTargetColumnIds (src_type, target_type, target_column_ids));
239+ for (size_t i = 1 ; i < target_column_ids->size (); i++) {
240+ if ((*target_column_ids)[i - 1 ] >= (*target_column_ids)[i]) {
241+ return Status::Invalid (
242+ " The column id of the target field should be monotonically increasing in "
243+ " format reader" );
244+ }
245+ }
223246 ::orc::RowReaderOptions row_reader_options;
224- row_reader_options.include (include_fields);
247+ std::list<uint64_t > include_type_ids (target_column_ids->begin (), target_column_ids->end ());
248+ row_reader_options.includeTypes (include_type_ids);
225249 // In order to avoid issue like https://github.com/alibaba/paimon-cpp/issues/42, we explicitly
226250 // set GMT timezone.
227251 row_reader_options.setTimezoneName (" GMT" );
0 commit comments