@@ -1022,7 +1022,9 @@ GetFieldDatasFromStorageV2(std::vector<std::vector<std::string>>& remote_files,
1022
1022
int64_t dim) {
1023
1023
AssertInfo (remote_files.size () > 0 , " remote files size is 0" );
1024
1024
std::vector<FieldDataPtr> field_data_list;
1025
- for (auto & remote_chunk_files : remote_files) {
1025
+
1026
+ for (int i = 0 ; i < remote_files.size (); i++) {
1027
+ auto & remote_chunk_files = remote_files[i];
1026
1028
AssertInfo (remote_chunk_files.size () > 0 , " remote files size is 0" );
1027
1029
1028
1030
auto fs = milvus_storage::ArrowFileSystemSingleton::GetInstance ()
@@ -1043,7 +1045,12 @@ GetFieldDatasFromStorageV2(std::vector<std::vector<std::string>>& remote_files,
1043
1045
AssertInfo (column_offset.path_index < remote_files.size (),
1044
1046
" column offset path index {} is out of range" ,
1045
1047
column_offset.path_index );
1046
- auto column_group_file = remote_chunk_files[column_offset.path_index ];
1048
+ if (column_offset.path_index != i) {
1049
+ LOG_INFO (" Skip group id {} since target field shall be in group {}" ,
1050
+ i,
1051
+ column_offset.path_index );
1052
+ continue ;
1053
+ }
1047
1054
1048
1055
// set up channel for arrow reader
1049
1056
auto field_data_info = FieldDataInfo ();
@@ -1054,48 +1061,53 @@ GetFieldDatasFromStorageV2(std::vector<std::vector<std::string>>& remote_files,
1054
1061
auto & pool =
1055
1062
ThreadPools::GetThreadPool (milvus::ThreadPoolPriority::MIDDLE);
1056
1063
1057
- // get all row groups for each file
1058
- std::vector<std::vector<int64_t >> row_group_lists;
1059
- auto reader = std::make_shared<milvus_storage::FileRowGroupReader>(
1060
- fs, column_group_file);
1061
- auto row_group_num =
1062
- reader->file_metadata ()->GetRowGroupMetadataVector ().size ();
1063
- std::vector<int64_t > all_row_groups (row_group_num);
1064
- std::iota (all_row_groups.begin (), all_row_groups.end (), 0 );
1065
- row_group_lists.push_back (all_row_groups);
1066
-
1067
- // create a schema with only the field id
1068
- auto field_schema =
1069
- file_reader->schema ()->field (column_offset.col_index )->Copy ();
1070
- auto arrow_schema = arrow::schema ({field_schema});
1071
-
1072
- // split row groups for parallel reading
1073
- auto strategy = std::make_unique<segcore::ParallelDegreeSplitStrategy>(
1074
- parallel_degree);
1075
- auto load_future = pool.Submit ([&]() {
1076
- return LoadWithStrategy (std::vector<std::string>{column_group_file},
1077
- field_data_info.arrow_reader_channel ,
1078
- DEFAULT_FIELD_MAX_MEMORY_LIMIT,
1079
- std::move (strategy),
1080
- row_group_lists,
1081
- nullptr );
1082
- });
1083
- // read field data from channel
1084
- std::shared_ptr<milvus::ArrowDataWrapper> r;
1085
- while (field_data_info.arrow_reader_channel ->pop (r)) {
1086
- size_t num_rows = 0 ;
1087
- std::vector<std::shared_ptr<arrow::ChunkedArray>> chunked_arrays;
1088
- for (const auto & table : r->arrow_tables ) {
1089
- num_rows += table->num_rows ();
1090
- chunked_arrays.push_back (
1091
- table->column (column_offset.col_index ));
1092
- }
1093
- auto field_data = storage::CreateFieldData (
1094
- data_type, field_schema->nullable (), dim, num_rows);
1095
- for (const auto & chunked_array : chunked_arrays) {
1096
- field_data->FillFieldData (chunked_array);
1064
+ for (auto & column_group_file : remote_chunk_files) {
1065
+ // get all row groups for each file
1066
+ std::vector<std::vector<int64_t >> row_group_lists;
1067
+ auto reader = std::make_shared<milvus_storage::FileRowGroupReader>(
1068
+ fs, column_group_file);
1069
+ auto row_group_num =
1070
+ reader->file_metadata ()->GetRowGroupMetadataVector ().size ();
1071
+ std::vector<int64_t > all_row_groups (row_group_num);
1072
+ std::iota (all_row_groups.begin (), all_row_groups.end (), 0 );
1073
+ row_group_lists.push_back (all_row_groups);
1074
+
1075
+ // create a schema with only the field id
1076
+ auto field_schema =
1077
+ file_reader->schema ()->field (column_offset.col_index )->Copy ();
1078
+ auto arrow_schema = arrow::schema ({field_schema});
1079
+
1080
+ // split row groups for parallel reading
1081
+ auto strategy =
1082
+ std::make_unique<segcore::ParallelDegreeSplitStrategy>(
1083
+ parallel_degree);
1084
+ auto load_future = pool.Submit ([&]() {
1085
+ return LoadWithStrategy (
1086
+ std::vector<std::string>{column_group_file},
1087
+ field_data_info.arrow_reader_channel ,
1088
+ DEFAULT_FIELD_MAX_MEMORY_LIMIT,
1089
+ std::move (strategy),
1090
+ row_group_lists,
1091
+ nullptr );
1092
+ });
1093
+ // read field data from channel
1094
+ std::shared_ptr<milvus::ArrowDataWrapper> r;
1095
+ while (field_data_info.arrow_reader_channel ->pop (r)) {
1096
+ size_t num_rows = 0 ;
1097
+ std::vector<std::shared_ptr<arrow::ChunkedArray>>
1098
+ chunked_arrays;
1099
+ for (const auto & table : r->arrow_tables ) {
1100
+ num_rows += table->num_rows ();
1101
+ chunked_arrays.push_back (
1102
+ table->column (column_offset.col_index ));
1103
+ }
1104
+ auto field_data = storage::CreateFieldData (
1105
+ data_type, field_schema->nullable (), dim, num_rows);
1106
+ for (const auto & chunked_array : chunked_arrays) {
1107
+ field_data->FillFieldData (chunked_array);
1108
+ }
1109
+ field_data_list.push_back (field_data);
1097
1110
}
1098
- field_data_list.push_back (field_data);
1099
1111
}
1100
1112
}
1101
1113
return field_data_list;
0 commit comments