17
17
18
18
#include " paimon_reader.h"
19
19
20
+ #include < rapidjson/document.h>
21
+
20
22
#include < vector>
21
23
22
24
#include " common/status.h"
25
27
namespace doris ::vectorized {
26
28
#include " common/compile_check_begin.h"
27
29
PaimonReader::PaimonReader (std::unique_ptr<GenericReader> file_format_reader,
28
- RuntimeProfile* profile, const TFileScanRangeParams& params)
29
- : TableFormatReader(std::move(file_format_reader)), _profile(profile), _params(params) {
30
+ RuntimeProfile* profile, const TFileScanRangeParams& params,
31
+ const TFileRangeDesc& range, io::IOContext* io_ctx,
32
+ ShardedKVCache* kv_cache)
33
+ : TableFormatReader(std::move(file_format_reader)),
34
+ _profile (profile),
35
+ _io_ctx(io_ctx),
36
+ _range(range),
37
+ _kv_cache(kv_cache),
38
+ _params(params) {
30
39
static const char * paimon_profile = " PaimonProfile" ;
31
40
ADD_TIMER (_profile, paimon_profile);
32
41
_paimon_profile.num_delete_rows =
@@ -35,6 +44,223 @@ PaimonReader::PaimonReader(std::unique_ptr<GenericReader> file_format_reader,
35
44
ADD_CHILD_TIMER (_profile, " DeleteFileReadTime" , paimon_profile);
36
45
}
37
46
47
+ Status PaimonReader::get_next_block (Block* block, size_t * read_rows, bool * eof) {
48
+ if (_has_schema_change) {
49
+ for (int i = 0 ; i < block->columns (); i++) {
50
+ ColumnWithTypeAndName& col = block->get_by_position (i);
51
+ auto iter = _table_col_to_file_col.find (col.name );
52
+ if (iter != _table_col_to_file_col.end ()) {
53
+ col.name = iter->second ;
54
+ }
55
+ }
56
+ block->initialize_index_by_name ();
57
+ }
58
+
59
+ RETURN_IF_ERROR (_file_format_reader->get_next_block (block, read_rows, eof));
60
+
61
+ if (_has_schema_change) {
62
+ for (int i = 0 ; i < block->columns (); i++) {
63
+ ColumnWithTypeAndName& col = block->get_by_position (i);
64
+ auto iter = _file_col_to_table_col.find (col.name );
65
+ if (iter != _file_col_to_table_col.end ()) {
66
+ col.name = iter->second ;
67
+ }
68
+ }
69
+ block->initialize_index_by_name ();
70
+ }
71
+ return Status::OK ();
72
+ }
73
+
74
/**
sql:
    create table tmp3 (
        k int,
        vVV string,
        a array<int>,
        b struct<a:int,b:string>,
        c map<string,int>
    ) tblproperties (
        'primary-key' = 'k',
        "file.format" = "parquet"
    );
schema :
    {
      "version" : 2,
      "id" : 0,
      "fields" : [ {
        "id" : 0,
        "name" : "k",
        "type" : "INT NOT NULL"
      }, {
        "id" : 1,
        "name" : "vVV",
        "type" : "STRING"
      }, {
        "id" : 2,
        "name" : "a",
        "type" : {
          "type" : "ARRAY",
          "element" : "INT"
        }
      }, {
        "id" : 3,
        "name" : "b",
        "type" : {
          "type" : "ROW",
          "fields" : [ {
            "id" : 4,
            "name" : "a",
            "type" : "INT"
          }, {
            "id" : 5,
            "name" : "b",
            "type" : "STRING"
          } ]
        }
      }, {
        "id" : 6,
        "name" : "c",
        "type" : {
          "type" : "MAP",
          "key" : "STRING NOT NULL",
          "value" : "INT"
        }
      } ],
      "highestFieldId" : 6,
      "partitionKeys" : [ ],
      "primaryKeys" : [ "k" ],
      "options" : {
        "owner" : "root",
        "file.format" : "parquet"
      },
      "timeMillis" : 1741338580187
    }
*/
139
+ Status PaimonReader::read_schema_file (std::map<uint64_t , std::string>& file_id_to_name) {
140
+ file_id_to_name.clear ();
141
+ if (!_range.table_format_params .paimon_params .__isset .schema_file_path ) [[unlikely]] {
142
+ return Status::RuntimeError (" miss schema file" );
143
+ }
144
+
145
+ io::FileSystemProperties properties = {
146
+ .system_type = _params.file_type ,
147
+ .properties = _params.properties ,
148
+ .hdfs_params = _params.hdfs_params ,
149
+ .broker_addresses {},
150
+ };
151
+ if (_params.__isset .broker_addresses ) {
152
+ properties.broker_addresses .assign (_params.broker_addresses .begin (),
153
+ _params.broker_addresses .end ());
154
+ }
155
+ io::FileDescription file_description = {
156
+ .path = _range.table_format_params .paimon_params .schema_file_path ,
157
+ .file_size = -1 ,
158
+ .mtime = 0 ,
159
+ .fs_name = _range.fs_name ,
160
+ };
161
+ auto schema_file_reader = DORIS_TRY (FileFactory::create_file_reader (
162
+ properties, file_description, io::FileReaderOptions::DEFAULT));
163
+
164
+ size_t bytes_read = schema_file_reader->size ();
165
+ std::vector<char > buf (bytes_read);
166
+ Slice schema_result (buf.data (), bytes_read);
167
+ {
168
+ SCOPED_TIMER (_paimon_profile.delete_files_read_time );
169
+ RETURN_IF_ERROR (schema_file_reader->read_at (0 , schema_result, &bytes_read, _io_ctx));
170
+ }
171
+
172
+ rapidjson::Document json_doc;
173
+ if (json_doc.Parse (schema_result.data , schema_result.size ).HasParseError ()) {
174
+ return Status::IOError (" failed to parse json file, path:{}" ,
175
+ _range.table_format_params .paimon_params .schema_file_path );
176
+ }
177
+
178
+ if (!json_doc.HasMember (" fields" ) || !json_doc[" fields" ].IsArray ()) {
179
+ return Status::IOError (" Invalid JSON: missing or incorrect 'fields' array, path:{} " ,
180
+ _range.table_format_params .paimon_params .schema_file_path );
181
+ }
182
+ const auto & fields = json_doc[" fields" ];
183
+ for (const auto & field : fields.GetArray ()) {
184
+ if (field.HasMember (" id" ) && field[" id" ].IsInt () && field.HasMember (" name" ) &&
185
+ field[" name" ].IsString ()) {
186
+ int id = field[" id" ].GetInt ();
187
+ std::string name = to_lower (field[" name" ].GetString ());
188
+ file_id_to_name[id] = name;
189
+ }
190
+ }
191
+
192
+ return Status::OK ();
193
+ }
194
+
195
+ Status PaimonReader::gen_file_col_name (
196
+ const std::vector<std::string>& read_table_col_names,
197
+ const std::unordered_map<uint64_t , std::string>& table_col_id_table_name_map,
198
+ const std::unordered_map<std::string, ColumnValueRangeType>*
199
+ table_col_name_to_value_range) {
200
+ // It is a bit similar to iceberg. I will consider integrating it when I write hudi schema change later.
201
+ _table_col_to_file_col.clear ();
202
+ _file_col_to_table_col.clear ();
203
+
204
+ if (!_range.table_format_params .paimon_params .__isset .schema_file_path ) [[unlikely]] {
205
+ return Status::RuntimeError (" miss schema file" );
206
+ }
207
+
208
+ Status create_status = Status::OK ();
209
+ using MapType = std::map<uint64_t , std::string>;
210
+ const auto table_id_to_file_name = *_kv_cache->get <MapType>(
211
+ _range.table_format_params .paimon_params .schema_file_path , [&]() -> MapType* {
212
+ auto * file_id_to_name_ptr = new MapType ();
213
+ create_status = read_schema_file (*file_id_to_name_ptr);
214
+ if (!create_status) {
215
+ delete file_id_to_name_ptr;
216
+ return nullptr ;
217
+ }
218
+ return file_id_to_name_ptr;
219
+ });
220
+ RETURN_IF_ERROR (create_status);
221
+
222
+ for (auto [table_col_id, file_col_name] : table_id_to_file_name) {
223
+ if (table_col_id_table_name_map.find (table_col_id) == table_col_id_table_name_map.end ()) {
224
+ continue ;
225
+ }
226
+ auto & table_col_name = table_col_id_table_name_map.at (table_col_id);
227
+
228
+ _table_col_to_file_col.emplace (table_col_name, file_col_name);
229
+ _file_col_to_table_col.emplace (file_col_name, table_col_name);
230
+ if (table_col_name != file_col_name) {
231
+ _has_schema_change = true ;
232
+ }
233
+ }
234
+
235
+ _all_required_col_names.clear ();
236
+ _not_in_file_col_names.clear ();
237
+ for (auto name : read_table_col_names) {
238
+ auto iter = _table_col_to_file_col.find (name);
239
+ if (iter == _table_col_to_file_col.end ()) {
240
+ auto name_low = to_lower (name);
241
+ _all_required_col_names.emplace_back (name_low);
242
+
243
+ _table_col_to_file_col.emplace (name, name_low);
244
+ _file_col_to_table_col.emplace (name_low, name);
245
+ if (name != name_low) {
246
+ _has_schema_change = true ;
247
+ }
248
+ } else {
249
+ _all_required_col_names.emplace_back (iter->second );
250
+ }
251
+ }
252
+
253
+ for (auto & it : *table_col_name_to_value_range) {
254
+ auto iter = _table_col_to_file_col.find (it.first );
255
+ if (iter == _table_col_to_file_col.end ()) {
256
+ _new_colname_to_value_range.emplace (it.first , it.second );
257
+ } else {
258
+ _new_colname_to_value_range.emplace (iter->second , it.second );
259
+ }
260
+ }
261
+ return Status::OK ();
262
+ }
263
+
38
264
Status PaimonReader::init_row_filters (const TFileRangeDesc& range, io::IOContext* io_ctx) {
39
265
const auto & table_desc = range.table_format_params .paimon_params ;
40
266
if (!table_desc.__isset .deletion_file ) {
0 commit comments