Skip to content

Commit b50c3be

Browse files
committed
support filter map with keys
1 parent 3fd480e commit b50c3be

8 files changed

Lines changed: 487 additions & 65 deletions

File tree

src/paimon/common/types/data_field.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,9 @@ class DataField : public Jsonizable<DataField> {
4141

4242
static constexpr char FIELD_ID[] = "paimon.id";
4343
static constexpr char DESCRIPTION[] = "paimon.description";
44+
/// Metadata key for map field selected keys. The value is a JSON array of
45+
/// string keys, e.g. '["key1","key2"]'. Only string-keyed maps are supported.
46+
static constexpr char MAP_SELECTED_KEYS[] = "paimon.map.selected-keys";
4447

4548
public:
4649
static std::shared_ptr<arrow::Field> ConvertDataFieldToArrowField(const DataField& field);

src/paimon/core/io/field_mapping_reader.cpp

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
#include <cassert>
1919
#include <cstddef>
20+
#include <set>
2021
#include <utility>
2122

2223
#include "arrow/api.h"
@@ -79,6 +80,16 @@ FieldMappingReader::FieldMappingReader(int32_t field_count,
7980
non_partition_info_.non_partition_read_schema[i].Type())) {
8081
need_mapping_ = true;
8182
}
83+
// Map selected-keys metadata also requires mapping so that
84+
// FilterMapArrayBySelectedKeys can filter out unwanted entries.
85+
if (!need_mapping_ &&
86+
non_partition_info_.non_partition_read_schema[i].Type()->id() == arrow::Type::MAP) {
87+
std::set<std::string> selected_keys = NestedProjectionUtils::GetMapSelectedKeys(
88+
non_partition_info_.non_partition_read_schema[i].ArrowField());
89+
if (!selected_keys.empty()) {
90+
need_mapping_ = true;
91+
}
92+
}
8293
}
8394
}
8495

@@ -306,7 +317,21 @@ Status FieldMappingReader::MappingFields(const std::shared_ptr<arrow::Array>& da
306317
// sub-fields than requested, prune the excess here.
307318
const std::shared_ptr<arrow::DataType>& target_type = read_fields_of_data_array[i].Type();
308319
if (!field_array->type()->Equals(target_type)) {
309-
PAIMON_ASSIGN_OR_RAISE(field_array, PruneArray(field_array, target_type));
320+
PAIMON_ASSIGN_OR_RAISE(field_array,
321+
NestedProjectionUtils::PruneArray(field_array, target_type));
322+
}
323+
324+
// Filter map entries by selected keys if metadata is present.
325+
if (field_array->type()->id() == arrow::Type::MAP) {
326+
std::set<std::string> selected_keys =
327+
NestedProjectionUtils::GetMapSelectedKeys(
328+
read_fields_of_data_array[i].ArrowField());
329+
if (!selected_keys.empty()) {
330+
PAIMON_ASSIGN_OR_RAISE(
331+
field_array,
332+
NestedProjectionUtils::FilterMapArrayBySelectedKeys(
333+
field_array, selected_keys));
334+
}
310335
}
311336

312337
(*target_array)[idx_in_target_schema[i]] = std::move(field_array);

src/paimon/core/utils/field_mapping.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ Result<ExistFieldInfo> FieldMappingBuilder::CreateExistFieldInfo(
107107
// projection. For atomic types this is a no-op.
108108
PAIMON_ASSIGN_OR_RAISE(
109109
std::optional<std::shared_ptr<arrow::DataType>> pruned_type,
110-
PruneDataType(read_field.Type(), data_field.Type()));
110+
NestedProjectionUtils::PruneDataType(read_field.Type(), data_field.Type()));
111111
if (!pruned_type.has_value()) {
112112
// All sub-fields pruned away — treat as non-existent.
113113
continue;

src/paimon/core/utils/nested_projection_utils.cpp

Lines changed: 145 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,17 +16,23 @@
1616

1717
#include "paimon/core/utils/nested_projection_utils.h"
1818

19+
#include <set>
1920
#include <string>
2021
#include <utility>
22+
#include <vector>
2123

2224
#include "arrow/array/array_nested.h"
25+
#include "arrow/array/array_primitive.h"
26+
#include "arrow/array/builder_primitive.h"
27+
#include "arrow/array/concatenate.h"
2328
#include "arrow/type.h"
2429
#include "fmt/format.h"
2530
#include "paimon/status.h"
31+
#include "rapidjson/document.h"
2632

2733
namespace paimon {
2834

29-
Result<std::optional<std::shared_ptr<arrow::DataType>>> PruneDataType(
35+
Result<std::optional<std::shared_ptr<arrow::DataType>>> NestedProjectionUtils::PruneDataType(
3036
const std::shared_ptr<arrow::DataType>& read_type,
3137
const std::shared_ptr<arrow::DataType>& data_type) {
3238
// Identical types need no pruning.
@@ -106,7 +112,7 @@ Result<std::optional<std::shared_ptr<arrow::DataType>>> PruneDataType(
106112
// PruneArray — fallback for format readers that return extra nested columns
107113
// ---------------------------------------------------------------------------
108114

109-
Result<std::shared_ptr<arrow::Array>> PruneArray(
115+
Result<std::shared_ptr<arrow::Array>> NestedProjectionUtils::PruneArray(
110116
const std::shared_ptr<arrow::Array>& array,
111117
const std::shared_ptr<arrow::DataType>& target_type) {
112118
if (!array || array->type()->Equals(target_type)) {
@@ -172,4 +178,141 @@ Result<std::shared_ptr<arrow::Array>> PruneArray(
172178
}
173179
}
174180

181+
// ---------------------------------------------------------------------------
182+
// Map selected-keys support
183+
// ---------------------------------------------------------------------------
184+
185+
std::set<std::string> NestedProjectionUtils::GetMapSelectedKeys(
186+
const std::shared_ptr<arrow::Field>& field) {
187+
std::set<std::string> result;
188+
if (!field || !field->HasMetadata() || !field->metadata()) {
189+
return result;
190+
}
191+
auto get_result = field->metadata()->Get(DataField::MAP_SELECTED_KEYS);
192+
if (!get_result.ok()) {
193+
return result;
194+
}
195+
const std::string& json_str = get_result.ValueUnsafe();
196+
rapidjson::Document doc;
197+
doc.Parse(json_str.c_str());
198+
if (doc.HasParseError() || !doc.IsArray()) {
199+
return result;
200+
}
201+
for (rapidjson::SizeType i = 0; i < doc.Size(); ++i) {
202+
if (doc[i].IsString()) {
203+
result.emplace(doc[i].GetString(), doc[i].GetStringLength());
204+
}
205+
}
206+
return result;
207+
}
208+
209+
/// Filter a MapArray so that only entries whose key is in `selected_keys` remain.
///
/// Null maps stay null and contribute no entries to the result. The result keeps
/// the exact map type of the input (entry field names, nullability, metadata).
///
/// @param array         Map array to filter; may be null.
/// @param selected_keys Keys to retain.
/// @return The input array itself when `selected_keys` is empty, when `array` is
///         null or has length 0, or when no entry of a non-null map is dropped;
///         otherwise a newly assembled MapArray of the same length.
///         Status::Invalid when `array` is not a map or its key type is not STRING.
Result<std::shared_ptr<arrow::Array>> NestedProjectionUtils::FilterMapArrayBySelectedKeys(
    const std::shared_ptr<arrow::Array>& array,
    const std::set<std::string>& selected_keys) {
    if (selected_keys.empty() || !array || array->length() == 0) {
        return array;
    }

    // Guard the static casts below: they are only valid for MAP arrays.
    if (array->type()->id() != arrow::Type::MAP) {
        return Status::Invalid(fmt::format(
            "FilterMapArrayBySelectedKeys expects a map array, got {}",
            array->type()->ToString()));
    }

    auto map_array = std::static_pointer_cast<arrow::MapArray>(array);
    auto map_type = std::static_pointer_cast<arrow::MapType>(array->type());

    if (map_type->key_type()->id() != arrow::Type::STRING) {
        return Status::Invalid(fmt::format(
            "FilterMapArrayBySelectedKeys only supports string keys, got {}",
            map_type->key_type()->ToString()));
    }

    // keys()/items() are the unsliced child arrays; value_offset() already
    // accounts for the map array's own offset, so the indices below are
    // absolute positions into these children.
    auto keys_array = std::static_pointer_cast<arrow::StringArray>(map_array->keys());
    auto values_array = map_array->items();
    const int64_t num_maps = map_array->length();

    // Kept entries are collected as contiguous runs and later stitched together
    // with Slice + Concatenate (avoids an arrow::compute::Take dependency).
    arrow::ArrayVector key_slices;
    arrow::ArrayVector value_slices;

    // Nulls are encoded as null slots in the offsets array instead of reusing the
    // source validity bitmap: that bitmap is addressed relative to the source
    // array's offset and would be misaligned for a sliced input.
    arrow::Int32Builder offset_builder;
    PAIMON_RETURN_NOT_OK_FROM_ARROW(offset_builder.Reserve(num_maps + 1));

    int32_t running_offset = 0;
    bool dropped_any = false;

    // Record the half-open run [run_start, run_end) of kept entries.
    auto append_run = [&](int64_t run_start, int64_t run_end) {
        const int64_t run_len = run_end - run_start;
        key_slices.push_back(keys_array->Slice(run_start, run_len));
        value_slices.push_back(values_array->Slice(run_start, run_len));
        running_offset += static_cast<int32_t>(run_len);
    };

    for (int64_t map_idx = 0; map_idx < num_maps; ++map_idx) {
        if (map_array->IsNull(map_idx)) {
            offset_builder.UnsafeAppendNull();
            continue;
        }
        offset_builder.UnsafeAppend(running_offset);

        const int64_t start = map_array->value_offset(map_idx);
        const int64_t end = map_array->value_offset(map_idx + 1);
        int64_t run_start = -1;
        for (int64_t entry_idx = start; entry_idx < end; ++entry_idx) {
            bool should_keep = false;
            if (!keys_array->IsNull(entry_idx)) {
                std::string_view key_view = keys_array->GetView(entry_idx);
                std::string key_str(key_view.data(), key_view.size());
                should_keep = selected_keys.count(key_str) > 0;
            }
            if (should_keep) {
                if (run_start < 0) {
                    run_start = entry_idx;
                }
                continue;
            }
            dropped_any = true;
            if (run_start >= 0) {
                append_run(run_start, entry_idx);
                run_start = -1;
            }
        }
        if (run_start >= 0) {
            append_run(run_start, end);
        }
    }
    // Closing offset; it must be non-null so null slots can be resolved.
    offset_builder.UnsafeAppend(running_offset);

    // Nothing was filtered out of any non-null map: the input is already the answer.
    if (!dropped_any) {
        return array;
    }

    // Build filtered key/value arrays.
    std::shared_ptr<arrow::Array> filtered_keys;
    std::shared_ptr<arrow::Array> filtered_values;
    if (key_slices.empty()) {
        // All entries filtered out — create empty arrays of the right type.
        filtered_keys = keys_array->Slice(0, 0);
        filtered_values = values_array->Slice(0, 0);
    } else if (key_slices.size() == 1) {
        filtered_keys = key_slices[0];
        filtered_values = value_slices[0];
    } else {
        PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(filtered_keys,
                                          arrow::Concatenate(key_slices));
        PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(filtered_values,
                                          arrow::Concatenate(value_slices));
    }

    std::shared_ptr<arrow::Array> new_offsets_array;
    PAIMON_RETURN_NOT_OK_FROM_ARROW(offset_builder.Finish(&new_offsets_array));

    // Pass the original type explicitly so the result keeps the source map's
    // entry field names, nullability and metadata instead of Arrow's defaults.
    PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(
        std::shared_ptr<arrow::Array> result_map,
        arrow::MapArray::FromArrays(array->type(), new_offsets_array, filtered_keys,
                                    filtered_values, arrow::default_memory_pool()));
    return result_map;
}
317+
175318
} // namespace paimon

src/paimon/core/utils/nested_projection_utils.h

Lines changed: 63 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include <cstdint>
2020
#include <memory>
2121
#include <optional>
22+
#include <set>
2223
#include <string>
2324
#include <vector>
2425

@@ -29,53 +30,73 @@
2930

3031
namespace paimon {
3132

32-
/// Extract the paimon field ID from an Arrow field's metadata ("paimon.id").
33-
/// Returns -1 if the metadata key is not present.
34-
inline int32_t GetPaimonFieldId(const std::shared_ptr<arrow::Field>& field) {
35-
if (!field || !field->HasMetadata() || !field->metadata()) {
36-
return -1;
37-
}
38-
auto result = field->metadata()->Get(DataField::FIELD_ID);
39-
if (!result.ok()) {
40-
return -1;
41-
}
42-
std::optional<int32_t> field_id = StringUtils::StringToValue<int32_t>(result.ValueUnsafe());
43-
return field_id.value_or(-1);
44-
}
33+
/// Utility class for nested column pruning and map key selection.
34+
class NestedProjectionUtils {
35+
public:
36+
NestedProjectionUtils() = delete;
4537

46-
/// Find a child field in a STRUCT DataType by paimon field ID.
47-
/// Returns nullptr if no child has the given ID.
48-
inline std::shared_ptr<arrow::Field> FindFieldByPaimonId(
49-
const std::shared_ptr<arrow::DataType>& struct_type, int32_t field_id) {
50-
if (!struct_type || struct_type->id() != arrow::Type::STRUCT) {
51-
return nullptr;
38+
/// Extract the paimon field ID from an Arrow field's metadata ("paimon.id").
39+
/// Returns -1 if the metadata key is not present.
40+
static int32_t GetPaimonFieldId(const std::shared_ptr<arrow::Field>& field) {
41+
if (!field || !field->HasMetadata() || !field->metadata()) {
42+
return -1;
43+
}
44+
auto result = field->metadata()->Get(DataField::FIELD_ID);
45+
if (!result.ok()) {
46+
return -1;
47+
}
48+
std::optional<int32_t> field_id =
49+
StringUtils::StringToValue<int32_t>(result.ValueUnsafe());
50+
return field_id.value_or(-1);
5251
}
53-
for (const auto& child : struct_type->fields()) {
54-
if (GetPaimonFieldId(child) == field_id) {
55-
return child;
52+
53+
/// Find a child field in a STRUCT DataType by paimon field ID.
54+
/// Returns nullptr if no child has the given ID.
55+
static std::shared_ptr<arrow::Field> FindFieldByPaimonId(
56+
const std::shared_ptr<arrow::DataType>& struct_type, int32_t field_id) {
57+
if (!struct_type || struct_type->id() != arrow::Type::STRUCT) {
58+
return nullptr;
59+
}
60+
for (const auto& child : struct_type->fields()) {
61+
if (GetPaimonFieldId(child) == field_id) {
62+
return child;
63+
}
5664
}
65+
return nullptr;
5766
}
58-
return nullptr;
59-
}
6067

61-
/// Recursively prune `data_type` so that only the sub-fields requested by
62-
/// `read_type` are retained. Matching is done by paimon field ID to support
63-
/// schema evolution (field renames).
64-
///
65-
/// Supported nesting: STRUCT, LIST (element recurse), MAP (key/value recurse).
66-
/// For atomic types, `data_type` is returned as-is.
67-
///
68-
/// Returns std::nullopt when all sub-fields of a STRUCT are pruned away
69-
/// (caller should skip this field entirely, mirroring Java's null return).
70-
Result<std::optional<std::shared_ptr<arrow::DataType>>> PruneDataType(
71-
const std::shared_ptr<arrow::DataType>& read_type,
72-
const std::shared_ptr<arrow::DataType>& data_type);
68+
/// Recursively prune `data_type` so that only the sub-fields requested by
69+
/// `read_type` are retained. Matching is done by paimon field ID to support
70+
/// schema evolution (field renames).
71+
///
72+
/// Supported nesting: STRUCT, LIST (element recurse), MAP (key/value recurse).
73+
/// For atomic types, `data_type` is returned as-is.
74+
///
75+
/// Returns std::nullopt when all sub-fields of a STRUCT are pruned away
76+
/// (caller should skip this field entirely, mirroring Java's null return).
77+
static Result<std::optional<std::shared_ptr<arrow::DataType>>> PruneDataType(
78+
const std::shared_ptr<arrow::DataType>& read_type,
79+
const std::shared_ptr<arrow::DataType>& data_type);
80+
81+
/// Prune a StructArray so that only the sub-fields present in `target_type`
82+
/// are kept. Used as a fallback when the format reader returns more columns
83+
/// than requested.
84+
static Result<std::shared_ptr<arrow::Array>> PruneArray(
85+
const std::shared_ptr<arrow::Array>& array,
86+
const std::shared_ptr<arrow::DataType>& target_type);
87+
88+
/// Parse the "paimon.map.selected-keys" metadata from an Arrow field.
89+
/// Returns an empty set if the metadata key is absent or the field is not a MAP.
90+
/// The metadata value must be a JSON array of strings, e.g. '["key1","key2"]'.
91+
static std::set<std::string> GetMapSelectedKeys(
92+
const std::shared_ptr<arrow::Field>& field);
7393

74-
/// Prune a StructArray so that only the sub-fields present in `target_type`
75-
/// are kept. Used as a fallback when the format reader returns more columns
76-
/// than requested.
77-
Result<std::shared_ptr<arrow::Array>> PruneArray(
78-
const std::shared_ptr<arrow::Array>& array,
79-
const std::shared_ptr<arrow::DataType>& target_type);
94+
/// Filter a MapArray so that only entries whose key is in `selected_keys` are kept.
95+
/// Only supports string-keyed maps. Returns the original array unchanged if
96+
/// `selected_keys` is empty.
97+
static Result<std::shared_ptr<arrow::Array>> FilterMapArrayBySelectedKeys(
98+
const std::shared_ptr<arrow::Array>& map_array,
99+
const std::set<std::string>& selected_keys);
100+
};
80101

81102
} // namespace paimon

0 commit comments

Comments
 (0)