Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions be/src/common/status.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@ static const char g_moved_from_state[5] = {'\x00', '\x00', '\x00', '\x00', TStat
inline const char* assemble_state(TStatusCode::type code, std::string_view msg, std::string_view ctx) {
DCHECK(code != TStatusCode::OK);

if (msg.empty() && ctx.empty()) {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

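Fast path: when both msg and ctx are empty, return a minimal 5-byte state (four zero bytes followed by the status code) without computing the message/context sizes.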

auto result = new char[5]{0, 0, 0, 0, static_cast<char>(code)};
return result;
}

auto msg_size = std::min<size_t>(msg.size(), std::numeric_limits<uint16_t>::max());
auto ctx_size = std::min<size_t>(ctx.size(), std::numeric_limits<uint16_t>::max());

Expand Down
3 changes: 3 additions & 0 deletions be/src/exprs/variant_path_parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,9 @@ StatusOr<VariantRowValue> VariantPath::seek(const VariantRowValue* variant, cons
return Status::OK();
},
segment));
if (current.is_null()) {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Break out of the loop early if the current value is null.

break;
}
}
return VariantRowValue::from_variant(metadata, current);
}
Expand Down
83 changes: 43 additions & 40 deletions be/src/util/variant.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -184,13 +184,12 @@ StatusOr<std::string_view> VariantMetadata::get_key(uint32_t index) const {
static constexpr uint8_t kBinarySearchThreshold = 32;

Status VariantMetadata::_get_index(std::string_view key, void* _indexes, int hint) const {
KeyIndexVector& indexes = *static_cast<KeyIndexVector*>(_indexes);

uint32_t dict_sz = dict_size();
if (dict_sz == 0) {
return Status::OK();
}

KeyIndexVector& indexes = *static_cast<KeyIndexVector*>(_indexes);
bool is_sorted_unique = is_sorted_and_unique();
RETURN_IF_ERROR(_build_lookup_index());
const std::vector<std::string_view>& dict_strings = _lookup_index.dict_strings;
Expand All @@ -207,9 +206,15 @@ Status VariantMetadata::_get_index(std::string_view key, void* _indexes, int hin
indexes.push_back(std::distance(dict_strings.begin(), it));
}
}
} else {
// non-unique dictionary, find all matching indexes
} else if (hint == 0) {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hint == 0 means return only the first matching index;
otherwise, collect every matching index, starting the search from index `hint`.

for (uint32_t i = 0; i < dict_sz; i++) {
if (dict_strings[i] == key) {
indexes.push_back(i);
break;
}
}
} else {
for (uint32_t i = hint; i < dict_sz; i++) {
if (dict_strings[i] == key) {
indexes.push_back(i);
}
Expand All @@ -220,21 +225,6 @@ Status VariantMetadata::_get_index(std::string_view key, void* _indexes, int hin

// Variant value class

// Translates the basic type encoded in this value into the public VariantType.
VariantType VariantValue::type() const {
const VariantValue::BasicType kind = basic_type();

// Primitive values: the value header is converted directly to a VariantType.
if (kind == VariantValue::BasicType::PRIMITIVE) {
return static_cast<VariantType>(value_header());
}
// Short strings are reported as ordinary strings.
if (kind == VariantValue::BasicType::SHORT_STRING) {
return VariantType::STRING;
}
if (kind == VariantValue::BasicType::OBJECT) {
return VariantType::OBJECT;
}
if (kind == VariantValue::BasicType::ARRAY) {
return VariantType::ARRAY;
}
// Not expected to be reached; NULL_TYPE serves as the fallback.
return VariantType::NULL_TYPE;
}

StatusOr<VariantObjectInfo> VariantValue::get_object_info() const {
const std::string_view& value = _value;
VariantValue::BasicType btype = basic_type();
Expand Down Expand Up @@ -558,25 +548,20 @@ StatusOr<uint32_t> VariantValue::num_elements() const {
}

StatusOr<VariantValue> VariantValue::get_object_by_key(const VariantMetadata& metadata, std::string_view key) const {
auto obj_status = get_object_info();
if (!obj_status.ok()) {
return obj_status.status();
}

const VariantObjectInfo& info = obj_status.value();
ASSIGN_OR_RETURN(const VariantObjectInfo& info, get_object_info());
// hint: used to speed up the lookup for non-unique dictionary
// even if the flag is non-unique, the dictionary may still be unique in most cases
// so we try hint=0 first, which means just return the first matched index
// if failed, we try hint=1, which means return all matched indexes. but we can skip the first item
// because it has been tried in the previous attempt
// if failed, we try hint=(last matched index), which means return all matched indexes.

KeyIndexVector dict_indexes;
RETURN_IF_ERROR(metadata._get_index(key, (void*)&dict_indexes, 0));
if (dict_indexes.empty()) {
return Status::NotFound("Field key not exists: " + std::string(key));
}

int32_t field_index = -1;

auto search = [&](int from_index) {
RETURN_IF_ERROR(metadata._get_index(key, (void*)&dict_indexes, from_index));
if (dict_indexes.empty()) {
return Status::OK();
}
#define SEARCH_DICT_INDEX_SZ(sz) \
case sz: { \
const char* id_base = _value.data() + info.id_start_offset; \
Expand All @@ -595,15 +580,32 @@ StatusOr<VariantValue> VariantValue::get_object_by_key(const VariantMetadata& me
break; \
}

switch (info.id_size) {
SEARCH_DICT_INDEX_SZ(1);
SEARCH_DICT_INDEX_SZ(2);
SEARCH_DICT_INDEX_SZ(3);
SEARCH_DICT_INDEX_SZ(4);
switch (info.id_size) {
SEARCH_DICT_INDEX_SZ(1);
SEARCH_DICT_INDEX_SZ(2);
SEARCH_DICT_INDEX_SZ(3);
SEARCH_DICT_INDEX_SZ(4);
}

return Status::OK();
};

RETURN_IF_ERROR(search(0));

// don't find key in the dict, then return null value.
if (dict_indexes.empty()) {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If no dictionary index is found for the key, we return a null value instead of an error.
Note that this is not an abnormal case.

return VariantValue();
}

// but if not find field_index, and the dict is non-unique, we need to search again
if (field_index == -1 && !metadata.is_sorted_and_unique()) {
int from_index = dict_indexes[0] + 1;
dict_indexes.clear();
RETURN_IF_ERROR(search(from_index));
}

if (field_index == -1) {
return Status::NotFound("Field key not found: " + std::string(key));
return VariantValue();
}

const uint32_t offset = inline_read_little_endian_unsigned32(
Expand All @@ -625,8 +627,9 @@ StatusOr<VariantValue> VariantValue::get_element_at_index(const VariantMetadata&

const auto& info = array_info_status.value();
if (index >= info.num_elements) {
return Status::VariantError("Array index out of range: " + std::to_string(index) +
" >= " + std::to_string(info.num_elements));
// if you access out-of-bounds index, just return a null variant value
// it does not mean error, and usually it is the normal case.
return VariantValue();
}

uint32_t offset = inline_read_little_endian_unsigned32(
Expand Down
16 changes: 15 additions & 1 deletion be/src/util/variant.h
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,21 @@ class VariantValue {

BasicType basic_type() const { return static_cast<VariantValue::BasicType>(_value[0] & kBasicTypeMask); }
std::string_view raw() const { return _value; }
VariantType type() const;
VariantType type() const {
switch (basic_type()) {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

inline this method.

case VariantValue::BasicType::PRIMITIVE:
return static_cast<VariantType>(value_header());
case VariantValue::BasicType::SHORT_STRING:
return VariantType::STRING; // Short string is treated as a string type.
case VariantValue::BasicType::OBJECT:
return VariantType::OBJECT;
case VariantValue::BasicType::ARRAY:
return VariantType::ARRAY;
default:
return VariantType::NULL_TYPE; // Should not happen, but return NULL_TYPE as a fallback.
}
}
bool is_null() const { return _value[0] == 0; }

// Get the primitive boolean value.
StatusOr<bool> get_bool() const;
Expand Down
7 changes: 4 additions & 3 deletions be/test/formats/parquet/parquet_variant_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -401,7 +401,7 @@ TEST_F(ParquetVariantTest, ObjectPrimitive) {
EXPECT_EQ(VariantType::STRING, timestamp_field.type());
EXPECT_EQ("2025-04-16T12:34:56.78", *timestamp_field.get_string());

EXPECT_ERROR(variant.get_object_by_key(metadata, "unknow"));
EXPECT_EQ(variant.get_object_by_key(metadata, "unknow")->type(), VariantType::NULL_TYPE);
}

TEST_F(ParquetVariantTest, ObjectEmpty) {
Expand All @@ -411,7 +411,8 @@ TEST_F(ParquetVariantTest, ObjectEmpty) {
EXPECT_EQ(VariantType::OBJECT, variant.type());
EXPECT_EQ(0, *variant.num_elements());

EXPECT_ERROR(variant.get_object_by_key(metadata, "key"));
EXPECT_EQ(variant.get_object_by_key(metadata, "key")->type(), VariantType::NULL_TYPE);
EXPECT_ERROR(variant.get_element_at_index(metadata, 0));
}

TEST_F(ParquetVariantTest, ArrayPrimitive) {
Expand Down Expand Up @@ -445,7 +446,7 @@ TEST_F(ParquetVariantTest, ArrayEmpty) {
EXPECT_EQ(VariantType::ARRAY, variant.type());
EXPECT_EQ(0, *variant.num_elements());

EXPECT_ERROR(variant.get_element_at_index(metadata, 0));
EXPECT_EQ(variant.get_element_at_index(metadata, 0)->type(), VariantType::NULL_TYPE);
EXPECT_ERROR(variant.get_object_by_key(metadata, "key"));
}

Expand Down
2 changes: 1 addition & 1 deletion test/sql/test_iceberg/R/test_iceberg_variant_query_0
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ ORDER BY case_id;
-- result:
1 alpha 10 hot Decimal4 Decimal4 1.50000000 3.50000000 1.5 3.5
2 beta 0 cold Decimal4 Decimal4 -0.50000000 0.50000000 -0.5 0.5
3 None None None None None None None None None
3 None None None Null Null None None None None
4 delta 99 warm Decimal4 Decimal4 0E-8 2.00000000 0.0 2.0
-- !result
SELECT 'Query 3: Use variant_query + CAST to coerce nested payloads into primitives.' AS case_description;
Expand Down
Loading