Skip to content

Commit fb59842

Browse files
committed
continue cleanup
1 parent 3566dfd commit fb59842

File tree

2 files changed

+30
-9
lines changed

2 files changed

+30
-9
lines changed

cpp/src/parquet/variant.cc

+25-9
Original file line numberDiff line numberDiff line change
@@ -567,12 +567,21 @@ VariantValue::ObjectInfo VariantValue::getObjectInfo() const {
567567

568568
std::optional<VariantValue> VariantValue::getObjectValueByKey(
569569
std::string_view key) const {
570-
if (getBasicType() != VariantBasicType::Object) {
571-
throw ParquetException("Not an object type");
572-
}
573-
574570
ObjectInfo info = getObjectInfo();
575571

572+
return getObjectValueByKey(key, info);
573+
}
574+
575+
std::optional<VariantValue> VariantValue::getObjectValueByKey(
576+
std::string_view key, const VariantValue::ObjectInfo& info) const {
577+
// TODO(mwish): Currently we just linear search here. The best way here is:
578+
// 1. check the num_elements
579+
// 2.1. If the element number is less than 8(or other magic number), we can keep
580+
// current method.
581+
// 2.2. If the element number is larger than 8, and metadata.sorted_strings is true,
582+
// we can first apply binary search on the metadata, and then binary search the
583+
// field id.
584+
576585
for (uint32_t i = 0; i < info.num_elements; ++i) {
577586
std::string_view field_key;
578587
std::optional<VariantValue> field_value = getObjectFieldByFieldId(i, &field_key);
@@ -692,18 +701,20 @@ VariantValue::ArrayInfo VariantValue::getArrayInfo() const {
692701
next_offset = arrow::bit_util::FromLittleEndian(next_offset);
693702

694703
if (offset > next_offset) {
695-
throw ParquetException("Invalid array value: offsets not monotonically increasing");
704+
throw ParquetException(
705+
"Invalid array value: offsets not monotonically increasing: " +
706+
std::to_string(offset) + " > " + std::to_string(next_offset));
696707
}
697708
}
698709

699710
return info;
700711
}
701712

702-
VariantValue VariantValue::getArrayValueByIndex(uint32_t index) const {
703-
ArrayInfo info = getArrayInfo();
704-
713+
VariantValue VariantValue::getArrayValueByIndex(uint32_t index,
714+
const ArrayInfo& info) const {
705715
if (index >= info.num_elements) {
706-
throw ParquetException("Array index out of range");
716+
throw ParquetException("Array index out of range: " + std::to_string(index) +
717+
" >= " + std::to_string(info.num_elements));
707718
}
708719

709720
// Read the offset and next offset
@@ -725,4 +736,9 @@ VariantValue VariantValue::getArrayValueByIndex(uint32_t index) const {
725736
return element_value;
726737
}
727738

739+
VariantValue VariantValue::getArrayValueByIndex(uint32_t index) const {
740+
ArrayInfo info = getArrayInfo();
741+
return getArrayValueByIndex(index, info);
742+
}
743+
728744
} // namespace parquet::variant

cpp/src/parquet/variant.h

+5
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,8 @@ struct VariantValue {
189189
};
190190
ObjectInfo getObjectInfo() const;
191191
std::optional<VariantValue> getObjectValueByKey(std::string_view key) const;
192+
std::optional<VariantValue> getObjectValueByKey(std::string_view key,
193+
const ObjectInfo& info) const;
192194
VariantValue getObjectFieldByFieldId(uint32_t variantId, std::string_view* key) const;
193195

194196
struct ArrayInfo {
@@ -200,6 +202,7 @@ struct VariantValue {
200202
ArrayInfo getArrayInfo() const;
201203
// Would throw ParquetException if index is out of range.
202204
VariantValue getArrayValueByIndex(uint32_t index) const;
205+
VariantValue getArrayValueByIndex(uint32_t index, const ArrayInfo& info) const;
203206

204207
private:
205208
static constexpr uint8_t BASIC_TYPE_MASK = 0b00000011;
@@ -212,9 +215,11 @@ struct VariantValue {
212215
template <typename PrimitiveType>
213216
PrimitiveType getPrimitiveType(VariantPrimitiveType type) const;
214217

218+
// An extra function because decimal uses 1 byte for scale.
215219
template <typename DecimalType>
216220
DecimalValue<DecimalType> getPrimitiveDecimalType(VariantPrimitiveType type) const;
217221

222+
// An extra function because binary/string uses 4 bytes for length.
218223
std::string_view getPrimitiveBinaryType(VariantPrimitiveType type) const;
219224
void checkBasicType(VariantBasicType type) const;
220225
void checkPrimitiveType(VariantPrimitiveType type, size_t size_required) const;

0 commit comments

Comments
 (0)