Skip to content

Commit 718b01c

Browse files
mohsakaLakehouse Engine Bot
authored andcommitted
Fix stats collection for integer based decimal numbers
Alchemy-item: (ID = 1247) Fix stats collection for integer based decimal numbers commit 1/1 - 8531b53
1 parent 9e1474a commit 718b01c

File tree

3 files changed

+31
-7
lines changed

3 files changed

+31
-7
lines changed

velox/connectors/hive/iceberg/DataFileStatsCollector.cpp

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,12 +38,18 @@ void DataFileStatsCollector::collectStats(
3838
VELOX_CHECK_NOT_NULL(fileMetadata);
3939

4040
std::unordered_set<int32_t> skipBoundsFields;
41+
std::unordered_map<int32_t, TypePtr> fieldIdToType;
4142
std::function<int32_t(IcebergDataFileStatsSettings*)> processFields =
4243
[&skipBoundsFields,
44+
&fieldIdToType,
4345
&processFields](IcebergDataFileStatsSettings* field) -> int32_t {
4446
if (field->skipBounds) {
4547
skipBoundsFields.insert(field->fieldId);
4648
}
49+
// Store the Velox type for this field
50+
if (field->veloxType) {
51+
fieldIdToType[field->fieldId] = field->veloxType;
52+
}
4753
if (field->children.empty()) {
4854
return 1;
4955
}
@@ -112,13 +118,29 @@ void DataFileStatsCollector::collectStats(
112118
}
113119
}
114120

121+
// Only 4-byte DECIMAL values (precision <= 9) need byte-swapping.
115122
for (const auto& [fieldId, minStats] : globalMinStats) {
116-
const auto lowerBound = minStats->MinValue();
123+
auto lowerBound = minStats->MinValue();
124+
auto upperBound = globalMaxStats[fieldId]->MaxValue();
125+
126+
auto typeIt = fieldIdToType.find(fieldId);
127+
if (typeIt != fieldIdToType.end() && typeIt->second &&
128+
typeIt->second->isDecimal() && lowerBound.size() == 4) {
129+
std::string lowerResult(4, '\0');
130+
auto lowerSwapped = __builtin_bswap32(
131+
*reinterpret_cast<const uint32_t*>(lowerBound.data()));
132+
std::memcpy(lowerResult.data(), &lowerSwapped, 4);
133+
lowerBound = lowerResult;
134+
135+
std::string upperResult(4, '\0');
136+
auto upperSwapped = __builtin_bswap32(
137+
*reinterpret_cast<const uint32_t*>(upperBound.data()));
138+
std::memcpy(upperResult.data(), &upperSwapped, 4);
139+
upperBound = upperResult;
140+
}
141+
117142
dataFileStats->lowerBounds[fieldId] =
118143
encoding::Base64::encode(lowerBound.data(), lowerBound.size());
119-
}
120-
for (const auto& [fieldId, maxStats] : globalMaxStats) {
121-
const auto upperBound = maxStats->MaxValue();
122144
dataFileStats->upperBounds[fieldId] =
123145
encoding::Base64::encode(upperBound.data(), upperBound.size());
124146
}

velox/connectors/hive/iceberg/DataFileStatsCollector.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#pragma once
1717

1818
#include "velox/dwio/common/DataFileStatsCollector.h"
19+
#include "velox/type/Type.h"
1920

2021
namespace facebook::velox::connector::hive::iceberg {
2122

@@ -26,10 +27,11 @@ struct IcebergDataFileStatsSettings
2627
: public dwio::common::DataFileStatsSettings {
2728
int32_t fieldId;
2829
bool skipBounds;
30+
TypePtr veloxType;
2931
std::vector<std::unique_ptr<IcebergDataFileStatsSettings>> children;
3032

31-
IcebergDataFileStatsSettings(int32_t id, bool skip)
32-
: fieldId(id), skipBounds(skip), children() {}
33+
IcebergDataFileStatsSettings(int32_t id, bool skip, TypePtr type = nullptr)
34+
: fieldId(id), skipBounds(skip), veloxType(type), children() {}
3335
};
3436

3537
class DataFileStatsCollector : public dwio::common::FileStatsCollector {

velox/connectors/hive/iceberg/IcebergDataSink.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,7 @@ IcebergDataSink::IcebergDataSink(
221221
bool skipBounds) -> IcebergDataFileStatsSettings {
222222
VELOX_CHECK_NOT_NULL(type, "Input column type cannot be null.");
223223
bool currentSkipBounds = skipBounds || type->isMap() || type->isArray();
224-
IcebergDataFileStatsSettings field(f.id, currentSkipBounds);
224+
IcebergDataFileStatsSettings field(f.id, currentSkipBounds, type);
225225
if (!f.children.empty()) {
226226
VELOX_CHECK_EQ(f.children.size(), type->size());
227227
field.children.reserve(f.children.size());

0 commit comments

Comments
 (0)