Skip to content

Commit 9e22029

Browse files
committed
Adjust transform unit tests after changing partition name
1 parent b4ebd69 commit 9e22029

File tree

3 files changed

+160
-41
lines changed

3 files changed

+160
-41
lines changed

velox/connectors/hive/iceberg/tests/IcebergPartitionIdGeneratorTest.cpp

Lines changed: 43 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -135,24 +135,57 @@ TEST_F(IcebergPartitionIdGeneratorTest, partitionNameWithMixedTransforms) {
135135
auto bigintVector =
136136
makeFlatVector<int64_t>(1, [](auto) { return 9'876'543'210; });
137137
auto varcharVector = makeFlatVector<StringView>({"test string"});
138-
auto timestampVector = makeFlatVector<Timestamp>(
138+
auto yearVector = makeFlatVector<Timestamp>(
139139
1, [](auto) { return Timestamp(1'577'836'800, 0); });
140+
auto monthVector = makeFlatVector<Timestamp>(
141+
1, [](auto) { return Timestamp(1'578'836'800, 0); });
142+
auto dayVector = makeFlatVector<Timestamp>(
143+
1, [](auto) { return Timestamp(1'579'836'800, 0); });
144+
auto hourVector = makeFlatVector<Timestamp>(
145+
1, [](auto) { return Timestamp(1'57'936'800, 0); });
146+
auto boolVector = makeFlatVector<bool>(1, [](auto) { return true; });
140147

141148
std::vector<std::string> columnNames = {
142-
"c_int", "c_bigint", "c_varchar", "c_timestamp"};
149+
"c_int",
150+
"c_bigint",
151+
"c_varchar",
152+
"c_year",
153+
"c_month",
154+
"c_day",
155+
"c_hour",
156+
"c_bool"};
143157

144158
std::vector<VectorPtr> columns = {
145-
intVector, bigintVector, varcharVector, timestampVector};
159+
intVector,
160+
bigintVector,
161+
varcharVector,
162+
yearVector,
163+
monthVector,
164+
dayVector,
165+
hourVector,
166+
boolVector};
146167

147-
std::vector<TypePtr> types = {INTEGER(), BIGINT(), VARCHAR(), TIMESTAMP()};
168+
std::vector<TypePtr> types = {
169+
INTEGER(),
170+
BIGINT(),
171+
VARCHAR(),
172+
TIMESTAMP(),
173+
TIMESTAMP(),
174+
TIMESTAMP(),
175+
TIMESTAMP(),
176+
BOOLEAN()};
148177

149178
auto rowVector = createRowVector(columnNames, columns);
150179

151180
std::vector<TransformType> transformTypes = {
152181
TransformType::kBucket,
153182
TransformType::kTruncate,
154183
TransformType::kTruncate,
155-
TransformType::kYear};
184+
TransformType::kYear,
185+
TransformType::kMonth,
186+
TransformType::kDay,
187+
TransformType::kHour,
188+
TransformType::kIdentity};
156189

157190
std::vector<std::optional<int32_t>> parameters = {4, 1'000, 5, std::nullopt};
158191
auto transforms =
@@ -167,7 +200,11 @@ TEST_F(IcebergPartitionIdGeneratorTest, partitionNameWithMixedTransforms) {
167200
"c_int_bucket=2",
168201
"c_bigint_trunc=9876543000",
169202
"c_varchar_trunc=test+",
170-
"c_timestamp_year=50"};
203+
"c_year_year=2020",
204+
"c_month_month=2020-01",
205+
"c_day_day=2020-01-24",
206+
"c_hour_hour=1975-01-02-23",
207+
"c_bool=true"};
171208
verifyPartitionComponents(partitionName, expectedComponents);
172209
}
173210

velox/connectors/hive/iceberg/tests/IcebergTransformE2ETest.cpp

Lines changed: 107 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -424,18 +424,17 @@ TEST_F(IcebergTransformE2ETest, yearPartitioning) {
424424

425425
auto dataPath = fmt::format("{}", outputDirectory->getPath());
426426
auto partitionDirs = listFirstLevelDirectories(dataPath);
427-
std::unordered_map<int32_t, int32_t> yearToYearsSince1970 = {
428-
{2020, 50}, {2021, 51}, {2022, 52}, {2023, 53}, {2024, 54}, {2025, 55}};
429427

430-
for (const auto& [year, yearsSince1970] : yearToYearsSince1970) {
431-
const auto expectedDirName = fmt::format("c_date_year={}", yearsSince1970);
428+
for (int32_t year = 2020; year <= 2025; year++) {
429+
const auto expectedDirName = fmt::format("c_date_year={}", year);
432430
bool foundPartition = false;
433431
auto yearFilter = [](const int32_t year) -> std::string {
434432
return fmt::format(
435433
"YEAR(DATE '{}-01-01')={}",
436434
std::to_string(year),
437435
std::to_string(year));
438436
};
437+
439438
for (const auto& dir : partitionDirs) {
440439
const auto dirName = std::filesystem::path(dir).filename().string();
441440
if (dirName == expectedDirName) {
@@ -791,10 +790,9 @@ TEST_F(IcebergTransformE2ETest, timestampYearPartitioning) {
791790
for (const auto& dir : partitionDirs) {
792791
const auto dirName = std::filesystem::path(dir).filename().string();
793792
auto [c, v] = buildFilter(dirName);
794-
auto yearsSince1970 = std::stoi(v);
795-
auto year = 1970 + yearsSince1970;
793+
auto year = std::stoi(v);
796794
std::string filter = fmt::format("YEAR(c_timestamp) = {}", year);
797-
auto expectedRowCount = yearToExpectedCount[year];
795+
auto expectedRowCount = yearToExpectedCount.at(year);
798796
verifyPartitionData(rowType_, dir, filter, expectedRowCount);
799797
}
800798
}
@@ -820,28 +818,44 @@ TEST_F(IcebergTransformE2ETest, timestampMonthPartitioning) {
820818

821819
auto dataPath = fmt::format("{}", outputDirectory->getPath());
822820
auto partitionDirs = listFirstLevelDirectories(dataPath);
821+
std::unordered_map<std::string, int32_t> monthToExpectedCount;
822+
823+
for (const auto& batch : batches) {
824+
auto timestampVector = batch->childAt(6)->as<SimpleVector<Timestamp>>();
825+
for (vector_size_t i = 0; i < batch->size(); i++) {
826+
if (!timestampVector->isNullAt(i)) {
827+
Timestamp ts = timestampVector->valueAt(i);
828+
std::tm tm;
829+
if (Timestamp::epochToCalendarUtc(ts.getSeconds(), tm)) {
830+
int32_t year = tm.tm_year + 1900;
831+
int32_t month = tm.tm_mon + 1;
832+
std::string monthKey = fmt::format("{:04d}-{:02d}", year, month);
833+
monthToExpectedCount[monthKey]++;
834+
}
835+
}
836+
}
837+
}
838+
823839
for (const auto& dir : partitionDirs) {
824840
const auto dirName = std::filesystem::path(dir).filename().string();
825841
auto [c, v] = buildFilter(dirName);
826-
auto monthsSince1970 = std::stoi(v);
827-
auto yearsSince1970 = monthsSince1970 / 12;
828-
auto monthOfYear = (monthsSince1970 % 12) + 1;
829-
auto year = 1970 + yearsSince1970;
830-
std::string filter = fmt::format(
831-
"YEAR(c_timestamp) = {} AND MONTH(c_timestamp) = {}",
832-
year,
833-
monthOfYear);
842+
size_t dashPos = v.find('-');
843+
ASSERT_NE(dashPos, std::string::npos) << "Invalid month format: " << v;
834844

835-
verifyPartitionData(rowType_, dir, filter, 0, true);
845+
int32_t year = std::stoi(v.substr(0, dashPos));
846+
int32_t month = std::stoi(v.substr(dashPos + 1));
847+
std::string filter = fmt::format(
848+
"YEAR(c_timestamp) = {} AND MONTH(c_timestamp) = {}", year, month);
849+
std::string monthKey = fmt::format("{:04d}-{:02d}", year, month);
850+
auto expectedCount = monthToExpectedCount[monthKey];
851+
verifyPartitionData(rowType_, dir, filter, expectedCount);
836852
}
837853
}
838854

839855
TEST_F(IcebergTransformE2ETest, timestampDayPartitioning) {
840856
constexpr auto numBatches = 2;
841857
constexpr auto rowsPerBatch = 100;
842-
843858
auto batches = createTestData(numBatches, rowsPerBatch);
844-
845859
auto outputDirectory = TempDirectoryPath::create();
846860
auto dataSink = createIcebergDataSink(
847861
rowType_, outputDirectory->getPath(), {"day(c_timestamp)"});
@@ -858,27 +872,55 @@ TEST_F(IcebergTransformE2ETest, timestampDayPartitioning) {
858872

859873
auto dataPath = fmt::format("{}", outputDirectory->getPath());
860874
auto partitionDirs = listFirstLevelDirectories(dataPath);
875+
std::unordered_map<std::string, int32_t> dayToExpectedCount;
876+
for (const auto& batch : batches) {
877+
auto timestampVector = batch->childAt(6)->as<SimpleVector<Timestamp>>();
878+
for (vector_size_t i = 0; i < batch->size(); i++) {
879+
if (!timestampVector->isNullAt(i)) {
880+
Timestamp ts = timestampVector->valueAt(i);
881+
std::tm tm;
882+
if (Timestamp::epochToCalendarUtc(ts.getSeconds(), tm)) {
883+
int32_t year = tm.tm_year + 1900;
884+
int32_t month = tm.tm_mon + 1;
885+
int32_t day = tm.tm_mday;
886+
std::string dayKey =
887+
fmt::format("{:04d}-{:02d}-{:02d}", year, month, day);
888+
dayToExpectedCount[dayKey]++;
889+
}
890+
}
891+
}
892+
}
893+
861894
for (const auto& dir : partitionDirs) {
862895
const auto dirName = std::filesystem::path(dir).filename().string();
863896
auto [c, v] = buildFilter(dirName);
864-
auto daysSince1970 = std::stoi(v);
897+
std::vector<std::string> dateParts;
898+
folly::split('-', v, dateParts);
899+
ASSERT_EQ(dateParts.size(), 3) << "Invalid day format: " << v;
900+
901+
int32_t year = std::stoi(dateParts[0]);
902+
int32_t month = std::stoi(dateParts[1]);
903+
int32_t day = std::stoi(dateParts[2]);
865904

866905
std::string filter = fmt::format(
867-
"c_timestamp >= TIMESTAMP '1970-01-01' + INTERVAL {} DAY AND "
868-
"c_timestamp < TIMESTAMP '1970-01-01' + INTERVAL {} DAY",
869-
daysSince1970,
870-
daysSince1970 + 1);
906+
"YEAR(c_timestamp) = {} AND MONTH(c_timestamp) = {} AND DAY(c_timestamp) = {}",
907+
year,
908+
month,
909+
day);
871910

872-
verifyPartitionData(rowType_, dir, filter, 0, true);
911+
// Get expected count for this day
912+
std::string dayKey = fmt::format("{:04d}-{:02d}-{:02d}", year, month, day);
913+
auto expectedCount = dayToExpectedCount[dayKey];
914+
915+
// Verify partition data with actual row count check
916+
verifyPartitionData(rowType_, dir, filter, expectedCount);
873917
}
874918
}
875919

876920
TEST_F(IcebergTransformE2ETest, timestampHourPartitioning) {
877921
constexpr auto numBatches = 2;
878922
constexpr auto rowsPerBatch = 100;
879-
880923
auto batches = createTestData(numBatches, rowsPerBatch);
881-
882924
auto outputDirectory = TempDirectoryPath::create();
883925
auto dataSink = createIcebergDataSink(
884926
rowType_, outputDirectory->getPath(), {"hour(c_timestamp)"});
@@ -895,18 +937,50 @@ TEST_F(IcebergTransformE2ETest, timestampHourPartitioning) {
895937

896938
auto dataPath = fmt::format("{}", outputDirectory->getPath());
897939
auto partitionDirs = listFirstLevelDirectories(dataPath);
940+
std::unordered_map<std::string, int32_t> hourToExpectedCount;
941+
942+
for (const auto& batch : batches) {
943+
auto timestampVector = batch->childAt(6)->as<SimpleVector<Timestamp>>();
944+
for (vector_size_t i = 0; i < batch->size(); i++) {
945+
if (!timestampVector->isNullAt(i)) {
946+
Timestamp ts = timestampVector->valueAt(i);
947+
std::tm tm;
948+
if (Timestamp::epochToCalendarUtc(ts.getSeconds(), tm)) {
949+
int32_t year = tm.tm_year + 1900;
950+
int32_t month = tm.tm_mon + 1;
951+
int32_t day = tm.tm_mday;
952+
int32_t hour = tm.tm_hour;
953+
std::string hourKey = fmt::format(
954+
"{:04d}-{:02d}-{:02d}-{:02d}", year, month, day, hour);
955+
hourToExpectedCount[hourKey]++;
956+
}
957+
}
958+
}
959+
}
898960

899961
for (const auto& dir : partitionDirs) {
900962
const auto dirName = std::filesystem::path(dir).filename().string();
901963
auto [c, v] = buildFilter(dirName);
902-
auto hoursSince1970 = std::stoi(v);
903-
std::string filter = fmt::format(
904-
"c_timestamp >= TIMESTAMP '1970-01-01' + INTERVAL {} HOUR AND "
905-
"c_timestamp < TIMESTAMP '1970-01-01' + INTERVAL {} HOUR",
906-
hoursSince1970,
907-
hoursSince1970 + 1);
964+
std::vector<std::string> dateParts;
965+
folly::split('-', v, dateParts);
966+
ASSERT_EQ(dateParts.size(), 4) << "Invalid hour format: " << v;
908967

909-
verifyPartitionData(rowType_, dir, filter, 0, true);
968+
int32_t year = std::stoi(dateParts[0]);
969+
int32_t month = std::stoi(dateParts[1]);
970+
int32_t day = std::stoi(dateParts[2]);
971+
int32_t hour = std::stoi(dateParts[3]);
972+
973+
std::string filter = fmt::format(
974+
"YEAR(c_timestamp) = {} AND MONTH(c_timestamp) = {} AND "
975+
"DAY(c_timestamp) = {} AND HOUR(c_timestamp) = {}",
976+
year,
977+
month,
978+
day,
979+
hour);
980+
std::string hourKey =
981+
fmt::format("{:04d}-{:02d}-{:02d}-{:02d}", year, month, day, hour);
982+
auto expectedCount = hourToExpectedCount[hourKey];
983+
verifyPartitionData(rowType_, dir, filter, expectedCount);
910984
}
911985
}
912986

velox/connectors/hive/iceberg/tests/IcebergTransformUnitTest.cpp

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -153,8 +153,16 @@ TEST_F(IcebergTransformUnitTest, testIdentityTransform) {
153153
EXPECT_EQ(varcharTransform.type->kind(), TypeKind::VARCHAR);
154154
testTransform<StringView, StringView>(
155155
varcharTransform,
156-
{StringView("a"), StringView(""), StringView("velox")},
157-
{StringView("a"), StringView(""), StringView("velox")});
156+
{StringView("a"),
157+
StringView(""),
158+
StringView("velox"),
159+
StringView(
160+
"Velox is a composable execution engine distributed as an open source C++ library. It provides reusable, extensible, and high-performance data processing components that can be (re-)used to build data management systems focused on different analytical workloads, including batch, interactive, stream processing, and AI/ML. Velox was created by Meta and it is currently developed in partnership with IBM/Ahana, Intel, Voltron Data, Microsoft, ByteDance and many other companies.")},
161+
{StringView("a"),
162+
StringView(""),
163+
StringView("velox"),
164+
StringView(
165+
"Velox is a composable execution engine distributed as an open source C++ library. It provides reusable, extensible, and high-performance data processing components that can be (re-)used to build data management systems focused on different analytical workloads, including batch, interactive, stream processing, and AI/ML. Velox was created by Meta and it is currently developed in partnership with IBM/Ahana, Intel, Voltron Data, Microsoft, ByteDance and many other companies.")});
158166

159167
auto& varbinaryTransform = partitionSpec->fields[3];
160168
EXPECT_EQ(varbinaryTransform.transformType, TransformType::kIdentity);

0 commit comments

Comments
 (0)