Skip to content

Commit e8badd7

Browse files
committed
Add unittest for partition folder name
1 parent bbba59c commit e8badd7

File tree

1 file changed

+131
-0
lines changed

1 file changed

+131
-0
lines changed

velox/connectors/hive/iceberg/tests/IcebergTransformE2ETest.cpp

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,21 @@ class IcebergTransformE2ETest : public IcebergTestBase {
157157
return partitionDirs;
158158
}
159159

160+
std::vector<std::string> listDirectoriesRecursively(const std::string& path) {
161+
std::vector<std::string> allDirs;
162+
auto firstLevelDirs = listFirstLevelDirectories(path);
163+
allDirs.insert(allDirs.end(), firstLevelDirs.begin(), firstLevelDirs.end());
164+
165+
for (const auto& dir : firstLevelDirs) {
166+
if (std::filesystem::is_directory(dir)) {
167+
auto subDirs = listDirectoriesRecursively(dir);
168+
allDirs.insert(allDirs.end(), subDirs.begin(), subDirs.end());
169+
}
170+
}
171+
172+
return allDirs;
173+
}
174+
160175
// Verify the number of partitions and their naming convention.
161176
void verifyPartitionCount(
162177
const std::string& outputPath,
@@ -895,4 +910,120 @@ TEST_F(IcebergTransformE2ETest, timestampHourPartitioning) {
895910
}
896911
}
897912

913+
// Writes a single row partitioned on int, bigint, varchar, decimal, varbinary
// and null-varchar columns, then checks that a "<column>=<value>" directory
// with the expected rendering of each value exists under the output path and
// that the data below it matches the corresponding filter.
TEST_F(IcebergTransformE2ETest, partitionFolderNamingConventions) {
  auto intVector = makeFlatVector<int32_t>(1, [](auto) { return 42; });
  auto bigintVector =
      makeFlatVector<int64_t>(1, [](auto) { return 9876543210; });
  auto varcharVector =
      BaseVector::create<FlatVector<StringView>>(VARCHAR(), 1, opPool_.get());
  varcharVector->set(0, StringView("test string"));

  // Partition column whose only value is null.
  auto varcharVector2 =
      BaseVector::create<FlatVector<StringView>>(VARCHAR(), 1, opPool_.get());
  varcharVector2->setNull(0, true);

  // Unscaled value 1234567890 with scale 3, i.e. 1234567.890.
  auto decimalVector =
      BaseVector::create<FlatVector<int64_t>>(DECIMAL(18, 3), 1, opPool_.get());
  decimalVector->set(0, 1234567890);

  auto varbinaryVector =
      BaseVector::create<FlatVector<StringView>>(VARBINARY(), 1, opPool_.get());
  // The payload contains an embedded NUL followed by control bytes. The length
  // is passed explicitly because std::string(const char*) stops at the first
  // '\0' and would silently reduce the value to just "binary".
  static constexpr char kBinaryBytes[] = "binary\0data\1\2\3";
  const std::string binaryData(kBinaryBytes, sizeof(kBinaryBytes) - 1);
  varbinaryVector->set(0, StringView(binaryData));

  auto rowVector = makeRowVector(
      {"c_int",
       "c_bigint",
       "c_varchar",
       "c_varchar2",
       "c_decimal",
       "c_varbinary"},
      {intVector,
       bigintVector,
       varcharVector,
       varcharVector2,
       decimalVector,
       varbinaryVector});
  const auto rowType = asRowType(rowVector->type());

  auto outputDirectory = TempDirectoryPath::create();
  auto dataSink = createIcebergDataSink(
      rowType,
      outputDirectory->getPath(),
      {"c_int",
       "c_bigint",
       "c_varchar",
       "c_decimal",
       "c_varbinary",
       "c_varchar2"});

  dataSink->appendData(rowVector);
  ASSERT_TRUE(dataSink->finish());
  dataSink->close();

  verifyTotalRowCount(rowType, outputDirectory->getPath(), 1);
  const auto& dataPath = outputDirectory->getPath();
  auto partitionDirs = listDirectoriesRecursively(dataPath);

  const std::string expectedIntFolder = "c_int=42";
  const std::string expectedBigintFolder = "c_bigint=9876543210";
  const std::string expectedVarcharFolder = "c_varchar=test+string";
  const std::string expectedVarcharFolder2 = "c_varchar2=null";
  const std::string expectedDecimalFolder = "c_decimal=1234567.890";

  // The varbinary folder is matched by prefix. Trailing '=' padding is dropped
  // from the expected Base64 text so the check does not depend on how the
  // writer renders padding characters in a folder name.
  // NOTE(review): confirm against the writer whether '=' is escaped.
  std::string encodedBinary =
      encoding::Base64::encode(binaryData.data(), binaryData.size());
  encodedBinary.erase(encodedBinary.find_last_not_of('=') + 1);
  const std::string expectedVarbinary = "c_varbinary=" + encodedBinary;

  bool foundIntPartition = false;
  bool foundBigintPartition = false;
  bool foundVarcharPartition = false;
  bool foundVarcharPartition2 = false;
  bool foundDecimalPartition = false;
  bool foundVarbinaryPartition = false;

  for (const auto& dir : partitionDirs) {
    // Only the last path component carries the "<column>=<value>" name.
    const auto dirName = std::filesystem::path(dir).filename().string();

    if (dirName == expectedIntFolder) {
      foundIntPartition = true;
      verifyPartitionData(rowType, dir, "c_int = 42", 1);
    } else if (dirName == expectedBigintFolder) {
      foundBigintPartition = true;
      verifyPartitionData(rowType, dir, "c_bigint = 9876543210", 1);
    } else if (dirName == expectedVarcharFolder) {
      foundVarcharPartition = true;
      verifyPartitionData(rowType, dir, "c_varchar = 'test string'", 1);
    } else if (dirName == expectedVarcharFolder2) {
      foundVarcharPartition2 = true;
      verifyPartitionData(rowType, dir, "c_varchar2 IS NULL", 1);
    } else if (dirName == expectedDecimalFolder) {
      foundDecimalPartition = true;
      verifyPartitionData(
          rowType, dir, "c_decimal = DECIMAL '1234567.890'", 1);
    } else if (
        dirName.compare(0, expectedVarbinary.size(), expectedVarbinary) == 0) {
      // Bounded prefix comparison: never scans past the expected length.
      foundVarbinaryPartition = true;
      verifyPartitionData(rowType, dir, "c_varbinary IS NOT NULL", 1);
    }
  }

  ASSERT_TRUE(foundIntPartition)
      << "Integer partition folder not found: " << expectedIntFolder;
  ASSERT_TRUE(foundBigintPartition)
      << "Bigint partition folder not found: " << expectedBigintFolder;
  ASSERT_TRUE(foundVarcharPartition)
      << "Varchar partition folder not found: " << expectedVarcharFolder;
  ASSERT_TRUE(foundVarcharPartition2)
      << "Varchar2 partition folder not found: " << expectedVarcharFolder2;
  ASSERT_TRUE(foundDecimalPartition)
      << "Decimal partition folder not found: " << expectedDecimalFolder;
  ASSERT_TRUE(foundVarbinaryPartition)
      << "Varbinary partition folder not found with prefix: "
      << expectedVarbinary;
}
1028+
8981029
} // namespace facebook::velox::connector::hive::iceberg::test

0 commit comments

Comments
 (0)