Skip to content

Commit 000d968

Browse files
committed
Add new unittest for partition folder name
1 parent 43abe26 commit 000d968

File tree

2 files changed

+312
-0
lines changed

2 files changed

+312
-0
lines changed

velox/connectors/hive/iceberg/tests/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ if(NOT VELOX_DISABLE_GOOGLETEST)
4242
velox_hive_iceberg_test
4343
ColumnTransformTest.cpp
4444
IcebergInsertTest.cpp
45+
IcebergPartitionIdGeneratorTest.cpp
4546
IcebergReadTest.cpp
4647
IcebergSplitReaderBenchmarkTest.cpp
4748
IcebergTestBase.cpp
Lines changed: 311 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,311 @@
1+
/*
2+
* Copyright (c) Facebook, Inc. and its affiliates.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#include "velox/connectors/hive/iceberg/IcebergPartitionIdGenerator.h"
18+
#include "velox/connectors/hive/iceberg/ColumnTransform.h"
19+
#include "velox/connectors/hive/iceberg/Transforms.h"
20+
#include "velox/connectors/hive/iceberg/tests/IcebergTestBase.h"
21+
22+
using namespace facebook::velox;
23+
24+
namespace facebook::velox::connector::hive::iceberg::test {
25+
26+
class IcebergPartitionIdGeneratorTest : public IcebergTestBase {
27+
protected:
28+
std::vector<ColumnTransform> createColumnTransforms(
29+
const std::vector<std::string>& columnNames,
30+
const std::vector<TypePtr>& types,
31+
const std::vector<TransformType>& transformTypes,
32+
const std::vector<std::optional<int32_t>>& parameters = {}) {
33+
std::vector<IcebergPartitionSpec::Field> fields;
34+
fields.reserve(columnNames.size());
35+
36+
for (size_t i = 0; i < columnNames.size(); ++i) {
37+
std::optional<int32_t> parameter =
38+
parameters.size() > i ? parameters[i] : std::nullopt;
39+
40+
fields.emplace_back(
41+
columnNames[i], types[i], transformTypes[i], parameter);
42+
}
43+
44+
return parsePartitionTransformSpecs(fields, pool_.get());
45+
}
46+
47+
std::unique_ptr<IcebergPartitionIdGenerator> createGenerator(
48+
const std::vector<ColumnTransform>& transforms,
49+
bool partitionPathAsLowerCase = false) {
50+
std::vector<column_index_t> partitionChannels;
51+
for (size_t i = 0; i < transforms.size(); ++i) {
52+
partitionChannels.push_back(i);
53+
}
54+
55+
return std::make_unique<IcebergPartitionIdGenerator>(
56+
partitionChannels,
57+
128,
58+
pool_.get(),
59+
transforms,
60+
partitionPathAsLowerCase);
61+
}
62+
63+
RowVectorPtr createRowVector(
64+
const std::vector<std::string>& names,
65+
const std::vector<VectorPtr>& children) {
66+
std::vector<TypePtr> types;
67+
for (const auto& child : children) {
68+
types.push_back(child->type());
69+
}
70+
71+
return makeRowVector(names, children);
72+
}
73+
74+
void verifyPartitionComponents(
75+
const std::string& partitionName,
76+
const std::vector<std::string>& expectedComponents) {
77+
std::vector<std::string> actualComponents;
78+
folly::split('/', partitionName, actualComponents);
79+
ASSERT_EQ(actualComponents.size(), expectedComponents.size());
80+
for (size_t i = 0; i < expectedComponents.size(); ++i) {
81+
ASSERT_EQ(actualComponents[i], expectedComponents[i]);
82+
}
83+
}
84+
};
85+
86+
// Checks the partition name produced for a single row when every column uses
// the identity transform. Covers integer, bigint, varchar, decimal, boolean
// and date columns.
TEST_F(IcebergPartitionIdGeneratorTest, partitionNameWithIdentityTransforms) {
  const std::vector<std::string> names = {
      "c_int", "c_bigint", "c_varchar", "c_decimal", "c_bool", "c_date"};
  const std::vector<TypePtr> columnTypes = {
      INTEGER(), BIGINT(), VARCHAR(), DECIMAL(18, 4), BOOLEAN(), DATE()};

  auto intColumn = makeFlatVector<int32_t>(1, [](auto /*row*/) { return 42; });
  auto bigintColumn = makeFlatVector<int64_t>(
      1, [](auto /*row*/) { return 9'876'543'210; });

  auto varcharColumn =
      BaseVector::create<FlatVector<StringView>>(VARCHAR(), 1, opPool_.get());
  varcharColumn->set(0, StringView("test string"));

  // Stored as the unscaled value of a DECIMAL(18, 4).
  auto decimalColumn =
      BaseVector::create<FlatVector<int64_t>>(DECIMAL(18, 4), 1, opPool_.get());
  decimalColumn->set(0, 12'345'678'901'234);

  auto boolColumn = makeFlatVector<bool>(1, [](auto /*row*/) { return true; });

  auto dateColumn =
      BaseVector::create<FlatVector<int32_t>>(DATE(), 1, opPool_.get());
  dateColumn->set(0, 18'262);

  const std::vector<VectorPtr> children = {
      intColumn,
      bigintColumn,
      varcharColumn,
      decimalColumn,
      boolColumn,
      dateColumn};
  auto input = createRowVector(names, children);

  const std::vector<TransformType> identityTransforms(
      names.size(), TransformType::kIdentity);
  auto columnTransforms =
      createColumnTransforms(names, columnTypes, identityTransforms);
  auto idGenerator = createGenerator(columnTransforms);

  raw_vector<uint64_t> ids(1);
  idGenerator->run(input, ids);
  const std::string actualName = idGenerator->partitionName(ids[0], "null");

  verifyPartitionComponents(
      actualName,
      {"c_int=42",
       "c_bigint=9876543210",
       "c_varchar=test+string",
       "c_decimal=1234567890.1234",
       "c_bool=true",
       "c_date=2020-01-01"});
}
132+
133+
// Checks the partition name for a single row when each column uses a
// different transform: bucket on the integer, truncate on the bigint and the
// varchar, and year on the timestamp.
TEST_F(IcebergPartitionIdGeneratorTest, partitionNameWithMixedTransforms) {
  const std::vector<std::string> names = {
      "c_int", "c_bigint", "c_varchar", "c_timestamp"};
  const std::vector<TypePtr> columnTypes = {
      INTEGER(), BIGINT(), VARCHAR(), TIMESTAMP()};

  auto intColumn = makeFlatVector<int32_t>(1, [](auto /*row*/) { return 42; });
  auto bigintColumn = makeFlatVector<int64_t>(
      1, [](auto /*row*/) { return 9'876'543'210; });
  auto varcharColumn = makeFlatVector<StringView>({"test string"});
  auto timestampColumn = makeFlatVector<Timestamp>(
      1, [](auto /*row*/) { return Timestamp(1'577'836'800, 0); });

  const std::vector<VectorPtr> children = {
      intColumn, bigintColumn, varcharColumn, timestampColumn};
  auto input = createRowVector(names, children);

  // One transform and one (optional) parameter per column, in column order.
  const std::vector<TransformType> columnTransformTypes = {
      TransformType::kBucket,
      TransformType::kTruncate,
      TransformType::kTruncate,
      TransformType::kYear};
  const std::vector<std::optional<int32_t>> transformParameters = {
      4, 1'000, 5, std::nullopt};

  auto columnTransforms = createColumnTransforms(
      names, columnTypes, columnTransformTypes, transformParameters);
  auto idGenerator = createGenerator(columnTransforms);

  raw_vector<uint64_t> ids(1);
  idGenerator->run(input, ids);
  const std::string actualName = idGenerator->partitionName(ids[0], "null");

  verifyPartitionComponents(
      actualName,
      {"c_int_bucket=2",
       "c_bigint_trunc=9876543000",
       "c_varchar_trunc=test+",
       "c_timestamp_year=50"});
}
173+
174+
// Checks that a null partition value is rendered with the null-value string
// passed to partitionName() ("null" here), for bucket, truncate and identity
// transforms alike.
TEST_F(IcebergPartitionIdGeneratorTest, partitionNameWithNullValues) {
  auto intVector = makeNullableFlatVector<int32_t>({std::nullopt});
  auto varcharVector = makeNullableFlatVector<StringView>({std::nullopt});
  auto decimalVector =
      BaseVector::create<FlatVector<int64_t>>(DECIMAL(18, 4), 1, opPool_.get());
  decimalVector->setNull(0, true);

  std::vector<std::string> columnNames = {"c_int", "c_varchar", "c_decimal"};
  std::vector<VectorPtr> columns = {intVector, varcharVector, decimalVector};
  // The declared decimal type must match the type of 'decimalVector' above so
  // the transform is applied to a column of the type it was created for.
  std::vector<TypePtr> types = {INTEGER(), VARCHAR(), DECIMAL(18, 4)};
  auto rowVector = createRowVector(columnNames, columns);

  std::vector<TransformType> transformTypes = {
      TransformType::kBucket,
      TransformType::kTruncate,
      TransformType::kIdentity};
  std::vector<std::optional<int32_t>> parameters = {4, 1'000, std::nullopt};
  auto transforms =
      createColumnTransforms(columnNames, types, transformTypes, parameters);
  auto generator = createGenerator(transforms);
  raw_vector<uint64_t> partitionIds(1);
  generator->run(rowVector, partitionIds);

  std::string partitionName = generator->partitionName(partitionIds[0], "null");
  std::vector<std::string> expectedComponents = {
      "c_int_bucket=null", "c_varchar_trunc=null", "c_decimal=null"};
  verifyPartitionComponents(partitionName, expectedComponents);
}
202+
203+
// Checks the effect of the partitionPathAsLowerCase flag on a mixed-case
// column: with the flag set the column name is lower-cased while the value
// keeps its case; without it both keep their original case.
TEST_F(IcebergPartitionIdGeneratorTest, partitionNameWithLowerCase) {
  const std::vector<std::string> names = {"MiXeD_CoLuMn"};
  const std::vector<TypePtr> columnTypes = {VARCHAR()};
  const std::vector<TransformType> identityTransform = {
      TransformType::kIdentity};

  auto valueColumn = makeFlatVector<StringView>({"MiXeD_CaSe"});
  const std::vector<VectorPtr> children = {valueColumn};
  auto input = createRowVector(names, children);
  auto columnTransforms =
      createColumnTransforms(names, columnTypes, identityTransform);

  raw_vector<uint64_t> ids(1);

  // Runs a fresh generator over the same input and returns the partition name
  // of the single row.
  auto partitionNameFor = [&](bool lowerCase) -> std::string {
    auto idGenerator = createGenerator(columnTransforms, lowerCase);
    idGenerator->run(input, ids);
    return idGenerator->partitionName(ids[0], "null");
  };

  verifyPartitionComponents(
      partitionNameFor(true), {"mixed_column=MiXeD_CaSe"});
  verifyPartitionComponents(
      partitionNameFor(false), {"MiXeD_CoLuMn=MiXeD_CaSe"});
}
224+
225+
// Checks how partition values containing special characters are encoded in
// the partition name. Each test case is {raw value, expected encoded value}.
TEST_F(IcebergPartitionIdGeneratorTest, urlEncodingForSpecialChars) {
  std::vector<std::pair<std::string, std::string>> testCases = {
      {"space test", "space+test"},
      {"slash/test", "slash%2Ftest"},
      {"question?test", "question%3Ftest"},
      {"percent%test", "percent%25test"},
      {"hash#test", "hash%23test"},
      {"ampersand&test", "ampersand%26test"},
      {"equals=test", "equals%3Dtest"},
      {"plus+test", "plus%2Btest"},
      {"comma,test", "comma%2Ctest"},
      {"semicolon;test", "semicolon%3Btest"},
      {"at@test", "at%40test"},
      {"dollar$test", "dollar%24test"},
      {"backslash\\test", "backslash%5Ctest"},
      {"quote\"test", "quote%22test"},
      {"apostrophe'test", "apostrophe%27test"},
      {"less<than", "less%3Cthan"},
      {"greater>than", "greater%3Ethan"},
      {"colon:test", "colon%3Atest"},
      {"pipe|test", "pipe%7Ctest"},
      {"bracket[test", "bracket%5Btest"},
      {"bracket]test", "bracket%5Dtest"},
      {"brace{test", "brace%7Btest"},
      {"brace}test", "brace%7Dtest"},
      {"caret^test", "caret%5Etest"},
      {"tilde~test", "tilde%7Etest"},
      {"backtick`test", "backtick%60test"},
      {"unicode\u00A9test", "unicode%C2%A9test"},
      {"email@example.com", "email%40example.com"},
      {"user:password@host:port/path", "user%3Apassword%40host%3Aport%2Fpath"},
      {"https://github.ibm.com/IBM/velox",
       "https%3A%2F%2Fgithub.ibm.com%2FIBM%2Fvelox"},
      {"a+b=c&d=e+f", "a%2Bb%3Dc%26d%3De%2Bf"},
      {"special!@#$%^&*()_+", "special%21%40%23%24%25%5E%26*%28%29_%2B"},
  };

  // The column layout and its transform are the same for every test case, so
  // they are built once.
  const std::vector<std::string> columnNames = {"ColumnWithSpecialChars"};
  const std::vector<TypePtr> types = {VARCHAR()};
  const std::vector<TransformType> transformTypes = {TransformType::kIdentity};
  const auto transforms =
      createColumnTransforms(columnNames, types, transformTypes);

  for (const auto& [input, expectedEncoded] : testCases) {
    auto varcharVector = makeFlatVector<StringView>({StringView(input)});
    auto rowVector = createRowVector(columnNames, {varcharVector});

    // A new generator per case keeps the cases independent of each other.
    auto generator = createGenerator(transforms);
    raw_vector<uint64_t> partitionIds(1);
    generator->run(rowVector, partitionIds);

    std::string partitionName =
        generator->partitionName(partitionIds[0], "null");
    std::string expectedComponent =
        fmt::format("{}={}", columnNames[0], expectedEncoded);
    // Name the offending input so a failure identifies the test case.
    ASSERT_EQ(partitionName, expectedComponent) << "input: " << input;
  }
}
283+
284+
// Checks that a three-row input with distinct values in every row yields one
// partition name per row, each combining both identity-transformed columns.
TEST_F(IcebergPartitionIdGeneratorTest, multipleRows) {
  const std::vector<std::string> names = {"c_int", "c_varchar"};
  const std::vector<TypePtr> columnTypes = {INTEGER(), VARCHAR()};

  auto intColumn = makeFlatVector<int32_t>({10, 20, 30});
  auto varcharColumn =
      makeFlatVector<StringView>({"value1", "value2", "value3"});
  auto input = createRowVector(names, {intColumn, varcharColumn});

  const std::vector<TransformType> identityTransforms(
      names.size(), TransformType::kIdentity);
  auto columnTransforms =
      createColumnTransforms(names, columnTypes, identityTransforms);
  auto idGenerator = createGenerator(columnTransforms);

  raw_vector<uint64_t> ids(3);
  idGenerator->run(input, ids);

  // Expected partition name of each input row, in row order.
  const std::vector<std::string> expectedNames = {
      "c_int=10/c_varchar=value1",
      "c_int=20/c_varchar=value2",
      "c_int=30/c_varchar=value3"};

  for (size_t row = 0; row < expectedNames.size(); ++row) {
    const std::string actualName = idGenerator->partitionName(ids[row], "null");
    ASSERT_EQ(actualName, expectedNames[row]);
  }
}
310+
311+
} // namespace facebook::velox::connector::hive::iceberg::test

0 commit comments

Comments
 (0)