| 
 | 1 | +/*  | 
 | 2 | + * Copyright (c) Facebook, Inc. and its affiliates.  | 
 | 3 | + *  | 
 | 4 | + * Licensed under the Apache License, Version 2.0 (the "License");  | 
 | 5 | + * you may not use this file except in compliance with the License.  | 
 | 6 | + * You may obtain a copy of the License at  | 
 | 7 | + *  | 
 | 8 | + *     http://www.apache.org/licenses/LICENSE-2.0  | 
 | 9 | + *  | 
 | 10 | + * Unless required by applicable law or agreed to in writing, software  | 
 | 11 | + * distributed under the License is distributed on an "AS IS" BASIS,  | 
 | 12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  | 
 | 13 | + * See the License for the specific language governing permissions and  | 
 | 14 | + * limitations under the License.  | 
 | 15 | + */  | 
 | 16 | + | 
 | 17 | +#include "velox/connectors/hive/iceberg/IcebergPartitionIdGenerator.h"  | 
 | 18 | +#include "velox/connectors/hive/iceberg/ColumnTransform.h"  | 
 | 19 | +#include "velox/connectors/hive/iceberg/Transforms.h"  | 
 | 20 | +#include "velox/connectors/hive/iceberg/tests/IcebergTestBase.h"  | 
 | 21 | + | 
 | 22 | +using namespace facebook::velox;  | 
 | 23 | + | 
 | 24 | +namespace facebook::velox::connector::hive::iceberg::test {  | 
 | 25 | + | 
 | 26 | +class IcebergPartitionIdGeneratorTest : public IcebergTestBase {  | 
 | 27 | + protected:  | 
 | 28 | +  std::vector<ColumnTransform> createColumnTransforms(  | 
 | 29 | +      const std::vector<std::string>& columnNames,  | 
 | 30 | +      const std::vector<TypePtr>& types,  | 
 | 31 | +      const std::vector<TransformType>& transformTypes,  | 
 | 32 | +      const std::vector<std::optional<int32_t>>& parameters = {}) {  | 
 | 33 | +    std::vector<IcebergPartitionSpec::Field> fields;  | 
 | 34 | +    fields.reserve(columnNames.size());  | 
 | 35 | + | 
 | 36 | +    for (size_t i = 0; i < columnNames.size(); ++i) {  | 
 | 37 | +      std::optional<int32_t> parameter =  | 
 | 38 | +          parameters.size() > i ? parameters[i] : std::nullopt;  | 
 | 39 | + | 
 | 40 | +      fields.emplace_back(  | 
 | 41 | +          columnNames[i], types[i], transformTypes[i], parameter);  | 
 | 42 | +    }  | 
 | 43 | + | 
 | 44 | +    return parsePartitionTransformSpecs(fields, pool_.get());  | 
 | 45 | +  }  | 
 | 46 | + | 
 | 47 | +  std::unique_ptr<IcebergPartitionIdGenerator> createGenerator(  | 
 | 48 | +      const std::vector<ColumnTransform>& transforms,  | 
 | 49 | +      bool partitionPathAsLowerCase = false) {  | 
 | 50 | +    std::vector<column_index_t> partitionChannels;  | 
 | 51 | +    for (size_t i = 0; i < transforms.size(); ++i) {  | 
 | 52 | +      partitionChannels.push_back(i);  | 
 | 53 | +    }  | 
 | 54 | + | 
 | 55 | +    return std::make_unique<IcebergPartitionIdGenerator>(  | 
 | 56 | +        partitionChannels,  | 
 | 57 | +        128,  | 
 | 58 | +        pool_.get(),  | 
 | 59 | +        transforms,  | 
 | 60 | +        partitionPathAsLowerCase);  | 
 | 61 | +  }  | 
 | 62 | + | 
 | 63 | +  RowVectorPtr createRowVector(  | 
 | 64 | +      const std::vector<std::string>& names,  | 
 | 65 | +      const std::vector<VectorPtr>& children) {  | 
 | 66 | +    std::vector<TypePtr> types;  | 
 | 67 | +    for (const auto& child : children) {  | 
 | 68 | +      types.push_back(child->type());  | 
 | 69 | +    }  | 
 | 70 | + | 
 | 71 | +    return makeRowVector(names, children);  | 
 | 72 | +  }  | 
 | 73 | + | 
 | 74 | +  void verifyPartitionComponents(  | 
 | 75 | +      const std::string& partitionName,  | 
 | 76 | +      const std::vector<std::string>& expectedComponents) {  | 
 | 77 | +    std::vector<std::string> actualComponents;  | 
 | 78 | +    folly::split('/', partitionName, actualComponents);  | 
 | 79 | +    ASSERT_EQ(actualComponents.size(), expectedComponents.size());  | 
 | 80 | +    for (size_t i = 0; i < expectedComponents.size(); ++i) {  | 
 | 81 | +      ASSERT_EQ(actualComponents[i], expectedComponents[i]);  | 
 | 82 | +    }  | 
 | 83 | +  }  | 
 | 84 | +};  | 
 | 85 | + | 
 | 86 | +TEST_F(IcebergPartitionIdGeneratorTest, partitionNameWithIdentityTransforms) {  | 
 | 87 | +  auto intVector = makeFlatVector<int32_t>(1, [](auto) { return 42; });  | 
 | 88 | +  auto bigintVector =  | 
 | 89 | +      makeFlatVector<int64_t>(1, [](auto) { return 9'876'543'210; });  | 
 | 90 | +  auto varcharVector =  | 
 | 91 | +      BaseVector::create<FlatVector<StringView>>(VARCHAR(), 1, opPool_.get());  | 
 | 92 | +  varcharVector->set(0, StringView("test string"));  | 
 | 93 | +  auto decimalVector =  | 
 | 94 | +      BaseVector::create<FlatVector<int64_t>>(DECIMAL(18, 4), 1, opPool_.get());  | 
 | 95 | +  decimalVector->set(0, 12'345'678'901'234);  | 
 | 96 | +  auto boolVector = makeFlatVector<bool>(1, [](auto) { return true; });  | 
 | 97 | +  auto dateVector =  | 
 | 98 | +      BaseVector::create<FlatVector<int32_t>>(DATE(), 1, opPool_.get());  | 
 | 99 | +  dateVector->set(0, 18'262);  | 
 | 100 | + | 
 | 101 | +  std::vector<std::string> columnNames = {  | 
 | 102 | +      "c_int", "c_bigint", "c_varchar", "c_decimal", "c_bool", "c_date"};  | 
 | 103 | + | 
 | 104 | +  std::vector<VectorPtr> columns = {  | 
 | 105 | +      intVector,  | 
 | 106 | +      bigintVector,  | 
 | 107 | +      varcharVector,  | 
 | 108 | +      decimalVector,  | 
 | 109 | +      boolVector,  | 
 | 110 | +      dateVector};  | 
 | 111 | + | 
 | 112 | +  std::vector<TypePtr> types = {  | 
 | 113 | +      INTEGER(), BIGINT(), VARCHAR(), DECIMAL(18, 4), BOOLEAN(), DATE()};  | 
 | 114 | +  auto rowVector = createRowVector(columnNames, columns);  | 
 | 115 | +  std::vector<TransformType> transformTypes(  | 
 | 116 | +      columnNames.size(), TransformType::kIdentity);  | 
 | 117 | +  auto transforms = createColumnTransforms(columnNames, types, transformTypes);  | 
 | 118 | +  auto generator = createGenerator(transforms);  | 
 | 119 | +  raw_vector<uint64_t> partitionIds(1);  | 
 | 120 | +  generator->run(rowVector, partitionIds);  | 
 | 121 | + | 
 | 122 | +  std::string partitionName = generator->partitionName(partitionIds[0], "null");  | 
 | 123 | +  std::vector<std::string> expectedComponents = {  | 
 | 124 | +      "c_int=42",  | 
 | 125 | +      "c_bigint=9876543210",  | 
 | 126 | +      "c_varchar=test+string",  | 
 | 127 | +      "c_decimal=1234567890.1234",  | 
 | 128 | +      "c_bool=true",  | 
 | 129 | +      "c_date=2020-01-01"};  | 
 | 130 | +  verifyPartitionComponents(partitionName, expectedComponents);  | 
 | 131 | +}  | 
 | 132 | + | 
 | 133 | +TEST_F(IcebergPartitionIdGeneratorTest, partitionNameWithMixedTransforms) {  | 
 | 134 | +  auto intVector = makeFlatVector<int32_t>(1, [](auto) { return 42; });  | 
 | 135 | +  auto bigintVector =  | 
 | 136 | +      makeFlatVector<int64_t>(1, [](auto) { return 9'876'543'210; });  | 
 | 137 | +  auto varcharVector = makeFlatVector<StringView>({"test string"});  | 
 | 138 | +  auto timestampVector = makeFlatVector<Timestamp>(  | 
 | 139 | +      1, [](auto) { return Timestamp(1'577'836'800, 0); });  | 
 | 140 | + | 
 | 141 | +  std::vector<std::string> columnNames = {  | 
 | 142 | +      "c_int", "c_bigint", "c_varchar", "c_timestamp"};  | 
 | 143 | + | 
 | 144 | +  std::vector<VectorPtr> columns = {  | 
 | 145 | +      intVector, bigintVector, varcharVector, timestampVector};  | 
 | 146 | + | 
 | 147 | +  std::vector<TypePtr> types = {INTEGER(), BIGINT(), VARCHAR(), TIMESTAMP()};  | 
 | 148 | + | 
 | 149 | +  auto rowVector = createRowVector(columnNames, columns);  | 
 | 150 | + | 
 | 151 | +  std::vector<TransformType> transformTypes = {  | 
 | 152 | +      TransformType::kBucket,  | 
 | 153 | +      TransformType::kTruncate,  | 
 | 154 | +      TransformType::kTruncate,  | 
 | 155 | +      TransformType::kYear};  | 
 | 156 | + | 
 | 157 | +  std::vector<std::optional<int32_t>> parameters = {4, 1'000, 5, std::nullopt};  | 
 | 158 | +  auto transforms =  | 
 | 159 | +      createColumnTransforms(columnNames, types, transformTypes, parameters);  | 
 | 160 | + | 
 | 161 | +  auto generator = createGenerator(transforms);  | 
 | 162 | +  raw_vector<uint64_t> partitionIds(1);  | 
 | 163 | +  generator->run(rowVector, partitionIds);  | 
 | 164 | + | 
 | 165 | +  std::string partitionName = generator->partitionName(partitionIds[0], "null");  | 
 | 166 | +  std::vector<std::string> expectedComponents = {  | 
 | 167 | +      "c_int_bucket=2",  | 
 | 168 | +      "c_bigint_trunc=9876543000",  | 
 | 169 | +      "c_varchar_trunc=test+",  | 
 | 170 | +      "c_timestamp_year=50"};  | 
 | 171 | +  verifyPartitionComponents(partitionName, expectedComponents);  | 
 | 172 | +}  | 
 | 173 | + | 
 | 174 | +TEST_F(IcebergPartitionIdGeneratorTest, partitionNameWithNullValues) {  | 
 | 175 | +  auto intVector = makeNullableFlatVector<int32_t>({std::nullopt});  | 
 | 176 | +  auto varcharVector = makeNullableFlatVector<StringView>({std::nullopt});  | 
 | 177 | +  auto decimalVector =  | 
 | 178 | +      BaseVector::create<FlatVector<int64_t>>(DECIMAL(18, 4), 1, opPool_.get());  | 
 | 179 | +  decimalVector->setNull(0, true);  | 
 | 180 | + | 
 | 181 | +  std::vector<std::string> columnNames = {"c_int", "c_varchar", "c_decimal"};  | 
 | 182 | +  std::vector<VectorPtr> columns = {intVector, varcharVector, decimalVector};  | 
 | 183 | +  std::vector<TypePtr> types = {INTEGER(), VARCHAR(), DECIMAL(18, 3)};  | 
 | 184 | +  auto rowVector = createRowVector(columnNames, columns);  | 
 | 185 | + | 
 | 186 | +  std::vector<TransformType> transformTypes = {  | 
 | 187 | +      TransformType::kBucket,  | 
 | 188 | +      TransformType::kTruncate,  | 
 | 189 | +      TransformType::kIdentity};  | 
 | 190 | +  std::vector<std::optional<int32_t>> parameters = {4, 1'000, std::nullopt};  | 
 | 191 | +  auto transforms =  | 
 | 192 | +      createColumnTransforms(columnNames, types, transformTypes, parameters);  | 
 | 193 | +  auto generator = createGenerator(transforms);  | 
 | 194 | +  raw_vector<uint64_t> partitionIds(1);  | 
 | 195 | +  generator->run(rowVector, partitionIds);  | 
 | 196 | + | 
 | 197 | +  std::string partitionName = generator->partitionName(partitionIds[0], "null");  | 
 | 198 | +  std::vector<std::string> expectedComponents = {  | 
 | 199 | +      "c_int_bucket=null", "c_varchar_trunc=null", "c_decimal=null"};  | 
 | 200 | +  verifyPartitionComponents(partitionName, expectedComponents);  | 
 | 201 | +}  | 
 | 202 | + | 
 | 203 | +TEST_F(IcebergPartitionIdGeneratorTest, partitionNameWithLowerCase) {  | 
 | 204 | +  auto varcharVector = makeFlatVector<StringView>({"MiXeD_CaSe"});  | 
 | 205 | +  std::vector<std::string> columnNames = {"MiXeD_CoLuMn"};  | 
 | 206 | +  std::vector<VectorPtr> columns = {varcharVector};  | 
 | 207 | +  std::vector<TypePtr> types = {VARCHAR()};  | 
 | 208 | +  auto rowVector = createRowVector(columnNames, columns);  | 
 | 209 | +  std::vector<TransformType> transformTypes = {TransformType::kIdentity};  | 
 | 210 | +  auto transforms = createColumnTransforms(columnNames, types, transformTypes);  | 
 | 211 | +  auto generator = createGenerator(transforms, true);  | 
 | 212 | +  raw_vector<uint64_t> partitionIds(1);  | 
 | 213 | +  generator->run(rowVector, partitionIds);  | 
 | 214 | +  std::string partitionName = generator->partitionName(partitionIds[0], "null");  | 
 | 215 | +  std::vector<std::string> expectedComponents = {"mixed_column=MiXeD_CaSe"};  | 
 | 216 | +  verifyPartitionComponents(partitionName, expectedComponents);  | 
 | 217 | + | 
 | 218 | +  generator = createGenerator(transforms);  | 
 | 219 | +  generator->run(rowVector, partitionIds);  | 
 | 220 | +  partitionName = generator->partitionName(partitionIds[0], "null");  | 
 | 221 | +  expectedComponents = {"MiXeD_CoLuMn=MiXeD_CaSe"};  | 
 | 222 | +  verifyPartitionComponents(partitionName, expectedComponents);  | 
 | 223 | +}  | 
 | 224 | + | 
 | 225 | +TEST_F(IcebergPartitionIdGeneratorTest, urlEncodingForSpecialChars) {  | 
 | 226 | +  std::vector<std::pair<std::string, std::string>> testCases = {  | 
 | 227 | +      {"space test", "space+test"},  | 
 | 228 | +      {"slash/test", "slash%2Ftest"},  | 
 | 229 | +      {"question?test", "question%3Ftest"},  | 
 | 230 | +      {"percent%test", "percent%25test"},  | 
 | 231 | +      {"hash#test", "hash%23test"},  | 
 | 232 | +      {"ampersand&test", "ampersand%26test"},  | 
 | 233 | +      {"equals=test", "equals%3Dtest"},  | 
 | 234 | +      {"plus+test", "plus%2Btest"},  | 
 | 235 | +      {"comma,test", "comma%2Ctest"},  | 
 | 236 | +      {"semicolon;test", "semicolon%3Btest"},  | 
 | 237 | +      {"at@test", "at%40test"},  | 
 | 238 | +      {"dollar$test", "dollar%24test"},  | 
 | 239 | +      {"backslash\\test", "backslash%5Ctest"},  | 
 | 240 | +      {"quote\"test", "quote%22test"},  | 
 | 241 | +      {"apostrophe'test", "apostrophe%27test"},  | 
 | 242 | +      {"less<than", "less%3Cthan"},  | 
 | 243 | +      {"greater>than", "greater%3Ethan"},  | 
 | 244 | +      {"colon:test", "colon%3Atest"},  | 
 | 245 | +      {"pipe|test", "pipe%7Ctest"},  | 
 | 246 | +      {"bracket[test", "bracket%5Btest"},  | 
 | 247 | +      {"bracket]test", "bracket%5Dtest"},  | 
 | 248 | +      {"brace{test", "brace%7Btest"},  | 
 | 249 | +      {"brace}test", "brace%7Dtest"},  | 
 | 250 | +      {"caret^test", "caret%5Etest"},  | 
 | 251 | +      {"tilde~test", "tilde%7Etest"},  | 
 | 252 | +      {"backtick`test", "backtick%60test"},  | 
 | 253 | +      {"unicode\u00A9test", "unicode%C2%A9test"},  | 
 | 254 | +      { "[email protected]",  "email%40example.com"},  | 
 | 255 | +      {"user:password@host:port/path", "user%3Apassword%40host%3Aport%2Fpath"},  | 
 | 256 | +      {"https://github.ibm.com/IBM/velox",  | 
 | 257 | +       "https%3A%2F%2Fgithub.ibm.com%2FIBM%2Fvelox"},  | 
 | 258 | +      {"a+b=c&d=e+f", "a%2Bb%3Dc%26d%3De%2Bf"},  | 
 | 259 | +      {"special!@#$%^&*()_+", "special%21%40%23%24%25%5E%26*%28%29_%2B"},  | 
 | 260 | +  };  | 
 | 261 | + | 
 | 262 | +  for (const auto& [input, expectedEncoded] : testCases) {  | 
 | 263 | +    auto varcharVector = makeFlatVector<StringView>({StringView(input)});  | 
 | 264 | +    std::vector<std::string> columnNames = {"ColumnWithSpecialChars"};  | 
 | 265 | +    auto rowVector = createRowVector(columnNames, {varcharVector});  | 
 | 266 | + | 
 | 267 | +    std::vector<TransformType> transformTypes = {TransformType::kIdentity};  | 
 | 268 | +    std::vector<TypePtr> types = {VARCHAR()};  | 
 | 269 | +    auto transforms =  | 
 | 270 | +        createColumnTransforms(columnNames, types, transformTypes);  | 
 | 271 | + | 
 | 272 | +    auto generator = createGenerator(transforms);  | 
 | 273 | +    raw_vector<uint64_t> partitionIds(1);  | 
 | 274 | +    generator->run(rowVector, partitionIds);  | 
 | 275 | + | 
 | 276 | +    std::string partitionName =  | 
 | 277 | +        generator->partitionName(partitionIds[0], "null");  | 
 | 278 | +    std::string expectedComponent =  | 
 | 279 | +        fmt::format("{}={}", columnNames[0], expectedEncoded);  | 
 | 280 | +    ASSERT_EQ(partitionName, expectedComponent);  | 
 | 281 | +  }  | 
 | 282 | +}  | 
 | 283 | + | 
 | 284 | +TEST_F(IcebergPartitionIdGeneratorTest, multipleRows) {  | 
 | 285 | +  auto intVector = makeFlatVector<int32_t>({10, 20, 30});  | 
 | 286 | +  auto varcharVector =  | 
 | 287 | +      makeFlatVector<StringView>({"value1", "value2", "value3"});  | 
 | 288 | +  std::vector<std::string> columnNames = {"c_int", "c_varchar"};  | 
 | 289 | +  auto rowVector = createRowVector(columnNames, {intVector, varcharVector});  | 
 | 290 | + | 
 | 291 | +  std::vector<TypePtr> types = {INTEGER(), VARCHAR()};  | 
 | 292 | +  std::vector<TransformType> transformTypes(  | 
 | 293 | +      columnNames.size(), TransformType::kIdentity);  | 
 | 294 | +  auto transforms = createColumnTransforms(columnNames, types, transformTypes);  | 
 | 295 | +  auto generator = createGenerator(transforms);  | 
 | 296 | +  raw_vector<uint64_t> partitionIds(3);  | 
 | 297 | +  generator->run(rowVector, partitionIds);  | 
 | 298 | + | 
 | 299 | +  std::vector<std::string> expectedNames = {  | 
 | 300 | +      "c_int=10/c_varchar=value1",  | 
 | 301 | +      "c_int=20/c_varchar=value2",  | 
 | 302 | +      "c_int=30/c_varchar=value3"};  | 
 | 303 | + | 
 | 304 | +  for (size_t i = 0; i < 3; ++i) {  | 
 | 305 | +    std::string partitionName =  | 
 | 306 | +        generator->partitionName(partitionIds[i], "null");  | 
 | 307 | +    ASSERT_EQ(partitionName, expectedNames[i]);  | 
 | 308 | +  }  | 
 | 309 | +}  | 
 | 310 | + | 
 | 311 | +} // namespace facebook::velox::connector::hive::iceberg::test  | 
0 commit comments