| 
 | 1 | +/*  | 
 | 2 | + * Copyright (c) Facebook, Inc. and its affiliates.  | 
 | 3 | + *  | 
 | 4 | + * Licensed under the Apache License, Version 2.0 (the "License");  | 
 | 5 | + * you may not use this file except in compliance with the License.  | 
 | 6 | + * You may obtain a copy of the License at  | 
 | 7 | + *  | 
 | 8 | + *     http://www.apache.org/licenses/LICENSE-2.0  | 
 | 9 | + *  | 
 | 10 | + * Unless required by applicable law or agreed to in writing, software  | 
 | 11 | + * distributed under the License is distributed on an "AS IS" BASIS,  | 
 | 12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  | 
 | 13 | + * See the License for the specific language governing permissions and  | 
 | 14 | + * limitations under the License.  | 
 | 15 | + */  | 
 | 16 | + | 
#include "velox/dwio/parquet/common/UnicodeUtil.h"

#include <vector>
 | 18 | + | 
 | 19 | +namespace facebook::velox::parquet {  | 
 | 20 | + | 
 | 21 | +std::string UnicodeUtil::truncateString(  | 
 | 22 | +    const std::string& input,  | 
 | 23 | +    int32_t length) {  | 
 | 24 | +  VELOX_CHECK_GT(length, 0, "Truncate length should be positive");  | 
 | 25 | +  return input.substr(  | 
 | 26 | +      0, functions::stringImpl::cappedByteLength<false>(input, length));  | 
 | 27 | +}  | 
 | 28 | + | 
 | 29 | +std::optional<std::string> UnicodeUtil::truncateStringMin(  | 
 | 30 | +    const std::optional<std::string>& input,  | 
 | 31 | +    int32_t length) {  | 
 | 32 | +  if (!input.has_value()) {  | 
 | 33 | +    return std::nullopt;  | 
 | 34 | +  }  | 
 | 35 | +  return truncateString(input.value(), length);  | 
 | 36 | +}  | 
 | 37 | + | 
 | 38 | +std::optional<std::string> UnicodeUtil::truncateStringMax(  | 
 | 39 | +    const std::optional<std::string>& input,  | 
 | 40 | +    int32_t length) {  | 
 | 41 | +  if (!input.has_value()) {  | 
 | 42 | +    return std::nullopt;  | 
 | 43 | +  }  | 
 | 44 | + | 
 | 45 | +  const std::string& inputStr = input.value();  | 
 | 46 | +  const auto truncated = truncateString(inputStr, length);  | 
 | 47 | +  if (truncated.length() == inputStr.length()) {  | 
 | 48 | +    return inputStr;  | 
 | 49 | +  }  | 
 | 50 | + | 
 | 51 | +  // Try to increment the last code point.  | 
 | 52 | +  for (auto i = length - 1; i >= 0; --i) {  | 
 | 53 | +    // Find the byte offset for the i-th code point.  | 
 | 54 | +    const char* data = truncated.data();  | 
 | 55 | +    const char* end = data + truncated.size();  | 
 | 56 | +    const char* current = data;  | 
 | 57 | +    int32_t currentCodePoint = 0;  | 
 | 58 | + | 
 | 59 | +    while (current < end && currentCodePoint < i) {  | 
 | 60 | +      int32_t charLength;  | 
 | 61 | +      utf8proc_codepoint(current, end, charLength);  | 
 | 62 | +      current += charLength;  | 
 | 63 | +      currentCodePoint++;  | 
 | 64 | +    }  | 
 | 65 | + | 
 | 66 | +    // Get the code point at this position.  | 
 | 67 | +    int32_t charLength;  | 
 | 68 | +    auto codePoint = utf8proc_codepoint(current, end, charLength);  | 
 | 69 | +    auto nextCodePoint = codePoint + 1;  | 
 | 70 | + | 
 | 71 | +    // Check if the incremented code point is valid.  | 
 | 72 | +    if (nextCodePoint != 0 && utf8proc_codepoint_valid(nextCodePoint)) {  | 
 | 73 | +      auto result = truncated.substr(0, current - data);  | 
 | 74 | +      // Append the incremented code point.  | 
 | 75 | +      char buffer[4]; // UTF-8 uses up to 4 bytes per code point.  | 
 | 76 | +      auto bytesWritten = utf8proc_encode_char(  | 
 | 77 | +          nextCodePoint, reinterpret_cast<utf8proc_uint8_t*>(buffer));  | 
 | 78 | +      result.append(buffer, bytesWritten);  | 
 | 79 | +      return result;  | 
 | 80 | +    }  | 
 | 81 | +  }  | 
 | 82 | +  return std::nullopt;  | 
 | 83 | +}  | 
 | 84 | + | 
 | 85 | +} // namespace facebook::velox::parquet  | 
0 commit comments