| 
 | 1 | +/*  | 
 | 2 | + * Copyright (c) Facebook, Inc. and its affiliates.  | 
 | 3 | + *  | 
 | 4 | + * Licensed under the Apache License, Version 2.0 (the "License");  | 
 | 5 | + * you may not use this file except in compliance with the License.  | 
 | 6 | + * You may obtain a copy of the License at  | 
 | 7 | + *  | 
 | 8 | + *     http://www.apache.org/licenses/LICENSE-2.0  | 
 | 9 | + *  | 
 | 10 | + * Unless required by applicable law or agreed to in writing, software  | 
 | 11 | + * distributed under the License is distributed on an "AS IS" BASIS,  | 
 | 12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  | 
 | 13 | + * See the License for the specific language governing permissions and  | 
 | 14 | + * limitations under the License.  | 
 | 15 | + */  | 
 | 16 | + | 
 | 17 | +#include "velox/dwio/parquet/common/UnicodeUtil.h"  | 
 | 18 | + | 
 | 19 | +#include "velox/external/utf8proc/utf8proc.h"  | 
 | 20 | +#include "velox/functions/lib/string/StringImpl.h"  | 
 | 21 | + | 
 | 22 | +namespace facebook::velox::parquet {  | 
 | 23 | + | 
 | 24 | +std::string UnicodeUtil::truncateStringMin(  | 
 | 25 | +    const char* input,  | 
 | 26 | +    int32_t inputLength,  | 
 | 27 | +    int32_t numCodePoints) {  | 
 | 28 | +  auto length = functions::stringImpl::cappedByteLength<false>(  | 
 | 29 | +      StringView(input, inputLength), numCodePoints);  | 
 | 30 | +  return std::string(input, length);  | 
 | 31 | +}  | 
 | 32 | + | 
 | 33 | +std::string UnicodeUtil::truncateStringMax(  | 
 | 34 | +    const char* input,  | 
 | 35 | +    int32_t inputLength,  | 
 | 36 | +    int32_t numCodePoints) {  | 
 | 37 | +  auto truncatedLength = functions::stringImpl::cappedByteLength<false>(  | 
 | 38 | +      StringView(input, inputLength), numCodePoints);  | 
 | 39 | + | 
 | 40 | +  if (truncatedLength == inputLength) {  | 
 | 41 | +    return std::string(input, inputLength);  | 
 | 42 | +  }  | 
 | 43 | + | 
 | 44 | +  // Try to increment the last code point.  | 
 | 45 | +  for (auto i = numCodePoints - 1; i >= 0; --i) {  | 
 | 46 | +    const char* current = input;  | 
 | 47 | +    int32_t currentCodePoint = 0;  | 
 | 48 | + | 
 | 49 | +    // Find the i-th code point position.  | 
 | 50 | +    while (current < input + truncatedLength && currentCodePoint < i) {  | 
 | 51 | +      int32_t charLength;  | 
 | 52 | +      utf8proc_codepoint(current, input + truncatedLength, charLength);  | 
 | 53 | +      current += charLength;  | 
 | 54 | +      currentCodePoint++;  | 
 | 55 | +    }  | 
 | 56 | + | 
 | 57 | +    if (current >= input + truncatedLength)  | 
 | 58 | +      continue;  | 
 | 59 | + | 
 | 60 | +    int32_t charLength;  | 
 | 61 | +    auto codePoint =  | 
 | 62 | +        utf8proc_codepoint(current, input + truncatedLength, charLength);  | 
 | 63 | +    auto nextCodePoint = codePoint + 1;  | 
 | 64 | + | 
 | 65 | +    // Check if the incremented code point is valid.  | 
 | 66 | +    if (nextCodePoint != 0 && utf8proc_codepoint_valid(nextCodePoint)) {  | 
 | 67 | +      std::string result;  | 
 | 68 | +      result.reserve(truncatedLength + 4);  | 
 | 69 | +      result.assign(input, current - input);  | 
 | 70 | +      char buffer[4];  | 
 | 71 | +      auto bytesWritten = utf8proc_encode_char(  | 
 | 72 | +          nextCodePoint, reinterpret_cast<utf8proc_uint8_t*>(buffer));  | 
 | 73 | +      result.append(buffer, bytesWritten);  | 
 | 74 | +      return result;  | 
 | 75 | +    }  | 
 | 76 | +  }  | 
 | 77 | + | 
 | 78 | +  // Return truncated string without intermediate string creation  | 
 | 79 | +  return std::string(input, truncatedLength);  | 
 | 80 | +}  | 
 | 81 | + | 
 | 82 | +} // namespace facebook::velox::parquet  | 
0 commit comments