IBM
diff --git a/‎velox/dwio/common/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions b/‎velox/dwio/common/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎velox/dwio/common/IcebergStatistics.cpp‎
Lines changed: 43 additions & 0 deletions b/‎velox/dwio/common/IcebergStatistics.cpp‎
Lines changed: 43 additions & 0 deletions
diff --git a/‎velox/dwio/common/IcebergStatistics.h‎
Lines changed: 38 additions & 0 deletions b/‎velox/dwio/common/IcebergStatistics.h‎
Lines changed: 38 additions & 0 deletions
diff --git a/‎velox/dwio/common/Options.h‎
Lines changed: 1 addition & 0 deletions b/‎velox/dwio/common/Options.h‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎velox/dwio/common/Statistics.h‎
Lines changed: 1 addition & 0 deletions b/‎velox/dwio/common/Statistics.h‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎velox/dwio/common/Writer.h‎
Lines changed: 9 additions & 0 deletions b/‎velox/dwio/common/Writer.h‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎velox/dwio/parquet/common/CMakeLists.txt‎
Lines changed: 2 additions & 1 deletion b/‎velox/dwio/parquet/common/CMakeLists.txt‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎velox/dwio/parquet/common/UnicodeUtil.cpp‎
Lines changed: 85 additions & 0 deletions b/‎velox/dwio/parquet/common/UnicodeUtil.cpp‎
Lines changed: 85 additions & 0 deletions
diff --git a/‎velox/dwio/parquet/common/UnicodeUtil.h‎
Lines changed: 47 additions & 0 deletions b/‎velox/dwio/parquet/common/UnicodeUtil.h‎
Lines changed: 47 additions & 0 deletions
diff --git a/‎velox/dwio/parquet/tests/common/CMakeLists.txt‎
Lines changed: 2 additions & 1 deletion b/‎velox/dwio/parquet/tests/common/CMakeLists.txt‎
Lines changed: 2 additions & 1 deletion
@@ -42,6 +42,7 @@ velox_add_library(
   OnDemandUnitLoader.cpp
   InputStream.cpp
   IntDecoder.cpp
+  IcebergStatistics.cpp
   MetadataFilter.cpp
   Options.cpp
   OutputStream.cpp
 
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "velox/dwio/common/IcebergStatistics.h"
+
+namespace facebook::velox::dwio::common {
+
+folly::dynamic IcebergDataFileStatistics::toJson() const {
+  folly::dynamic json = folly::dynamic::object;
+  json["recordCount"] = numRecords;
+
+  auto mapToJson = [](const auto& map) {
+    folly::dynamic result = folly::dynamic::object;
+    for (const auto& pair : map) {
+      result[folly::to<std::string>(pair.first)] = pair.second;
+    }
+    return result;
+  };
+
+  json["columnSizes"] = mapToJson(columnsSizes);
+  json["valueCounts"] = mapToJson(valueCounts);
+  json["nullValueCounts"] = mapToJson(nullValueCounts);
+  json["nanValueCounts"] = mapToJson(nanValueCounts);
+  json["lowerBounds"] = mapToJson(lowerBounds);
+  json["upperBounds"] = mapToJson(upperBounds);
+
+  return json;
+}
+
+} // namespace facebook::velox::dwio::common
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <folly/json/dynamic.h>
+
+namespace facebook::velox::dwio::common {
+
+// Iceberg data_file struct fields
+struct IcebergDataFileStatistics {
+  int64_t numRecords;
+  std::unordered_map<int32_t, int64_t> columnsSizes;
+  std::unordered_map<int32_t, int64_t> valueCounts;
+  std::unordered_map<int32_t, int64_t> nullValueCounts;
+  std::unordered_map<int32_t, int64_t> nanValueCounts;
+  std::unordered_map<int32_t, std::string> lowerBounds;
+  std::unordered_map<int32_t, std::string> upperBounds;
+
+  IcebergDataFileStatistics() : numRecords(0) {}
+
+  folly::dynamic toJson() const;
+};
+
+} // namespace facebook::velox::dwio::common
@@ -705,6 +705,7 @@ struct WriterOptions {
 
   std::string sessionTimezoneName;
   bool adjustTimestampToTimezone{false};
+  std::shared_ptr<std::vector<int32_t>> sourceColumnIndices{nullptr};
 
   // WriterOption implementations can implement this function to specify how to
   // process format-specific session and connector configs.
 
@@ -18,6 +18,7 @@
 
 #include <folly/Hash.h>
 #include <folly/container/F14Map.h>
+#include <folly/json/dynamic.h>
 
 #include "velox/common/base/Exceptions.h"
 #include "velox/common/base/RuntimeMetrics.h"
 
@@ -21,6 +21,8 @@
 #include <optional>
 #include <string>
 
+#include "velox/dwio/common/IcebergStatistics.h"
+#include "velox/dwio/common/Statistics.h"
 #include "velox/vector/ComplexVector.h"
 
 namespace facebook::velox::dwio::common {
@@ -79,6 +81,11 @@ class Writer {
   /// Data can no longer be written.
   virtual void abort() = 0;
 
+  /// Return statistics based on each Iceberg data file
+  std::shared_ptr<IcebergDataFileStatistics> dataFileStats() const {
+    return icebergDataFileStats_;
+  };
+
  protected:
   bool isRunning() const;
   bool isFinishing() const;
@@ -92,6 +99,8 @@ class Writer {
   static void checkStateTransition(State oldState, State newState);
 
   State state_{State::kInit};
+  std::shared_ptr<IcebergDataFileStatistics> icebergDataFileStats_;
+  std::shared_ptr<std::vector<int32_t>> sourceColumnIndices_;
 };
 
 FOLLY_ALWAYS_INLINE std::ostream& operator<<(
 
@@ -17,7 +17,8 @@ velox_add_library(
   BloomFilter.cpp
   XxHasher.cpp
   LevelComparison.cpp
-  LevelConversion.cpp)
+  LevelConversion.cpp
+  UnicodeUtil.cpp)
 
 velox_link_libraries(
   velox_dwio_parquet_common
 
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "velox/dwio/parquet/common/UnicodeUtil.h"
+
+namespace facebook::velox::parquet {
+
+std::string UnicodeUtil::truncateString(
+    const std::string& input,
+    int32_t length) {
+  VELOX_CHECK_GT(length, 0, "Truncate length should be positive");
+  return input.substr(
+      0, functions::stringImpl::cappedByteLength<false>(input, length));
+}
+
+std::optional<std::string> UnicodeUtil::truncateStringMin(
+    const std::optional<std::string>& input,
+    int32_t length) {
+  if (!input.has_value()) {
+    return std::nullopt;
+  }
+  return truncateString(input.value(), length);
+}
+
+std::optional<std::string> UnicodeUtil::truncateStringMax(
+    const std::optional<std::string>& input,
+    int32_t length) {
+  if (!input.has_value()) {
+    return std::nullopt;
+  }
+
+  const std::string& inputStr = input.value();
+  const auto truncated = truncateString(inputStr, length);
+  if (truncated.length() == inputStr.length()) {
+    return inputStr;
+  }
+
+  // Try to increment the last code point.
+  for (auto i = length - 1; i >= 0; --i) {
+    // Find the byte offset for the i-th code point.
+    const char* data = truncated.data();
+    const char* end = data + truncated.size();
+    const char* current = data;
+    int32_t currentCodePoint = 0;
+
+    while (current < end && currentCodePoint < i) {
+      int32_t charLength;
+      utf8proc_codepoint(current, end, charLength);
+      current += charLength;
+      currentCodePoint++;
+    }
+
+    // Get the code point at this position.
+    int32_t charLength;
+    auto codePoint = utf8proc_codepoint(current, end, charLength);
+    auto nextCodePoint = codePoint + 1;
+
+    // Check if the incremented code point is valid.
+    if (nextCodePoint != 0 && utf8proc_codepoint_valid(nextCodePoint)) {
+      auto result = truncated.substr(0, current - data);
+      // Append the incremented code point.
+      char buffer[4]; // UTF-8 uses up to 4 bytes per code point.
+      auto bytesWritten = utf8proc_encode_char(
+          nextCodePoint, reinterpret_cast<utf8proc_uint8_t*>(buffer));
+      result.append(buffer, bytesWritten);
+      return result;
+    }
+  }
+  return std::nullopt;
+}
+
+} // namespace facebook::velox::parquet
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "velox/common/base/Exceptions.h"
+#include "velox/expression/VectorFunction.h"
+#include "velox/external/utf8proc/utf8proc.h"
+#include "velox/functions/lib/string/StringImpl.h"
+
+namespace facebook::velox::parquet {
+
+class UnicodeUtil {
+ private:
+  UnicodeUtil() = delete;
+
+ public:
+  static bool isCharHighSurrogate(char16_t ch) {
+    return (ch & 0xFC00) == 0xD800;
+  }
+
+  // Truncates a string to the specified number of Unicode code points.
+  static std::string truncateString(const std::string& input, int32_t length);
+
+  static std::optional<std::string> truncateStringMin(
+      const std::optional<std::string>& input,
+      int32_t length);
+
+  static std::optional<std::string> truncateStringMax(
+      const std::optional<std::string>& input,
+      int32_t length);
+};
+
+} // namespace facebook::velox::parquet
@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-add_executable(velox_dwio_parquet_common_test LevelConversionTest.cpp)
+add_executable(velox_dwio_parquet_common_test LevelConversionTest.cpp
+                                              UnicodeUtilTest.cpp)
 
 add_test(velox_dwio_parquet_common_test velox_dwio_parquet_common_test)
 target_link_libraries(