Skip to content

Commit ce612c6

Browse files
res-lifeChong Gaogreptile-apps[bot]
authored
Add Iceberg partition transforms: year/month/day/hour (#3903)
contributes to NVIDIA/spark-rapids#13384 contributes to NVIDIA/spark-rapids#13385 contributes to NVIDIA/spark-rapids#13386 contributes to NVIDIA/spark-rapids#13387 Add Iceberg partition transforms: year/month/day/hour. The CPU reference implementation is Iceberg's `org.apache.iceberg.util.DateTimeUtil` ([link](https://github.com/apache/iceberg/blob/apache-iceberg-1.6.1/api/src/main/java/org/apache/iceberg/util/DateTimeUtil.java)). ### Details We cannot use `ColumnView.year` because its input must be a timestamp type rather than a date type. This PR adds the following four APIs: ```java /** * Calculates the difference in years between the epoch year (1970) and the * given date/timestamp column. E.g.: for date '1971-01-01', the result would be * 1: (1 year after epoch year) * * @param input The input date/timestamp column. * @return A column of type INT32 containing the year differences from epoch. */ public static ColumnVector toYears(ColumnView input) /** * Calculates the difference in months between the epoch month (1970-01) and the * given date/timestamp column. E.g.: for date '1971-02-01', the result would be * 13: (1 year and 1 month after epoch month) * * @param input The input date/timestamp column. * @return A column of type INT32 containing the month differences from epoch. */ public static ColumnVector toMonths(ColumnView input) /** * Calculates the difference in days between the epoch day (1970-01-01) and the * given date/timestamp column. E.g.: for date '1970-01-21', the result would be * 20: (20 days after epoch day) * * @param input The input date/timestamp column. * @return A column of type Date. */ public static ColumnVector toDays(ColumnView input) /** * Calculates the difference in hours between the epoch hour * (1970-01-01T00:00:00) and the given timestamp column. * E.g.: for timestamp '1970-01-01 01:00:00', the result would be 1 * (1 hour after epoch hour) * * @param input The input timestamp column. * @return A column of type INT32 containing the hour differences from epoch. 
*/ public static ColumnVector toHours(ColumnView input) ``` Signed-off-by: Chong Gao <res_life@163.com> --------- Signed-off-by: Chong Gao <res_life@163.com> Co-authored-by: Chong Gao <res_life@163.com> Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
1 parent aeffc02 commit ce612c6

File tree

9 files changed

+817
-1
lines changed

9 files changed

+817
-1
lines changed

pom.xml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@
112112
<parquet.version>1.15.2</parquet.version>
113113
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
114114
<slf4j.version>1.7.30</slf4j.version>
115+
<iceberg.version>1.6.1</iceberg.version>
115116
<submodule.check.skip>false</submodule.check.skip>
116117
<!-- This skips applying and unapplying patches to CUDF. Be aware that if you
117118
try to build the main jar without patching CUDF it could result in build
@@ -194,6 +195,12 @@
194195
<version>${hilbert.version}</version>
195196
<scope>test</scope>
196197
</dependency>
198+
<dependency>
199+
<groupId>org.apache.iceberg</groupId>
200+
<artifactId>iceberg-core</artifactId>
201+
<version>${iceberg.version}</version>
202+
<scope>test</scope>
203+
</dependency>
197204
</dependencies>
198205

199206
<profiles>

src/main/cpp/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,7 @@ add_library(
198198
src/HistogramJni.cpp
199199
src/HostTableJni.cpp
200200
src/HyperLogLogPlusPlusHostUDFJni.cpp
201+
src/IcebergDateTimeUtilJni.cpp
201202
src/JSONUtilsJni.cpp
202203
src/KudoGpuSerializerJni.cpp
203204
src/ListSliceJni.cpp
@@ -235,6 +236,7 @@ add_library(
235236
src/hive_hash.cu
236237
src/hyper_log_log_plus_plus.cu
237238
src/hyper_log_log_plus_plus_host_udf.cu
239+
src/iceberg_datetime_util.cu
238240
src/json_utils.cu
239241
src/list_slice.cu
240242
src/map.cu
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
/*
2+
* Copyright (c) 2025, NVIDIA CORPORATION.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#include "cudf_jni_apis.hpp"
18+
#include "iceberg_datetime_util.hpp"
19+
20+
extern "C" {
21+
22+
// JNI entry point for the Java native method
// com.nvidia.spark.rapids.jni.iceberg.IcebergDateTimeUtil.yearsFromEpoch.
//
// `input` is a native handle that is reinterpreted as a `cudf::column_view const*`.
// The view is passed to spark_rapids_jni::years_from_epoch, and the released result
// pointer is returned to Java as a jlong handle.
// 0 is the value supplied to JNI_NULL_CHECK and JNI_CATCH for their failure paths
// (presumably returned after a Java exception is raised -- confirm against the
// macro definitions in cudf_jni_apis.hpp).
JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_iceberg_IcebergDateTimeUtil_yearsFromEpoch(
23+
JNIEnv* env, jclass, jlong input)
24+
{
25+
JNI_NULL_CHECK(env, input, "input column is null", 0);
26+
27+
JNI_TRY
28+
{
29+
cudf::jni::auto_set_device(env);
30+
auto const input_cv = reinterpret_cast<cudf::column_view const*>(input);
31+
auto output = spark_rapids_jni::years_from_epoch(*input_cv);
32+
return reinterpret_cast<jlong>(output.release());
33+
}
34+
JNI_CATCH(env, 0);
35+
}
36+
37+
// JNI entry point for the Java native method
// com.nvidia.spark.rapids.jni.iceberg.IcebergDateTimeUtil.monthsFromEpoch.
//
// `input` is a native handle that is reinterpreted as a `cudf::column_view const*`.
// The view is passed to spark_rapids_jni::months_from_epoch, and the released result
// pointer is returned to Java as a jlong handle.
// 0 is the value supplied to JNI_NULL_CHECK and JNI_CATCH for their failure paths
// (presumably returned after a Java exception is raised -- confirm against the
// macro definitions in cudf_jni_apis.hpp).
JNIEXPORT jlong JNICALL
38+
Java_com_nvidia_spark_rapids_jni_iceberg_IcebergDateTimeUtil_monthsFromEpoch(JNIEnv* env,
39+
jclass,
40+
jlong input)
41+
{
42+
JNI_NULL_CHECK(env, input, "input column is null", 0);
43+
44+
JNI_TRY
45+
{
46+
cudf::jni::auto_set_device(env);
47+
auto const input_cv = reinterpret_cast<cudf::column_view const*>(input);
48+
auto output = spark_rapids_jni::months_from_epoch(*input_cv);
49+
return reinterpret_cast<jlong>(output.release());
50+
}
51+
JNI_CATCH(env, 0);
52+
}
53+
54+
// JNI entry point for the Java native method
// com.nvidia.spark.rapids.jni.iceberg.IcebergDateTimeUtil.daysFromEpoch.
//
// `input` is a native handle that is reinterpreted as a `cudf::column_view const*`.
// The view is passed to spark_rapids_jni::days_from_epoch, and the released result
// pointer is returned to Java as a jlong handle.
// 0 is the value supplied to JNI_NULL_CHECK and JNI_CATCH for their failure paths
// (presumably returned after a Java exception is raised -- confirm against the
// macro definitions in cudf_jni_apis.hpp).
JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_iceberg_IcebergDateTimeUtil_daysFromEpoch(
55+
JNIEnv* env, jclass, jlong input)
56+
{
57+
JNI_NULL_CHECK(env, input, "input column is null", 0);
58+
59+
JNI_TRY
60+
{
61+
cudf::jni::auto_set_device(env);
62+
auto const input_cv = reinterpret_cast<cudf::column_view const*>(input);
63+
auto output = spark_rapids_jni::days_from_epoch(*input_cv);
64+
return reinterpret_cast<jlong>(output.release());
65+
}
66+
JNI_CATCH(env, 0);
67+
}
68+
69+
// JNI entry point for the Java native method
// com.nvidia.spark.rapids.jni.iceberg.IcebergDateTimeUtil.hoursFromEpoch.
//
// `input` is a native handle that is reinterpreted as a `cudf::column_view const*`.
// The view is passed to spark_rapids_jni::hours_from_epoch, and the released result
// pointer is returned to Java as a jlong handle.
// 0 is the value supplied to JNI_NULL_CHECK and JNI_CATCH for their failure paths
// (presumably returned after a Java exception is raised -- confirm against the
// macro definitions in cudf_jni_apis.hpp).
JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_iceberg_IcebergDateTimeUtil_hoursFromEpoch(
70+
JNIEnv* env, jclass, jlong input)
71+
{
72+
JNI_NULL_CHECK(env, input, "input column is null", 0);
73+
74+
JNI_TRY
75+
{
76+
cudf::jni::auto_set_device(env);
77+
auto const input_cv = reinterpret_cast<cudf::column_view const*>(input);
78+
auto output = spark_rapids_jni::hours_from_epoch(*input_cv);
79+
return reinterpret_cast<jlong>(output.release());
80+
}
81+
JNI_CATCH(env, 0);
82+
}
83+
84+
} // extern "C"

src/main/cpp/src/datetime_utils.cuh

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616

1717
#pragma once
1818

19+
#include "integer_utils.cuh"
20+
1921
#include <cudf/lists/list_device_view.cuh>
2022
#include <cudf/lists/lists_column_device_view.cuh>
2123
#include <cudf/types.hpp>
@@ -230,7 +232,7 @@ struct date_time_utils {
230232
__device__ static int32_t get_year(int64_t seconds)
231233
{
232234
constexpr int64_t seconds_per_day = 86400L;
233-
int64_t days = seconds / seconds_per_day;
235+
int64_t days = integer_utils::floor_div(seconds, seconds_per_day);
234236
int32_t year, month, day;
235237
to_date(days, year, month, day);
236238
return year;

0 commit comments

Comments
 (0)