Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/main/cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,7 @@ add_library(
src/KudoGpuSerializerJni.cpp
src/ListSliceJni.cpp
src/MapJni.cpp
src/MapUtilsJni.cpp
src/MapZipWithUtilsJNI.cpp
src/JoinPrimitivesJni.cpp
src/NativeParquetJni.cpp
Expand Down Expand Up @@ -252,6 +253,7 @@ add_library(
src/join_primitives.cu
src/list_slice.cu
src/map.cu
src/map_utils.cu
src/map_zip_with_utils.cu
src/multiply.cu
src/number_converter.cu
Expand Down
37 changes: 37 additions & 0 deletions src/main/cpp/src/MapUtilsJni.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION.
Comment thread
thirtiseven marked this conversation as resolved.
Outdated
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "cudf_jni_apis.hpp"
#include "jni_utils.hpp"
#include "map_utils.hpp"

extern "C" {

/**
 * JNI bridge for com.nvidia.spark.rapids.jni.MapUtils.mapFromEntries.
 *
 * @param env               JNI environment of the calling thread.
 * @param input_handle      Address of a cudf::column_view, passed from Java as a long.
 * @param throw_on_null_key Forwarded to spark_rapids_jni::map_from_entries as a bool.
 * @return Handle of the column produced by map_from_entries, released to the Java side.
 *         0 is the fallback value handed to JNI_NULL_CHECK and JNI_CATCH.
 */
JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_MapUtils_mapFromEntries(
  JNIEnv* env, jclass, jlong input_handle, jboolean throw_on_null_key)
{
  JNI_NULL_CHECK(env, input_handle, "input column is null", 0);
  JNI_TRY
  {
    cudf::jni::auto_set_device(env);

    // The Java caller passes the native view address; it is only borrowed here.
    auto const* entries_view = reinterpret_cast<cudf::column_view const*>(input_handle);
    bool const fail_on_null_key = static_cast<bool>(throw_on_null_key);

    return cudf::jni::release_as_jlong(
      spark_rapids_jni::map_from_entries(*entries_view, fail_on_null_key));
  }
  JNI_CATCH(env, 0);
}

} // extern "C"
152 changes: 152 additions & 0 deletions src/main/cpp/src/map_utils.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION.
Comment thread
thirtiseven marked this conversation as resolved.
Outdated
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "map_utils.hpp"

#include <cudf/column/column.hpp>
#include <cudf/column/column_factories.hpp>
#include <cudf/copying.hpp>
#include <cudf/lists/contains.hpp>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/reduction.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/transform.hpp>
#include <cudf/unary.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/error.hpp>
#include <cudf/utilities/memory_resource.hpp>
#include <cudf/utilities/span.hpp>

namespace spark_rapids_jni {

// Applies Spark's map_from_entries null rules to a LIST column of STRUCT entries.
//
// input             - column to process; anything other than a LIST column is rejected.
// throw_on_null_key - when true, a null key inside a valid struct entry raises
//                     cudf::logic_error("Cannot use null as map key.").
// stream, mr        - CUDA stream and memory resource handed to the cudf calls below.
//
// Returns a newly allocated column. The type checks below fail through CUDF_EXPECTS.
std::unique_ptr<cudf::column> map_from_entries(cudf::column_view const& input,
bool throw_on_null_key,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_EXPECTS(input.type().id() == cudf::type_id::LIST,
"map_from_entries: input must be a LIST column");

// A zero-row input returns here, before the STRUCT checks below are reached, so an empty
// LIST column whose child is not a STRUCT is accepted.
// NOTE(review): make_empty_column is given only input.type(); confirm the empty result keeps
// the nested child types (the review thread mentions cudf::empty_like for that purpose).
if (input.size() == 0) { return cudf::make_empty_column(input.type()); }

auto const lists_cv = cudf::lists_column_view(input);
auto const structs = lists_cv.child();
CUDF_EXPECTS(structs.type().id() == cudf::type_id::STRUCT,
"map_from_entries: list child must be a STRUCT column");
// Only the first struct child (the key) is required; further children are not checked here.
CUDF_EXPECTS(structs.num_children() >= 1,
"map_from_entries: struct must have at least one child column (KEY)");
Copy link

Copilot AI Apr 16, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The early return for input.size() == 0 happens before validating that the list child is a STRUCT with at least one child. This means an empty LIST<non-STRUCT> column would incorrectly be accepted, despite the API contract/documentation saying non-LIST(STRUCT(...)) inputs should throw. Move the lists_column_view/STRUCT checks above the empty-size return (or keep the return but after validation).

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The LIST type check (CUDF_EXPECTS(input.type().id() == cudf::type_id::LIST)) fires before the early return, so a non-LIST input is rejected. The STRUCT/arity checks are intentionally after the early return: a zero-row column has no entries to validate structure for, and cudf::empty_like preserves the full column type faithfully.


// Step 1: Per-row flag — does row i contain any null struct entry?
// contains_nulls returns BOOL8, size = input.size().
// A null outer row itself yields null in has_null_entry; copy_if_else handles that correctly.
auto has_null_entry = cudf::lists::contains_nulls(lists_cv, stream, mr);

// Fast path: no null struct entries anywhere — simple global null-key check.
// NOTE(review): the review thread suggests testing the struct child's null count first so
// that contains_nulls + reduce only run when needed — confirm equivalence before changing.
auto any_null_entry_scalar = cudf::reduce(
*has_null_entry,
*cudf::make_any_aggregation<cudf::reduce_aggregation>(),
cudf::data_type{cudf::type_id::BOOL8},
stream,
mr);
// Collapse the reduction to a plain bool. An invalid (null) scalar counts as false because
// the validity test short-circuits before the value is read.
bool const any_null_entry =
any_null_entry_scalar->is_valid(stream) &&
static_cast<cudf::numeric_scalar<bool>*>(any_null_entry_scalar.get())->value(stream);

if (!any_null_entry) {
Copy link

Copilot AI Apr 16, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The function always computes lists::contains_nulls and then reduces it to find any_null_entry, which adds allocations/kernels (and a host read) even when there are no null struct entries. Since the fast-path condition is simply “no null struct entries anywhere”, you can check structs.null_count(stream) == 0 first and return immediately, only computing contains_nulls/row masking when structs.null_count > 0. This matches the stated goal of keeping the fast path cheap.

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The structs.null_count() == 0 short-circuit is a valid micro-optimisation. For now the current design (contains_nulls + reduce(any)) is one extra kernel on the fast path and readable. Happy to add the null_count guard if preferred — please let us know.

// All struct entries are valid. Any null key in the flat key column is a real null key.
auto const keys = structs.child(0);
if (throw_on_null_key && keys.null_count(stream) > 0) {
throw cudf::logic_error("Cannot use null as map key.");
}
// Nothing to mask: hand back a new cudf::column built from the input view.
return std::make_unique<cudf::column>(input, stream, mr);
}

// Slow path: at least one row contains a null struct entry.
//
// CPU semantics: if a row's array has any null struct entry the entire output row is null,
// regardless of whether another entry in that row also has a null key. We must therefore
// throw "Cannot use null as map key" only for rows that satisfy BOTH:
// (a) the row has NO null struct entry (has_null_entry = false), AND
// (b) at least one entry's key is null inside a valid (non-null) struct.
//
// Per-entry boolean: null_key_in_valid[j] = key_is_null[j] AND struct_is_valid[j]
// This `keys` is a separate declaration; the fast-path one is scoped to the if-block above.
auto const keys = structs.child(0);
auto key_is_null = cudf::is_null(keys, stream, mr); // flat BOOL8
auto struct_is_null = cudf::is_null(structs, stream, mr); // flat BOOL8
auto struct_is_valid = cudf::unary_operation(
*struct_is_null, cudf::unary_operator::NOT, stream, mr); // flat BOOL8
// NOTE(review): BITWISE_AND is applied to two BOOL8 columns here; the review thread asks for
// LOGICAL_AND to state the boolean intent.
auto null_key_in_valid = cudf::binary_operation(
*key_is_null,
*struct_is_valid,
cudf::binary_operator::BITWISE_AND,
cudf::data_type{cudf::type_id::BOOL8},
stream,
mr);
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 BITWISE_AND used for boolean logic

cudf::binary_operator::LOGICAL_AND would express the intent more clearly for boolean columns. While both are equivalent for strictly {0, 1} BOOL8 values produced by is_null(), using the semantic operator avoids any ambiguity about intent and is the conventional choice for bool algebra in cuDF.

Suggested change
auto null_key_in_valid = cudf::binary_operation(
*key_is_null,
*struct_is_valid,
cudf::binary_operator::BITWISE_AND,
cudf::data_type{cudf::type_id::BOOL8},
stream,
mr);
auto null_key_in_valid = cudf::binary_operation(
*key_is_null,
*struct_is_valid,
cudf::binary_operator::LOGICAL_AND,
cudf::data_type{cudf::type_id::BOOL8},
stream,
mr);

Same applies to the should_throw computation at line 124–130.

Note: If this suggestion doesn't match your team's coding style, reply to this and let me know. I'll remember it for next time!

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed — changed to cudf::binary_operator::LOGICAL_AND throughout.


// Reduce per-list: does this row contain any entry where null_key_in_valid = true?
// segmented_reduce(max) over the flat boolean values using the list offsets as boundaries.
// is_null() always returns a fully-valid boolean column, so null_key_in_valid has no nulls;
// EXCLUDE null_policy only affects empty-list rows (which yield a null result, safely
// treated as false in the AND below).
//
// NOTE(review): the span length is offsets_col.size(). The per-row result is later combined
// element-wise with has_null_entry (size = input.size()), which presumes
// offsets_col.size() == input.size() + 1. Confirm this holds for sliced inputs; the review
// thread proposes lists_cv.offsets_begin() with an explicit input.size() + 1 length.
auto const offsets_col = lists_cv.offsets();
auto const offsets_span = cudf::device_span<cudf::size_type const>(
offsets_col.data<cudf::size_type>(), offsets_col.size());
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 offsets_col.size() assumes non-sliced input

lists_column_view::offsets() returns a column_view adjusted for the list column's offset, so offsets_col.data<cudf::size_type>() points to the first valid offset and offsets_col.size() equals input.size() + 1. This is correct for segmented_reduce for a non-sliced column (the typical JNI case). A brief comment noting this assumption would help future readers; a sliced input would still work correctly since the absolute offsets index into the full null_key_in_valid, but it is an implicit precondition.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Addressed — the code now uses lists_cv.offsets_begin() (offset-aware) to build offsets_span, covering exactly the visible rows of a sliced input. A comment at map_utils.cu:97-100 explains this. The output-path offsets are also explicitly sliced with raw_offsets.offset() + input.offset().

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Addressed — offsets_span is now built from lists_cv.offsets_begin() with an explicit input.size() + 1 length (map_utils.cu:130-131). offsets_begin() already accounts for input.offset(), so the slice-aware precondition is enforced rather than implicit, and the sliced-input regression tests added in e1edb87 exercise the path.


// One BOOL8 per offsets segment: the max of that segment's null_key_in_valid flags, i.e.
// true when any entry in the segment has a null key inside a valid struct.
auto row_has_null_key = cudf::segmented_reduce(
*null_key_in_valid,
offsets_span,
*cudf::make_max_aggregation<cudf::segmented_reduce_aggregation>(),
cudf::data_type{cudf::type_id::BOOL8},
cudf::null_policy::EXCLUDE,
stream,
mr);

// Throw only when: row has no null struct entry AND row has a null key in a valid struct.
// For rows with null struct entries (has_null_entry = true), the whole output row is masked
// to null below, so their null keys are irrelevant — no exception should be thrown for them.
if (throw_on_null_key) {
auto no_null_entry = cudf::unary_operation(
*has_null_entry, cudf::unary_operator::NOT, stream, mr);
// NULL AND anything = NULL; reduce(any) skips nulls, so null rows are safely ignored.
// NOTE(review): BITWISE_AND on BOOL8 operands again; the review thread asks for LOGICAL_AND.
auto should_throw = cudf::binary_operation(
*no_null_entry,
*row_has_null_key,
cudf::binary_operator::BITWISE_AND,
cudf::data_type{cudf::type_id::BOOL8},
stream,
mr);
auto any_throw_scalar = cudf::reduce(
*should_throw,
*cudf::make_any_aggregation<cudf::reduce_aggregation>(),
cudf::data_type{cudf::type_id::BOOL8},
stream,
mr);
// Same collapse as for any_null_entry: an invalid scalar is treated as "nothing to throw".
bool const any_throw =
any_throw_scalar->is_valid(stream) &&
static_cast<cudf::numeric_scalar<bool>*>(any_throw_scalar.get())->value(stream);
if (any_throw) { throw cudf::logic_error("Cannot use null as map key."); }
}

// Null-mask rows that contain null struct entries.
// copy_if_else(lhs=null_scalar, rhs=input, mask=has_null_entry):
// mask[i] = true → null_scalar (row had a null struct entry → output null row)
// mask[i] = false → input[i] (row was fine → keep original)
// mask[i] = null → input[i] (outer null row stays null via input[i])
// NOTE(review): confirm that make_default_constructed_scalar accepts a LIST type and that the
// scalar it returns is invalid (null); the review thread questions whether it is valid.
auto null_scalar = cudf::make_default_constructed_scalar(input.type(), stream, mr);
Copy link

Copilot AI Apr 16, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

make_default_constructed_scalar creates a valid scalar (e.g., for LIST it will be an empty list), so copy_if_else will replace rows with null-struct entries with a valid default value rather than a null outer row. To actually mask rows to null per Spark semantics, construct an invalid scalar (or explicitly mark this scalar invalid) before passing it to copy_if_else.

Suggested change
auto null_scalar = cudf::make_default_constructed_scalar(input.type(), stream, mr);
auto null_scalar = cudf::make_default_constructed_scalar(input.type(), stream, mr);
null_scalar->set_valid_async(false, stream);

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This code path no longer exists. The current slow-path implementation does not use make_default_constructed_scalar or copy_if_else at all — it builds the null mask directly via bools_to_mask + bitmask_and + purge_nonempty_nulls.

// Slow-path result: per row, take null_scalar where has_null_entry is true, otherwise the
// corresponding input row.
return cudf::copy_if_else(*null_scalar, input, *has_null_entry, stream, mr);
}

} // namespace spark_rapids_jni
58 changes: 58 additions & 0 deletions src/main/cpp/src/map_utils.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION.
Comment thread
thirtiseven marked this conversation as resolved.
Outdated
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/memory_resource.hpp>

#include <rmm/cuda_stream_view.hpp>

namespace spark_rapids_jni {

/**
 * @brief Converts a LIST(STRUCT(KEY, VALUE)) column to a map column following Spark semantics.
 *
 * Spark semantics for map_from_entries:
 * - If a row's array contains any null struct entry (the whole struct is null), the output row
 *   is null — even if another entry in that same row has a null key.
 * - If a row's array contains no null struct entry but does contain a null key inside a valid
 *   struct, behavior depends on throw_on_null_key:
 *     - true  → throws a logic_error ("Cannot use null as map key.")
 *     - false → returns the row as-is (caller is responsible for deduplication policy)
 *
 * This function only handles null-struct masking and null-key validation.
 * Duplicate-key deduplication is left to the caller.
 *
 * @note A zero-row input is returned as an empty column right after the LIST type check; the
 *       STRUCT child checks are not evaluated for it.
 * @note The row-level rules above describe non-sliced input.
 *       NOTE(review): confirm that the per-row reduction is correct for sliced views.
 *
 * @param input Input LIST(STRUCT(KEY, VALUE)) column.
 * @param throw_on_null_key When true, throw if any valid-struct entry has a null key.
 * @param stream CUDA stream used for device memory operations and kernel launches.
 *               Defaults to cudf::get_default_stream().
 * @param mr Device memory resource used to allocate the returned column's memory.
 *           Defaults to cudf::get_current_device_resource_ref().
 * @return A new column equal to @p input except that rows containing null struct entries are
 *         replaced with a null outer row.
 * @throws cudf::logic_error if the input is not a LIST column, or if it has at least one row
 *         and its child is not a STRUCT with at least one child column (the key).
 * @throws cudf::logic_error if @p throw_on_null_key is true and any row (with no null struct
 *         entries) contains a null key inside a valid struct.
 */
std::unique_ptr<cudf::column> map_from_entries(
cudf::column_view const& input,
bool throw_on_null_key,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

} // namespace spark_rapids_jni
64 changes: 64 additions & 0 deletions src/main/java/com/nvidia/spark/rapids/jni/MapUtils.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION.
Comment thread
thirtiseven marked this conversation as resolved.
Outdated
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.nvidia.spark.rapids.jni;

import ai.rapids.cudf.ColumnVector;
import ai.rapids.cudf.ColumnView;
import ai.rapids.cudf.NativeDepsLoader;

/**
* Utility APIs for map column operations that require Spark-specific semantics
* not available in the standard cuDF Java bindings.
*/
public class MapUtils {
static {
NativeDepsLoader.loadNativeDeps();
}

/**
* Converts a LIST(STRUCT(KEY, VALUE)) column to a map column following Spark semantics.
*
* <p>Spark semantics for {@code map_from_entries}:
* <ul>
* <li>If a row's array contains any null struct entry (the whole struct is null), the output
* row is null — even if another entry in that same row has a null key inside a valid
* struct.</li>
* <li>If a row's array contains no null struct entry but a valid struct's key is null,
* behavior depends on {@code throwOnNullKey}:
* <ul>
* <li>{@code true} — throws a {@link RuntimeException}.</li>
* <li>{@code false} — returns the row as-is (caller handles dedup policy).</li>
* </ul>
* </li>
* </ul>
*
* <p>Duplicate-key deduplication is intentionally left to the caller so that the EXCEPTION
* and LAST_WIN policies can be applied after this function returns.
*
* @param input Input LIST(STRUCT(KEY, VALUE)) column.
* @param throwOnNullKey When {@code true}, throw if any valid-struct entry has a null key.
* @return A new column equal to {@code input} except that rows containing null struct entries
* are replaced with a null outer row.
* @throws RuntimeException if {@code throwOnNullKey} is true and any row (with no null struct
* entry) contains a null key inside a valid struct.
*/
public static ColumnVector mapFromEntries(ColumnView input, boolean throwOnNullKey) {
return new ColumnVector(mapFromEntries(input.getNativeView(), throwOnNullKey));
}

private static native long mapFromEntries(long inputHandle, boolean throwOnNullKey);
}
Loading
Loading