Skip to content

Fix integer overflows in pylibcudf from_column_view_of_arbitrary #18758

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion cpp/include/cudf/strings/strings_column_view.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
* Copyright (c) 2019-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -43,6 +43,8 @@ class strings_column_view : private column_view {
* @param strings_column The column view to wrap.
*/
strings_column_view(column_view strings_column);
// So we can use this from cython.
strings_column_view() = default;
strings_column_view(strings_column_view&&) = default; ///< Move constructor
strings_column_view(strings_column_view const&) = default; ///< Copy constructor
~strings_column_view() override = default;
Expand Down
18 changes: 6 additions & 12 deletions python/pylibcudf/pylibcudf/column.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@ from pylibcudf.libcudf.interop cimport (
to_arrow_device_raw,
to_arrow_schema_raw,
)
from pylibcudf.libcudf.scalar.scalar cimport scalar, numeric_scalar
from pylibcudf.libcudf.null_mask cimport bitmask_allocation_size_bytes
from pylibcudf.libcudf.scalar.scalar cimport scalar
from pylibcudf.libcudf.strings.strings_column_view cimport strings_column_view
from pylibcudf.libcudf.types cimport size_type, size_of as cpp_size_of, bitmask_type
from pylibcudf.libcudf.utilities.traits cimport is_fixed_width
from pylibcudf.libcudf.copying cimport get_element
Expand All @@ -49,7 +51,6 @@ from ._interop_helpers cimport (
_release_device_array,
_metadata_to_libcudf,
)
from .null_mask cimport bitmask_allocation_size_bytes
from .utils cimport _get_stream

from .gpumemoryview import _datatype_from_dtype_desc
Expand Down Expand Up @@ -89,21 +90,14 @@ cdef class OwnerWithCAI:
# The default size of 0 will be applied for any type that stores data in the
# children (such that the parent size is 0).
size = 0
cdef column_view offsets_column
cdef unique_ptr[scalar] last_offset
if cv.type().id() == type_id.EMPTY:
size = cv.size()
elif is_fixed_width(cv.type()):
# Cast to Python integers before multiplyling to avoid overflow.
# Cast to Python integers before multiplying to avoid overflow.
size = int(cv.size()) * int(cpp_size_of(cv.type()))
elif cv.type().id() == type_id.STRING:
# A strings column with no children is created for empty/all null, in which
# case the size remains 0. Otherwise, the size of the character array stored
# in the parent is the last offset in the offsets child.
if cv.num_children():
offsets_column = cv.child(0)
last_offset = get_element(offsets_column, offsets_column.size() - 1)
size = (<numeric_scalar[size_type] *> last_offset.get()).value()
# TODO: stream-ordered
size = strings_column_view(cv).chars_size(_get_stream().view())

obj.cai = {
"shape": (size,),
Expand Down
5 changes: 2 additions & 3 deletions python/pylibcudf/pylibcudf/gpumemoryview.pyx
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
# Copyright (c) 2023-2025, NVIDIA CORPORATION.

from libc.stddef cimport size_t
import functools
import operator

from .types cimport DataType, size_of, type_id

from pylibcudf.libcudf.types cimport size_type


__all__ = ["gpumemoryview"]

Expand Down Expand Up @@ -62,7 +61,7 @@ cdef class gpumemoryview:
self.ptr = cai["data"][0]

# Compute the buffer size.
cdef size_type itemsize = size_of(
cdef size_t itemsize = size_of(
_datatype_from_dtype_desc(
cai["typestr"][1:] # ignore the byteorder (the first char).
)
Expand Down
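12 changes: 12 additions & 0 deletions python/pylibcudf/pylibcudf/libcudf/strings/strings_column_view.pxd
Original file line number Diff line number Diff line change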
@@ -0,0 +1,12 @@
# Copyright (c) 2025, NVIDIA CORPORATION.

from libc.stdint cimport int64_t
from pylibcudf.exception_handler cimport libcudf_exception_handler
from pylibcudf.libcudf.column.column_view cimport column_view

from rmm.librmm.cuda_stream_view cimport cuda_stream_view

# Cython declaration of the C++ class cudf::strings_column_view from the
# libcudf header "cudf/strings/strings_column_view.hpp". Everything in this
# extern block is declared nogil.
cdef extern from "cudf/strings/strings_column_view.hpp" namespace "cudf" nogil:
cdef cppclass strings_column_view:
# Construct a strings view from a column_view. C++ exceptions raised by the
# constructor are translated through libcudf_exception_handler.
strings_column_view(column_view) except +libcudf_exception_handler
# Takes the CUDA stream view to use and returns a 64-bit integer; C++
# exceptions are translated through libcudf_exception_handler.
# NOTE(review): column.pyx uses this value as the byte size of a strings
# column's character data - confirm against the libcudf documentation.
int64_t chars_size(cuda_stream_view) except +libcudf_exception_handler
5 changes: 3 additions & 2 deletions python/pylibcudf/pylibcudf/libcudf/types.pxd
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
# Copyright (c) 2020-2025, NVIDIA CORPORATION.
from libc.stddef cimport size_t
from libc.stdint cimport int32_t, uint32_t
from libcpp cimport bool
from pylibcudf.exception_handler cimport libcudf_exception_handler
Expand Down Expand Up @@ -100,4 +101,4 @@ cdef extern from "cudf/types.hpp" namespace "cudf" nogil:
MIDPOINT
NEAREST

cdef size_type size_of(data_type t) except +libcudf_exception_handler
cdef size_t size_of(data_type t) except +libcudf_exception_handler
5 changes: 3 additions & 2 deletions python/pylibcudf/pylibcudf/types.pxd
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Copyright (c) 2023-2024, NVIDIA CORPORATION.
# Copyright (c) 2023-2025, NVIDIA CORPORATION.

from libc.stddef cimport size_t
from libc.stdint cimport int32_t
from libcpp cimport bool as cbool
from pylibcudf.libcudf.types cimport (
Expand Down Expand Up @@ -27,4 +28,4 @@ cdef class DataType:
@staticmethod
cdef DataType from_libcudf(data_type dt)

cpdef size_type size_of(DataType t)
cpdef size_t size_of(DataType t)
5 changes: 3 additions & 2 deletions python/pylibcudf/pylibcudf/types.pyx
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Copyright (c) 2023-2024, NVIDIA CORPORATION.
# Copyright (c) 2023-2025, NVIDIA CORPORATION.

from libc.stddef cimport size_t
from libc.stdint cimport int32_t
from pylibcudf.libcudf.types cimport (
data_type,
Expand Down Expand Up @@ -92,7 +93,7 @@ cdef class DataType:
ret.c_obj = dt
return ret

cpdef size_type size_of(DataType t):
cpdef size_t size_of(DataType t):
"""Returns the size in bytes of elements of the specified data_type.

Only fixed-width types are supported.
Expand Down
Loading