Skip to content

Commit cc75b05

Browse files
authored
Change IPv4 convert APIs to support UINT32 instead of INT64 (#16489)
Changes the integer type used by `cudf::strings::ipv4_to_integers` (output column) and `cudf::strings::integers_to_ipv4` (input column) from INT64 to UINT32. The INT64 type was originally chosen because libcudf did not support unsigned types at the time. This is a breaking change because the basic input/output type is changed.

Closes #16324

Authors:
- David Wendt (https://github.com/davidwendt)

Approvers:
- Matthew Roeschke (https://github.com/mroeschke)
- https://github.com/brandon-b-miller
- Karthikeyan (https://github.com/karthikeyann)

URL: #16489
1 parent a94512a commit cc75b05

File tree

5 files changed

+20
-23
lines changed

5 files changed

+20
-23
lines changed

cpp/include/cudf/strings/convert/convert_ipv4.hpp

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -44,15 +44,12 @@ namespace strings {
4444
* No checking is done on the format. If a string is not in IPv4 format, the resulting
4545
* integer is undefined.
4646
*
47-
* The resulting 32-bit integer is placed in an int64_t to avoid setting the sign-bit
48-
* in an int32_t type. This could be changed if cudf supported a UINT32 type in the future.
49-
*
5047
* Any null entries will result in corresponding null entries in the output column.
5148
*
5249
* @param input Strings instance for this operation
5350
* @param stream CUDA stream used for device memory operations and kernel launches
5451
* @param mr Device memory resource used to allocate the returned column's device memory
55-
* @return New INT64 column converted from strings
52+
* @return New UINT32 column converted from strings
5653
*/
5754
std::unique_ptr<column> ipv4_to_integers(
5855
strings_column_view const& input,
@@ -68,13 +65,11 @@ std::unique_ptr<column> ipv4_to_integers(
6865
* Each input integer is dissected into four integers by dividing the input into 8-bit sections.
6966
* These sub-integers are then converted into [0-9] characters and placed between '.' characters.
7067
*
71-
* No checking is done on the input integer value. Only the lower 32-bits are used.
72-
*
7368
* Any null entries will result in corresponding null entries in the output column.
7469
*
75-
* @throw cudf::logic_error if the input column is not INT64 type.
70+
* @throw cudf::logic_error if the input column is not UINT32 type.
7671
*
77-
* @param integers Integer (INT64) column to convert
72+
* @param integers Integer (UINT32) column to convert
7873
* @param stream CUDA stream used for device memory operations and kernel launches
7974
* @param mr Device memory resource used to allocate the returned column's device memory
8075
* @return New strings column

cpp/src/strings/convert/convert_ipv4.cu

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ namespace {
4646
struct ipv4_to_integers_fn {
4747
column_device_view const d_strings;
4848

49-
__device__ int64_t operator()(size_type idx)
49+
__device__ uint32_t operator()(size_type idx)
5050
{
5151
if (d_strings.is_null(idx)) return 0;
5252
string_view d_str = d_strings.element<string_view>(idx);
@@ -66,7 +66,7 @@ struct ipv4_to_integers_fn {
6666
}
6767
}
6868
uint32_t result = (ipvals[0] << 24) + (ipvals[1] << 16) + (ipvals[2] << 8) + ipvals[3];
69-
return static_cast<int64_t>(result);
69+
return result;
7070
}
7171
};
7272

@@ -79,18 +79,18 @@ std::unique_ptr<column> ipv4_to_integers(strings_column_view const& input,
7979
{
8080
size_type strings_count = input.size();
8181
if (strings_count == 0) {
82-
return make_numeric_column(data_type{type_id::INT64}, 0, mask_state::UNALLOCATED, stream);
82+
return make_numeric_column(data_type{type_id::UINT32}, 0, mask_state::UNALLOCATED, stream);
8383
}
8484

8585
auto strings_column = column_device_view::create(input.parent(), stream);
8686
// create output column copying the strings' null-mask
87-
auto results = make_numeric_column(data_type{type_id::INT64},
87+
auto results = make_numeric_column(data_type{type_id::UINT32},
8888
strings_count,
8989
cudf::detail::copy_bitmask(input.parent(), stream, mr),
9090
input.null_count(),
9191
stream,
9292
mr);
93-
auto d_results = results->mutable_view().data<int64_t>();
93+
auto d_results = results->mutable_view().data<uint32_t>();
9494
// fill output column with ipv4 integers
9595
thrust::transform(rmm::exec_policy(stream),
9696
thrust::make_counting_iterator<size_type>(0),
@@ -135,7 +135,7 @@ struct integers_to_ipv4_fn {
135135
return;
136136
}
137137

138-
auto const ip_number = d_column.element<int64_t>(idx);
138+
auto const ip_number = d_column.element<uint32_t>(idx);
139139

140140
char* out_ptr = d_chars ? d_chars + d_offsets[idx] : nullptr;
141141
int shift_bits = 24;
@@ -165,7 +165,7 @@ std::unique_ptr<column> integers_to_ipv4(column_view const& integers,
165165
{
166166
if (integers.is_empty()) return make_empty_column(type_id::STRING);
167167

168-
CUDF_EXPECTS(integers.type().id() == type_id::INT64, "Input column must be type_id::INT64 type");
168+
CUDF_EXPECTS(integers.type().id() == type_id::UINT32, "Input column must be UINT32 type");
169169

170170
auto d_column = column_device_view::create(integers, stream);
171171
auto [offsets_column, chars] =

cpp/tests/strings/ipv4_tests.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,8 @@ TEST_F(StringsConvertTest, IPv4ToIntegers)
4040
auto strings_view = cudf::strings_column_view(strings);
4141
auto results = cudf::strings::ipv4_to_integers(strings_view);
4242

43-
std::vector<int64_t> h_expected{0, 0, 0, 698875905, 2130706433, 700776449, 3232235521};
44-
cudf::test::fixed_width_column_wrapper<int64_t> expected(
43+
std::vector<uint32_t> h_expected{0, 0, 0, 698875905, 2130706433, 700776449, 3232235521};
44+
cudf::test::fixed_width_column_wrapper<uint32_t> expected(
4545
h_expected.cbegin(),
4646
h_expected.cend(),
4747
thrust::make_transform_iterator(h_strings.begin(),
@@ -59,8 +59,8 @@ TEST_F(StringsConvertTest, IntegersToIPv4)
5959
thrust::make_transform_iterator(h_strings.begin(),
6060
[](auto const str) { return str != nullptr; }));
6161

62-
std::vector<int64_t> h_column{3232235521, 167772161, 0, 0, 700055553, 700776449};
63-
cudf::test::fixed_width_column_wrapper<int64_t> column(
62+
std::vector<uint32_t> h_column{3232235521, 167772161, 0, 0, 700055553, 700776449};
63+
cudf::test::fixed_width_column_wrapper<uint32_t> column(
6464
h_column.cbegin(),
6565
h_column.cend(),
6666
thrust::make_transform_iterator(h_strings.begin(),

python/cudf/cudf/core/column/numerical.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -313,8 +313,8 @@ def normalize_binop_value(
313313
return NotImplemented
314314

315315
def int2ip(self) -> "cudf.core.column.StringColumn":
316-
if self.dtype != cudf.dtype("int64"):
317-
raise TypeError("Only int64 type can be converted to ip")
316+
if self.dtype != cudf.dtype("uint32"):
317+
raise TypeError("Only uint32 type can be converted to ip")
318318

319319
return libcudf.string_casting.int2ip(self)
320320

python/cudf/cudf/tests/test_string.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2672,7 +2672,9 @@ def test_string_ip4_to_int():
26722672

26732673

26742674
def test_string_int_to_ipv4():
2675-
gsr = cudf.Series([0, None, 0, 698875905, 2130706433, 700776449])
2675+
gsr = cudf.Series([0, None, 0, 698875905, 2130706433, 700776449]).astype(
2676+
"uint32"
2677+
)
26762678
expected = cudf.Series(
26772679
["0.0.0.0", None, "0.0.0.0", "41.168.0.1", "127.0.0.1", "41.197.0.1"]
26782680
)
@@ -2718,7 +2720,7 @@ def test_string_isipv4():
27182720

27192721

27202722
@pytest.mark.parametrize(
2721-
"dtype", sorted(list(dtypeutils.NUMERIC_TYPES - {"int64", "uint64"}))
2723+
"dtype", sorted(list(dtypeutils.NUMERIC_TYPES - {"uint32"}))
27222724
)
27232725
def test_string_int_to_ipv4_dtype_fail(dtype):
27242726
gsr = cudf.Series([1, 2, 3, 4, 5]).astype(dtype)

0 commit comments

Comments
 (0)