Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor Python Neighborhood Sample #4988

Open
wants to merge 7 commits into
base: branch-25.04
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
150 changes: 150 additions & 0 deletions file_to_store/debug_sampling_utilities.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
# Copyright (c) 2023-2024, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import cupy
import cudf

import warnings


def sampling_results_from_cupy_array_dict(
    cupy_array_dict,
    weight_t,
    num_hops,
    return_offsets=False,
    renumber=False,
):
    """
    Convert the dict of cupy arrays produced by the pylibcugraph sampling
    wrapper into cudf DataFrame(s).

    Parameters
    ----------
    cupy_array_dict : dict
        Arrays keyed by column name.  Expected keys: "majors", "minors",
        "label_hop_offsets", "batch_id"; optional keys: "weight",
        "edge_id", "edge_type", "major_offsets", and — when renumbering —
        "renumber_map" and "renumber_map_offsets".
    weight_t : dtype
        Weight dtype.  Kept for interface compatibility; not used here.
    num_hops : int
        Number of hops sampled per batch; used to build the "hop_id"
        column when offsets are not returned.
    return_offsets : bool, default False
        If True, return label/hop offsets in a separate DataFrame instead
        of materializing per-row "batch_id"/"hop_id" columns.
    renumber : bool, default False
        If True, also return the renumber-map DataFrame.

    Returns
    -------
    tuple of cudf.DataFrame
        (results_df,), optionally followed by offsets_df (when
        return_offsets is True) and renumber_df (when renumber is True).
    """
    results_df = cudf.DataFrame()

    major_col_name = "majors"

    majors = cupy_array_dict["majors"]
    if majors is not None:
        results_df["majors"] = majors

    # Optional per-edge columns; every array present here has the same
    # length as "minors".
    results_df_cols = [
        "minors",
        "weight",
        "edge_id",
        "edge_type",
    ]

    for col in results_df_cols:
        if col in cupy_array_dict:
            results_df[col] = cupy_array_dict[col]

    label_hop_offsets = cupy_array_dict["label_hop_offsets"]
    batch_ids = cupy_array_dict["batch_id"]

    if renumber:
        renumber_df = cudf.DataFrame(
            {
                "renumber_map": cupy_array_dict["renumber_map"],
            }
        )

        if not return_offsets:
            if len(batch_ids) > 0:
                # Expand each batch id across its renumber-map segment so
                # the resulting column lines up 1:1 with "renumber_map".
                batch_ids_r = cudf.Series(cupy.unique(batch_ids)).repeat(
                    cupy.diff(cupy_array_dict["renumber_map_offsets"])
                )
                batch_ids_r.reset_index(drop=True, inplace=True)
                renumber_df["batch_id"] = batch_ids_r
            else:
                renumber_df["batch_id"] = None

    if return_offsets:
        batches_series = cudf.Series(
            batch_ids,
            name="batch_id",
        )
        offsets_df = cudf.Series(
            label_hop_offsets,
            name="offsets",
        ).to_frame()

        # There is always one more offset than there are batches; when the
        # lengths differ an outer join pads the shorter column with nulls.
        if len(batches_series) > len(offsets_df):
            # this is extremely rare so the inefficiency is ok
            offsets_df = offsets_df.join(batches_series, how="outer").sort_index()
        else:
            offsets_df["batch_id"] = batches_series

        if renumber:
            renumber_offset_series = cudf.Series(
                cupy_array_dict["renumber_map_offsets"], name="renumber_map_offsets"
            )

            if len(renumber_offset_series) > len(offsets_df):
                # this is extremely rare so the inefficiency is ok
                offsets_df = offsets_df.join(
                    renumber_offset_series, how="outer"
                ).sort_index()
            else:
                offsets_df["renumber_map_offsets"] = renumber_offset_series

    else:
        if len(batch_ids) > 0:
            results_df["batch_id"] = batch_ids
        else:
            results_df["batch_id"] = None

        # Generate the per-row hop column by repeating 0..num_hops-1 once
        # per batch, each value expanded by its label_hop_offsets segment.
        if len(batch_ids) > 0:
            hop_ids_r = cudf.Series(cupy.arange(num_hops))
            hop_ids_r = cudf.concat(
                [hop_ids_r] * len(cudf.Series(batch_ids).unique()),
                ignore_index=True,
            )

            hop_ids_r = (
                cudf.Series(hop_ids_r, name="hop_id")
                .repeat(cupy.diff(label_hop_offsets))
                .reset_index(drop=True)
            )
        else:
            hop_ids_r = cudf.Series(name="hop_id", dtype="int32")

        results_df = results_df.join(hop_ids_r, how="outer").sort_index()

    # CSR-style output: no per-edge "majors" column, so carry the
    # major_offsets array instead.
    if major_col_name not in results_df:
        major_offsets_series = cudf.Series(
            cupy_array_dict["major_offsets"], name="major_offsets"
        )
        if len(major_offsets_series) > len(results_df):
            # this is extremely rare so the inefficiency is ok
            results_df = results_df.join(
                major_offsets_series, how="outer"
            ).sort_index()
        else:
            results_df["major_offsets"] = major_offsets_series

    if return_offsets:
        if renumber:
            return results_df, offsets_df, renumber_df
        else:
            return results_df, offsets_df

    if renumber:
        return results_df, renumber_df

    return (results_df,)
Loading
Loading