From 2c1ab8520f37c2b3153fd208404adbec71dd7391 Mon Sep 17 00:00:00 2001 From: Andrew Naylor Date: Fri, 12 May 2023 11:12:25 -0700 Subject: [PATCH 1/2] Add cudaSetDevice to scan to enable PrefixSumCUDA to run on cuda:1,2,3... --- prefix_sum.cu | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/prefix_sum.cu b/prefix_sum.cu index 9aaef36..a5376b1 100644 --- a/prefix_sum.cu +++ b/prefix_sum.cu @@ -39,7 +39,7 @@ // scan.cuh void sequential_scan(int* output, int* input, int length); void blockscan(int *output, int *input, int length, bool bcao); -void scan(int *output, int *input, int length, bool bcao); +void scan(int *output, int *input, int length, int device, bool bcao); void scanLargeDeviceArray(int *output, int *input, int length, bool bcao); void scanSmallDeviceArray(int *d_out, int *d_in, int length, bool bcao); @@ -80,6 +80,7 @@ void PrefixSumCUDA( grid_off.contiguous().data_ptr(), grid_cnt.contiguous().data_ptr(), num_grids, + grid_cnt.device().index(), true ); @@ -132,7 +133,8 @@ void blockscan(int *d_out, int *d_in, int length, bool bcao) { return; } -void scan(int *d_out, int *d_in, int length, bool bcao) { +void scan(int *d_out, int *d_in, int length, int device, bool bcao) { + cudaSetDevice(device); if (length > ELEMENTS_PER_BLOCK) { scanLargeDeviceArray(d_out, d_in, length, bcao); } From 008f3239e3f3e65e48c04796d606c5595f2417bf Mon Sep 17 00:00:00 2001 From: Andrew Naylor Date: Thu, 25 May 2023 00:05:34 -0700 Subject: [PATCH 2/2] Try build with ninja --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index fee3f3a..98bba14 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ class BuildExtension(torch.utils.cpp_extension.BuildExtension): def __init__(self, *args, **kwargs): - super().__init__(use_ninja=False, *args, **kwargs) + super().__init__(use_ninja=True, *args, **kwargs) nvcc_args = [] nvcc_flags_env = os.getenv("NVCC_FLAGS", "")