Skip to content

Commit 85b8f35

Browse files
Launch bounds + Minor Documentation Fixes (#115)
* Minor docfixes. * Checking if launch bound has any performance impact. * Added launch bounds to all kernels. * Added link to repo. * Modified dates of AMD cloud experiments. * More minor updates.
1 parent 27b4d4c commit 85b8f35

8 files changed

Lines changed: 75 additions & 41 deletions

File tree

docs/index.rst

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@
66
OpenEquivariance
77
==============================
88

9-
OpenEquivariance is a CUDA and HIP kernel generator for the Clebsch-Gordon
9+
`OpenEquivariance <https://github.com/PASSIONLab/OpenEquivariance>`_ is a CUDA and
10+
HIP kernel generator for the Clebsch-Gordon
1011
tensor product, a key kernel in equivariant graph neural networks. We offer
1112
an identical interface to e3nn and produce the same results
1213
(up to numerical roundoff). Our package exhibits up to an order of magnitude

docs/installation.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ NERSC Perlmutter (NVIDIA A100)
5959
""""""""""""""""""""""""""""""
6060

6161
.. code-block:: bash
62-
:caption: env.sh (last updated June 2024)
62+
:caption: env.sh (last updated June 2025)
6363
6464
module load gcc
6565
module load conda
@@ -79,7 +79,7 @@ To do this, follow the steps `here <https://docs.olcf.ornl.gov/software/analytic
7979

8080

8181
.. code-block:: bash
82-
:caption: env.sh (last updated June 2024)
82+
:caption: env.sh (last updated June 2025)
8383
8484
module load PrgEnv-gnu/8.6.0
8585
module load miniforge3/23.11.0-0

docs/tests_and_benchmarks.rst

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,8 @@ To set up the editable install and run the entire testsuite, use:
2525
.. code-block:: bash
2626
2727
git clone https://github.com/PASSIONLab/OpenEquivariance
28-
pip install -e .[dev]
2928
cd OpenEquivariance
29+
pip install -e .[dev]
3030
pytest
3131
3232
Browse the ``tests`` directory to run specific components.
@@ -41,8 +41,8 @@ and generate plots from our paper.
4141
.. code-block:: bash
4242
4343
git clone https://github.com/PASSIONLab/OpenEquivariance
44-
pip install -e .[bench]
4544
cd OpenEquivariance
45+
pip install -e .[bench]
4646
python tests/benchmark.py -o outputs/uvu uvu --plot
4747
python tests/benchmark.py -o outputs/uvw uvw --plot
4848
python tests/benchmark.py -o outputs/roofline roofline --plot
@@ -71,7 +71,8 @@ List of GPUs Tested
7171
OpenEquivariance has been tested successfully the following GPUs. Submit a pull
7272
request if you'd like to add your own!
7373

74-
- NVIDIA A100-SXM-40GB and A100-SXM-80GB (A. Glover, NERSC Perlmutter)
75-
- NVIDIA A5000 (V. Bharadwaj, UCB SLICE)
76-
- AMD MI250x (V. Bharadwaj, OLCF Frontier)
77-
- AMD MI300x (V. Bharadwaj, AMD Cloud)
74+
- NVIDIA A100-SXM-40GB and A100-SXM-80GB (A. Glover, NERSC Perlmutter, June 2025)
75+
- NVIDIA A5000 (V. Bharadwaj, UCB SLICE, June 2025)
76+
- NVIDIA V100 (V. Bharadwaj, LBNL Einsteinium, June 2025)
77+
- AMD MI250x (V. Bharadwaj, OLCF Frontier, June 2025)
78+
- AMD MI300x (V. Bharadwaj, AMD Cloud, February 2025)

openequivariance/templates/loop_unroll_batch.cuh

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@
55
transpose_load, transpose_store,
66
load_ir_segments, load_ir_segments_force,
77
store_ir_segments, declare_smem_variables,
8-
set_launch_bound_variables with context %}
8+
set_launch_bound_variables, launch_bounds
9+
with context%}
910

1011
{%- from 'loop_unroll_tp.cuh' import
1112
generate_segment_kernel_forward,
@@ -21,8 +22,9 @@ using WEIGHT_T = {{ forward_schedule.weight_dtype_cstr }};
2122
{{ generate_segment_kernel_forward(i, segment, forward_schedule.launch_config.warp_size) }}
2223
{%- endfor %}
2324

24-
__global__ void forward(
25-
size_t num_products, IRREP_T* L1_in, IRREP_T* L2_in, IRREP_T* L3_out, WEIGHT_T* weights) {
25+
__global__ void
26+
{{ launch_bounds(forward_schedule) }}
27+
forward(size_t num_products, IRREP_T* L1_in, IRREP_T* L2_in, IRREP_T* L3_out, WEIGHT_T* weights) {
2628

2729
extern __shared__ char s[];
2830
{{ set_launch_bound_variables(forward_schedule.launch_config) }}
@@ -64,8 +66,9 @@ __global__ void forward(
6466
{{ generate_segment_kernel_backward(i, segment, backward_schedule.launch_config.warp_size) }}
6567
{%- endfor %}
6668

67-
__global__ void backward(
68-
size_t num_products,
69+
__global__ void
70+
{{ launch_bounds(backward_schedule) }}
71+
backward(size_t num_products,
6972
IRREP_T* L1_in, IRREP_T* L1_grad,
7073
IRREP_T* L2_in, IRREP_T* L2_grad,
7174
WEIGHT_T* weights, WEIGHT_T* weights_grad,
@@ -142,7 +145,9 @@ __global__ void backward(
142145
* The double backward kernel involves two passes: one combining three forward calls (A),
143146
* and the second combining three backward calls (B).
144147
*/
145-
__global__ void double_backward_A(
148+
__global__ void
149+
{{ launch_bounds(forward_schedule) }}
150+
double_backward_A(
146151
size_t num_products,
147152
IRREP_T* L1_in, IRREP_T* L2_in, WEIGHT_T* W, IRREP_T* L3_grad, // Inputs of backward op
148153
IRREP_T* L1_dgrad, IRREP_T* L2_dgrad, IRREP_T* W_dgrad, // Gradients w.r.t outputs of backward op
@@ -210,7 +215,9 @@ __global__ void double_backward_A(
210215
{%- endfor %}
211216

212217
{% set schedule = double_backward_schedule %}
213-
__global__ void double_backward_B(
218+
__global__ void
219+
{{ launch_bounds(double_backward_schedule) }}
220+
double_backward_B(
214221
size_t num_products,
215222
IRREP_T* L1_in, IRREP_T* L2_in, WEIGHT_T* W, IRREP_T* L3_grad, // Inputs of backward op
216223
IRREP_T* L1_dgrad, IRREP_T* L2_dgrad, IRREP_T* W_dgrad, // Gradients w.r.t outputs of backward op

openequivariance/templates/loop_unroll_conv_atomic.cuh

Lines changed: 23 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@
77
load_ir_segments, load_ir_segments_force,
88
store_ir_segments,
99
declare_smem_variables,
10-
set_launch_bound_variables with context %}
10+
set_launch_bound_variables, launch_bounds
11+
with context %}
1112

1213
#define THREADS_PER_WARP {{ forward_schedule.launch_config.warp_size }} // Warp size should be the same for forward and backward
1314
#define FULL_MASK 0xffffffff
@@ -30,20 +31,27 @@ struct ConvData {
3031
unsigned long node_count;
3132
};
3233

33-
__global__ void fixup_forward(void* workspace, IRREP_T* dst_ptr) {
34+
__global__ void
35+
{{ launch_bounds(forward_schedule) }}
36+
fixup_forward(void* workspace, IRREP_T* dst_ptr) {
3437
// Empty, no fixup
3538
}
3639

37-
__global__ void fixup_backward(void* workspace, IRREP_T* dst_ptr) {
40+
__global__ void
41+
{{ launch_bounds(backward_schedule) }}
42+
fixup_backward(void* workspace, IRREP_T* dst_ptr) {
3843
// Empty, no fixup
3944
}
4045

41-
__global__ void fixup_double_backwardB(void* workspace, IRREP_T* dst_ptr) {
46+
__global__ void
47+
{{ launch_bounds(double_backward_schedule) }}
48+
fixup_double_backwardB(void* workspace, IRREP_T* dst_ptr) {
4249
// Empty, no fixup
4350
}
4451

45-
__global__ void forward(
46-
IRREP_T* L1_in,
52+
__global__ void
53+
{{ launch_bounds(forward_schedule) }}
54+
forward(IRREP_T* L1_in,
4755
IRREP_T* L2_in,
4856
WEIGHT_T* weights,
4957
IRREP_T* L3_out,
@@ -96,8 +104,9 @@ __global__ void forward(
96104
{{ generate_segment_kernel_backward(i, segment, backward_schedule.launch_config.warp_size) }}
97105
{%- endfor %}
98106

99-
__global__ void backward(
100-
IRREP_T* L1_in, IRREP_T* L1_grad,
107+
__global__ void
108+
{{ launch_bounds(backward_schedule) }}
109+
backward(IRREP_T* L1_in, IRREP_T* L1_grad,
101110
IRREP_T* L2_in, IRREP_T* L2_grad,
102111
WEIGHT_T* weights, WEIGHT_T* weights_grad,
103112
IRREP_T* L3_grad, ConvData c, void* workspace, unsigned {{idx_type}}* transpose_perm) {
@@ -170,8 +179,9 @@ __global__ void backward(
170179
}
171180
}
172181

173-
__global__ void double_backward_A(
174-
IRREP_T* L1_in, IRREP_T* L2_in, WEIGHT_T* W, IRREP_T* L3_grad,
182+
__global__ void
183+
{{ launch_bounds(forward_schedule) }}
184+
double_backward_A(IRREP_T* L1_in, IRREP_T* L2_in, WEIGHT_T* W, IRREP_T* L3_grad,
175185
IRREP_T* L1_dgrad, IRREP_T* L2_dgrad, IRREP_T* W_dgrad,
176186
IRREP_T* L1_grad, IRREP_T* L2_grad, WEIGHT_T* W_grad, IRREP_T* L3_dgrad,
177187
ConvData c, void* workspace, unsigned {{idx_type}}* transpose_perm) {
@@ -247,8 +257,9 @@ __global__ void double_backward_A(
247257

248258
{% set schedule = double_backward_schedule %}
249259

250-
__global__ void double_backward_B(
251-
IRREP_T* L1_in, IRREP_T* L2_in, WEIGHT_T* W, IRREP_T* L3_grad,
260+
__global__ void
261+
{{ launch_bounds(double_backward_schedule) }}
262+
double_backward_B(IRREP_T* L1_in, IRREP_T* L2_in, WEIGHT_T* W, IRREP_T* L3_grad,
252263
IRREP_T* L1_dgrad, IRREP_T* L2_dgrad, IRREP_T* W_dgrad,
253264
IRREP_T* L1_grad, IRREP_T* L2_grad, WEIGHT_T* W_grad, IRREP_T* L3_dgrad,
254265
ConvData c, void* workspace, unsigned {{idx_type}}* transpose_perm) {

openequivariance/templates/loop_unroll_conv_det.cuh

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@
66
transpose_load, transpose_store,
77
load_ir_segments, store_ir_segments,
88
declare_smem_variables,
9-
set_launch_bound_variables with context %}
9+
set_launch_bound_variables, launch_bounds
10+
with context %}
1011

1112
#define THREADS_PER_WARP {{ forward_schedule.launch_config.warp_size }} // Warp size should be the same for forward and backward
1213
#define FULL_MASK 0xffffffff
@@ -30,8 +31,11 @@ struct ConvData {
3031
};
3132

3233

33-
{%- macro generate_fixup_kernel(name, warp_size, dim, fixup_offset) %}
34-
__global__ void {{name}}(void* workspace, IRREP_T* dst_ptr) {
34+
{%- macro generate_fixup_kernel(name, schedule, dim, fixup_offset) %}
35+
{%- set warp_size = schedule.launch_config.warp_size %}
36+
__global__ void
37+
{{ launch_bounds(schedule) }}
38+
{{name}}(void* workspace, IRREP_T* dst_ptr) {
3539
/*
3640
* Workspace consists of:
3741
* fixup_dim * warps_launched * sizeof(IRREP_T): Data
@@ -61,7 +65,7 @@ __global__ void {{name}}(void* workspace, IRREP_T* dst_ptr) {
6165
}
6266
{%- endmacro %}
6367

64-
{{ generate_fixup_kernel("fixup_forward", forward_schedule.launch_config.warp_size, forward_schedule.L3.dim, forward_workspace_offset) }}
68+
{{ generate_fixup_kernel("fixup_forward", forward_schedule, forward_schedule.L3.dim, forward_workspace_offset) }}
6569

6670
template<int ROW_LEN>
6771
__device__ __forceinline__ void kahanAdd(IRREP_T* c_arr, IRREP_T* sum_arr, int lane_id) {
@@ -88,7 +92,9 @@ __device__ __forceinline__ void kahanAdd(IRREP_T* c_arr, IRREP_T* sum_arr, int l
8892
}
8993
}
9094

91-
__global__ void forward(
95+
__global__ void
96+
{{ launch_bounds(forward_schedule) }}
97+
forward(
9298
IRREP_T* L1_in,
9399
IRREP_T* L2_in,
94100
WEIGHT_T* weights,
@@ -174,10 +180,11 @@ __global__ void forward(
174180
{{ generate_segment_kernel_backward(i, segment, backward_schedule.launch_config.warp_size) }}
175181
{%- endfor %}
176182

177-
{{ generate_fixup_kernel("fixup_backward", backward_schedule.launch_config.warp_size, backward_schedule.L1.dim, backward_workspace_offset) }}
183+
{{ generate_fixup_kernel("fixup_backward", backward_schedule, backward_schedule.L1.dim, backward_workspace_offset) }}
178184

179-
__global__ void backward(
180-
IRREP_T* L1_in, IRREP_T* L1_grad,
185+
__global__ void
186+
{{ launch_bounds(backward_schedule) }}
187+
backward(IRREP_T* L1_in, IRREP_T* L1_grad,
181188
IRREP_T* L2_in, IRREP_T* L2_grad,
182189
WEIGHT_T* weights, WEIGHT_T* weights_grad,
183190
IRREP_T* L3_grad, ConvData c, void* workspace_raw,
@@ -284,8 +291,9 @@ __global__ void backward(
284291
}
285292

286293

287-
__global__ void double_backward_A(
288-
IRREP_T* L1_in, IRREP_T* L2_in, WEIGHT_T* W, IRREP_T* L3_grad,
294+
__global__ void
295+
{{ launch_bounds(forward_schedule) }}
296+
double_backward_A(IRREP_T* L1_in, IRREP_T* L2_in, WEIGHT_T* W, IRREP_T* L3_grad,
289297
IRREP_T* L1_dgrad, IRREP_T* L2_dgrad, IRREP_T* W_dgrad,
290298
IRREP_T* L1_grad, IRREP_T* L2_grad, WEIGHT_T* W_grad, IRREP_T* L3_dgrad,
291299
ConvData c, void* workspace_raw, unsigned {{idx_type}}* transpose_perm) {
@@ -391,16 +399,17 @@ __global__ void double_backward_A(
391399
} {%- endfor %}
392400
}
393401

394-
{{ generate_fixup_kernel("fixup_double_backwardB", double_backward_schedule.launch_config.warp_size, double_backward_schedule.L1.dim, double_backwardB_offset) }}
402+
{{ generate_fixup_kernel("fixup_double_backwardB", double_backward_schedule, double_backward_schedule.L1.dim, double_backwardB_offset) }}
395403

396404
{%- for i, segment in enumerate(double_backward_schedule.segments) %}
397405
{{ generate_segment_kernel_backward(i, segment, double_backward_schedule.launch_config.warp_size, double_bwd=True) }}
398406
{%- endfor %}
399407

400408
{% set schedule = double_backward_schedule %}
401409

402-
__global__ void double_backward_B(
403-
IRREP_T* L1_in, IRREP_T* L2_in, WEIGHT_T* W, IRREP_T* L3_grad,
410+
__global__ void
411+
{{ launch_bounds(double_backward_schedule) }}
412+
double_backward_B(IRREP_T* L1_in, IRREP_T* L2_in, WEIGHT_T* W, IRREP_T* L3_grad,
404413
IRREP_T* L1_dgrad, IRREP_T* L2_dgrad, IRREP_T* W_dgrad,
405414
IRREP_T* L1_grad, IRREP_T* L2_grad, WEIGHT_T* W_grad, IRREP_T* L3_dgrad,
406415
ConvData c, void* workspace_raw, unsigned {{idx_type}}* transpose_perm) {

openequivariance/templates/macros.jinja

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,3 +159,7 @@ Keys map to lists of tuples with (name, dtype, num_elements) of each subarray.
159159
{%- endfor %}
160160
}
161161
{%- endmacro %}
162+
163+
{%- macro launch_bounds(schedule) %}
164+
__launch_bounds__({{schedule.launch_config.num_threads}})
165+
{%- endmacro %}

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ bench = [
3131
]
3232

3333
dev = [
34+
"e3nn",
3435
"pre-commit",
3536
"ruff",
3637
"pytest",

0 commit comments

Comments
 (0)