Skip to content

Commit 937656d

Browse files
author
N
committed
another tentative
1 parent da74b8a commit 937656d

8 files changed

Lines changed: 2591 additions & 3 deletions

File tree

examples/multi_tensor.ptx

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
// PTX ISA version this module is written against.
.version 8.0
2+
// Target architecture the module is generated for.
.target sm_52
3+
// Addresses are 64 bits wide throughout the module.
.address_size 64
4+
5+
6+
7+
//-----------------------------------------------------------------------
// multi_tensor_add_kernel_nested_array(float**, float**, float**, int)
//
// Params:
//   param_0, param_1 : tables of 8-byte pointers to the two f32 inputs
//   param_2          : table of 8-byte pointers to the f32 outputs
//   param_3          : signed 32-bit element bound n
//
// For every slot t in [0, 1024) and every element index i owned by this
// thread (i starts at ctaid.x * ntid.x + tid.x and advances by
// ntid.x * nctaid.x while i < n):
//     param_2[t][i] = param_0[t][i] + param_1[t][i]     (add.ftz.f32)
//
// All three buffer pointers are re-read from their tables on every
// element iteration; that order of memory operations is kept as is.
//
// NOTE(review): i is advanced with 32-bit signed arithmetic, so it can
// wrap if n is close to INT_MAX -- confirm callers keep n well below that.
//-----------------------------------------------------------------------
.visible .entry _Z36multi_tensor_add_kernel_nested_arrayPPfS0_S0_i(
    .param .u64 _Z36multi_tensor_add_kernel_nested_arrayPPfS0_S0_i_param_0,
    .param .u64 _Z36multi_tensor_add_kernel_nested_arrayPPfS0_S0_i_param_1,
    .param .u64 _Z36multi_tensor_add_kernel_nested_arrayPPfS0_S0_i_param_2,
    .param .u32 _Z36multi_tensor_add_kernel_nested_arrayPPfS0_S0_i_param_3
)
{
    .reg .pred %p_has_work, %p_more, %p_next;
    .reg .f32  %f_in0, %f_in1, %f_sum;
    .reg .b32  %r_n, %r_ntid, %r_nctaid, %r_ctaid, %r_tid;
    .reg .b32  %r_first, %r_step, %r_slot, %r_elem;
    .reg .b64  %rd_tab0_gen, %rd_tab1_gen, %rd_tab2_gen;
    .reg .b64  %rd_tab0, %rd_tab1, %rd_tab2;
    .reg .b64  %rd_slot_off, %rd_slot0, %rd_slot1, %rd_slot2;
    .reg .b64  %rd_buf0_gen, %rd_buf1_gen, %rd_buf2_gen;
    .reg .b64  %rd_buf0, %rd_buf1, %rd_buf2;
    .reg .b64  %rd_elem_off, %rd_addr0, %rd_addr1, %rd_addr2;

    // ---- arguments -----------------------------------------------------
    ld.param.u32        %r_n, [_Z36multi_tensor_add_kernel_nested_arrayPPfS0_S0_i_param_3];
    ld.param.u64        %rd_tab0_gen, [_Z36multi_tensor_add_kernel_nested_arrayPPfS0_S0_i_param_0];
    ld.param.u64        %rd_tab1_gen, [_Z36multi_tensor_add_kernel_nested_arrayPPfS0_S0_i_param_1];
    ld.param.u64        %rd_tab2_gen, [_Z36multi_tensor_add_kernel_nested_arrayPPfS0_S0_i_param_2];
    cvta.to.global.u64  %rd_tab0, %rd_tab0_gen;
    cvta.to.global.u64  %rd_tab1, %rd_tab1_gen;
    cvta.to.global.u64  %rd_tab2, %rd_tab2_gen;

    // ---- per-thread start index and stride -----------------------------
    mov.u32             %r_ntid, %ntid.x;
    mov.u32             %r_nctaid, %nctaid.x;
    mul.lo.s32          %r_step, %r_ntid, %r_nctaid;          // step = ntid.x * nctaid.x
    mov.u32             %r_ctaid, %ctaid.x;
    mov.u32             %r_tid, %tid.x;
    mad.lo.s32          %r_first, %r_ctaid, %r_ntid, %r_tid;  // first = ctaid.x * ntid.x + tid.x

    mov.u32             %r_slot, 0;

$L_slot_loop:
    // Threads whose first index is already >= n have nothing to do for
    // this slot, but still advance the slot counter below.
    setp.lt.s32         %p_has_work, %r_first, %r_n;
    @!%p_has_work bra   $L_next_slot;

    // Addresses of entry [slot] in each of the three pointer tables.
    mul.wide.s32        %rd_slot_off, %r_slot, 8;
    add.s64             %rd_slot0, %rd_tab0, %rd_slot_off;
    add.s64             %rd_slot1, %rd_tab1, %rd_slot_off;
    add.s64             %rd_slot2, %rd_tab2, %rd_slot_off;
    mov.u32             %r_elem, %r_first;

$L_elem_loop:
    mul.wide.s32        %rd_elem_off, %r_elem, 4;             // byte offset of f32 element

    ld.global.u64       %rd_buf0_gen, [%rd_slot0];
    cvta.to.global.u64  %rd_buf0, %rd_buf0_gen;
    add.s64             %rd_addr0, %rd_buf0, %rd_elem_off;

    ld.global.u64       %rd_buf1_gen, [%rd_slot1];
    cvta.to.global.u64  %rd_buf1, %rd_buf1_gen;
    add.s64             %rd_addr1, %rd_buf1, %rd_elem_off;

    ld.global.f32       %f_in1, [%rd_addr1];
    ld.global.f32       %f_in0, [%rd_addr0];
    add.ftz.f32         %f_sum, %f_in0, %f_in1;

    // The output pointer is fetched only after both inputs were read.
    ld.global.u64       %rd_buf2_gen, [%rd_slot2];
    cvta.to.global.u64  %rd_buf2, %rd_buf2_gen;
    add.s64             %rd_addr2, %rd_buf2, %rd_elem_off;
    st.global.f32       [%rd_addr2], %f_sum;

    add.s32             %r_elem, %r_elem, %r_step;
    setp.lt.s32         %p_more, %r_elem, %r_n;
    @%p_more bra        $L_elem_loop;

$L_next_slot:
    add.s32             %r_slot, %r_slot, 1;
    setp.lt.u32         %p_next, %r_slot, 1024;               // fixed count of 1024 slots
    @%p_next bra        $L_slot_loop;

    ret;
}
72+
73+
//-----------------------------------------------------------------------
// multi_tensor_add_kernel_meta_data(TensorListMetaData*, int)
//
// Params:
//   param_0 : pointer to the metadata block
//   param_1 : signed 32-bit element bound n
//
// For every slot t in [0, 1024) three 8-byte pointers are read from the
// metadata block at byte offsets t*8, t*8 + 8192 and t*8 + 16384. Then,
// for every element index i owned by this thread (i starts at
// ctaid.x * ntid.x + tid.x and advances by ntid.x * nctaid.x while
// i < n):
//     ptr@16384[i] = ptr@0[i] + ptr@8192[i]              (add.ftz.f32)
//
// The offsets are consistent with three back-to-back 1024-entry pointer
// arrays; the definition of TensorListMetaData is not visible here, so
// treat that layout as an assumption to confirm.
//
// NOTE(review): i is advanced with 32-bit signed arithmetic, so it can
// wrap if n is close to INT_MAX -- confirm callers keep n well below that.
//-----------------------------------------------------------------------
.visible .entry _Z33multi_tensor_add_kernel_meta_dataP18TensorListMetaDatai(
    .param .u64 _Z33multi_tensor_add_kernel_meta_dataP18TensorListMetaDatai_param_0,
    .param .u32 _Z33multi_tensor_add_kernel_meta_dataP18TensorListMetaDatai_param_1
)
{
    .reg .pred %p_has_work, %p_more, %p_next;
    .reg .f32  %f_in0, %f_in1, %f_sum;
    .reg .b32  %r_n, %r_ntid, %r_nctaid, %r_ctaid, %r_tid;
    .reg .b32  %r_first, %r_step, %r_slot, %r_elem;
    .reg .b64  %rd_meta_gen, %rd_meta, %rd_slot_off, %rd_slot;
    .reg .b64  %rd_buf0_gen, %rd_buf1_gen, %rd_buf2_gen;
    .reg .b64  %rd_buf0, %rd_buf1, %rd_buf2;
    .reg .b64  %rd_elem_off, %rd_addr0, %rd_addr1, %rd_addr2;

    // ---- arguments -----------------------------------------------------
    ld.param.u32        %r_n, [_Z33multi_tensor_add_kernel_meta_dataP18TensorListMetaDatai_param_1];
    ld.param.u64        %rd_meta_gen, [_Z33multi_tensor_add_kernel_meta_dataP18TensorListMetaDatai_param_0];
    cvta.to.global.u64  %rd_meta, %rd_meta_gen;

    // ---- per-thread start index and stride -----------------------------
    mov.u32             %r_ntid, %ntid.x;
    mov.u32             %r_nctaid, %nctaid.x;
    mul.lo.s32          %r_step, %r_ntid, %r_nctaid;          // step = ntid.x * nctaid.x
    mov.u32             %r_ctaid, %ctaid.x;
    mov.u32             %r_tid, %tid.x;
    mad.lo.s32          %r_first, %r_ctaid, %r_ntid, %r_tid;  // first = ctaid.x * ntid.x + tid.x

    mov.u32             %r_slot, 0;

$L_slot_loop:
    // Threads whose first index is already >= n have nothing to do for
    // this slot, but still advance the slot counter below.
    setp.lt.s32         %p_has_work, %r_first, %r_n;
    @!%p_has_work bra   $L_next_slot;

    // Fetch this slot's three buffer pointers once, ahead of the
    // element loop.
    mul.wide.s32        %rd_slot_off, %r_slot, 8;
    add.s64             %rd_slot, %rd_meta, %rd_slot_off;
    ld.global.u64       %rd_buf0_gen, [%rd_slot];
    ld.global.u64       %rd_buf1_gen, [%rd_slot+8192];
    ld.global.u64       %rd_buf2_gen, [%rd_slot+16384];
    cvta.to.global.u64  %rd_buf0, %rd_buf0_gen;
    cvta.to.global.u64  %rd_buf1, %rd_buf1_gen;
    cvta.to.global.u64  %rd_buf2, %rd_buf2_gen;
    mov.u32             %r_elem, %r_first;

$L_elem_loop:
    mul.wide.s32        %rd_elem_off, %r_elem, 4;             // byte offset of f32 element
    add.s64             %rd_addr0, %rd_buf0, %rd_elem_off;
    add.s64             %rd_addr1, %rd_buf1, %rd_elem_off;
    add.s64             %rd_addr2, %rd_buf2, %rd_elem_off;

    ld.global.f32       %f_in1, [%rd_addr1];
    ld.global.f32       %f_in0, [%rd_addr0];
    add.ftz.f32         %f_sum, %f_in0, %f_in1;
    st.global.f32       [%rd_addr2], %f_sum;

    add.s32             %r_elem, %r_elem, %r_step;
    setp.lt.s32         %p_more, %r_elem, %r_n;
    @%p_more bra        $L_elem_loop;

$L_next_slot:
    add.s32             %r_slot, %r_slot, 1;
    setp.lt.u32         %p_next, %r_slot, 1024;               // fixed count of 1024 slots
    @%p_next bra        $L_slot_loop;

    ret;
}

0 commit comments

Comments
 (0)