forked from chelsea0x3b/cudarc
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconstant_memory.ptx
More file actions
60 lines (51 loc) · 1.34 KB
/
Copy pathconstant_memory.ptx
File metadata and controls
60 lines (51 loc) · 1.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-36424714
// Cuda compilation tools, release 13.0, V13.0.88
// Based on NVVM 7.0.1
//
// PTX ISA version this module is written against.
.version 9.0
// Target architecture the module is compiled for.
.target sm_75
// Addresses in this module are 64 bits wide.
.address_size 64
// .globl polynomial_kernel
// 16 bytes in the constant state space, 4-byte aligned. polynomial_kernel
// reads this array as four f32 values at byte offsets 0, 4, 8 and 12.
// No initializer is given here, so the contents are presumably written by
// the host before the kernel is launched -- confirm against the caller.
.const .align 4 .b8 coefficients[16];
//
// polynomial_kernel -- one element per thread, cubic polynomial evaluation.
//
// Parameters:
//   polynomial_kernel_param_0 (.u64)  base address the f32 result is stored to
//   polynomial_kernel_param_1 (.u64)  base address the f32 input is loaded from
//   polynomial_kernel_param_2 (.u32)  element count (compared as signed 32-bit)
//
// Each thread forms i = ctaid.x * ntid.x + tid.x. Threads with i >= count
// return without touching memory. Every other thread loads x = in[i] and,
// with c0..c3 being the four f32 values in `coefficients`, stores
//
//   out[i] = fma(x, x*(x*c3), fma(x, x*c2, fma(x, c1, c0)))
//
// Both pointers are converted with cvta.to.global before use, and the same
// byte offset (i * 4) is applied to each of them.
//
.visible .entry polynomial_kernel(
    .param .u64 polynomial_kernel_param_0,
    .param .u64 polynomial_kernel_param_1,
    .param .u32 polynomial_kernel_param_2
)
{
    .reg .pred  %past_end;
    .reg .b32   %blk, %blk_dim, %thr, %gidx, %count;
    .reg .b64   %dst_gen, %src_gen, %dst_base, %src_base;
    .reg .b64   %byte_off, %src_ptr, %dst_ptr;
    .reg .f32   %x, %c0, %c1, %c2, %c3;
    .reg .f32   %lin, %xc2, %quad, %xc3, %xxc3, %cubic;

    // Global element index for this thread.
    mov.u32             %blk, %ctaid.x;
    mov.u32             %blk_dim, %ntid.x;
    mov.u32             %thr, %tid.x;
    mad.lo.s32          %gidx, %blk, %blk_dim, %thr;

    // Guard: threads at or beyond the element count have nothing to do.
    ld.param.u32        %count, [polynomial_kernel_param_2];
    setp.ge.s32         %past_end, %gidx, %count;
    @%past_end ret;

    // Per-thread input and output addresses share one byte offset.
    ld.param.u64        %dst_gen, [polynomial_kernel_param_0];
    ld.param.u64        %src_gen, [polynomial_kernel_param_1];
    cvta.to.global.u64  %src_base, %src_gen;
    cvta.to.global.u64  %dst_base, %dst_gen;
    mul.wide.s32        %byte_off, %gidx, 4;        // sign-extended i * sizeof(f32)
    add.s64             %src_ptr, %src_base, %byte_off;
    add.s64             %dst_ptr, %dst_base, %byte_off;

    // Fetch the four coefficients and the input value.
    ld.const.f32        %c0, [coefficients];
    ld.const.f32        %c1, [coefficients+4];
    ld.const.f32        %c2, [coefficients+8];
    ld.const.f32        %c3, [coefficients+12];
    ld.global.f32       %x, [%src_ptr];

    // Accumulate term by term; operand order matches the generated code so
    // the rounding behaviour is unchanged.
    fma.rn.f32          %lin, %x, %c1, %c0;         // x*c1 + c0
    mul.f32             %xc2, %x, %c2;
    fma.rn.f32          %quad, %x, %xc2, %lin;      // x*(x*c2) + lin
    mul.f32             %xc3, %x, %c3;
    mul.f32             %xxc3, %x, %xc3;
    fma.rn.f32          %cubic, %x, %xxc3, %quad;   // x*(x*(x*c3)) + quad

    st.global.f32       [%dst_ptr], %cubic;
    ret;
}