@@ -51,7 +51,7 @@ function _theoretical_peakflops_gpu_cudacores(; device, dtype)
51
51
elseif dtype == Float64
52
52
max_peakflops *= 1
53
53
else
54
- throw (ArgumentError (" Unsupported dtype." ))
54
+ throw (ArgumentError (" Unsupported dtype $(dtype) ." ))
55
55
end
56
56
return max_peakflops
57
57
end
@@ -60,7 +60,9 @@ function _theoretical_peakflops_gpu_tensorcores(;
60
60
device= CUDA. device (), dtype= Float16, verbose= true
61
61
)
62
62
cap = CUDA. capability (device)
63
- if cap == v " 8.0.0"
63
+ if cap == v " 9.0.0"
64
+ devtype = :Hopper
65
+ elseif cap == v " 8.0.0"
64
66
devtype = :A100
65
67
elseif cap == v " 7.0.0"
66
68
devtype = :V100
@@ -70,10 +72,26 @@ function _theoretical_peakflops_gpu_tensorcores(;
70
72
max_clock_rate = CUDA. attribute (device, CUDA. CU_DEVICE_ATTRIBUTE_CLOCK_RATE) # in kHz
71
73
num_tensor_cores = ntensorcores (device)
72
74
max_peakflops = max_clock_rate * num_tensor_cores * 1e-9 # in TFLOP/s
73
- if devtype == :A100
75
+ if devtype == :Hopper
76
+ # matrix dimensions 8x8x4, factor 2 for nflops in A*B+C see
77
+ # * <https://resources.nvidia.com/en-us-tensor-core/gtc22-whitepaper-hopper> (figures 10-11)
78
+ # * <https://developer.nvidia.com/blog/nvidia-hopper-architecture-in-depth/> (figures 5-8)
74
79
if Symbol (dtype) == :Float16
75
- # matrix dimensions 8x8x4, factor 2 for nflops in A*B+C
76
- # see e.g. https://peerj.com/articles/cs-330.pdf
80
+ max_peakflops *= 2 * 16 * 8 * 4 # XXX : Wrong result!
81
+ elseif Symbol (dtype) in (:Float32 , :TensorFloat32 , :TF32 )
82
+ max_peakflops *= 2 * 8 * 8 * 4 # XXX : Wrong result!
83
+ elseif Symbol (dtype) == :Float64
84
+ max_peakflops *= 2 * 4 * 4 * 2
85
+ elseif Symbol (dtype) == :Int8
86
+ max_peakflops *= 2 * 2 * 32 * 8 * 4 # XXX : Wrong result!
87
+ else
88
+ throw (ArgumentError (" Unsupported dtype $(dtype) ." ))
89
+ end
90
+ elseif devtype == :A100
91
+ if Symbol (dtype) == :Float16
92
+ # matrix dimensions 8x8x4, factor 2 for nflops in A*B+C see
93
+ # e.g. <https://doi.org/10.7717/peerj-cs.330> or
94
+ # <https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/nvidia-ampere-architecture-whitepaper.pdf>
77
95
max_peakflops *= 2 * 8 * 8 * 4
78
96
elseif Symbol (dtype) in (:Float32 , :TensorFloat32 , :TF32 )
79
97
max_peakflops *= 2 * 4 * 8 * 4
@@ -82,13 +100,13 @@ function _theoretical_peakflops_gpu_tensorcores(;
82
100
elseif Symbol (dtype) == :Int8
83
101
max_peakflops *= 2 * 2 * 8 * 8 * 4
84
102
else
85
- throw (ArgumentError (" Unsupported dtype." ))
103
+ throw (ArgumentError (" Unsupported dtype $(dtype) ." ))
86
104
end
87
105
elseif devtype == :V100
88
106
if Symbol (dtype) == :Float16
89
107
max_peakflops *= 2 * 4 * 4 * 4
90
108
else
91
- throw (ArgumentError (" Unsupported dtype." ))
109
+ throw (ArgumentError (" Unsupported dtype $(dtype) ." ))
92
110
end
93
111
end
94
112
return max_peakflops
0 commit comments