
Commit e7f6439

v0.9 release (#44)
[Enhancement] Add ability to filter by shape of tensors in the errata filter.
[Enhancement] Add ability to set a feature vector on an opGraph manually.
[Enhancement] Add support for the CUDNN_POINTWISE_RECIPROCAL pointwise operation.
[Enhancement] Add an option to limit the number of kernels benchmarked in find-plan.
[Bug Fix] Fixed the "Scale Bias Conv BNGenstats" test case where the sum and square-sum channel dimensions were incorrect.
[Bug Fix] Fixed a compiler error ("dereferencing type-punned pointer will break strict-aliasing rules") seen with certain compilers while type-casting floating-point alpha/beta to int64_t.
[Bug Fix] Waived the "ConvScaleBiasAct_int8" sample for V100 because of lack of int8 support.
[Samples] Updated the Fused MHA sample to use plan caching.
[Samples] Added BF16/FP16/FP8 Flash Attention Fprop/Bprop samples.

Co-authored-by: Anerudhan Gopal <[email protected]>
Parent: 1e32f72

23 files changed: +4384 −188 lines

README.md

Lines changed: 3 additions & 0 deletions
```diff
@@ -84,6 +84,9 @@ Errata filter gives the cuDNN team an opportunity to block certain faulty kernel
     operation           : ""  - Mandatory. Stringified version of the operation graph.
     engine              : ""  - Mandatory. Stringified version of the engine ID.
     knob                : ""  - Optional. Stringified version of the knob. If specified only the engineConfig for the engine matching the knobs will be blocked. Else, all possible combination of knobs for the engine will be blocked.
+    input_shape         : []  - Optional. Array of input shape for kernel (ex. [64, 32, 128, 128]) to be filtered out. Use -1 if you don't want to filter that dimension. (ex. [-1, -1, 128, 128] to only filter HxW for NCHW format)
+    filter_shape        : []  - Optional. Array of kernel/filter shape for kernel (ex. [32, 32, 5, 5]) to be filtered out. Use -1 if you don't want to filter that dimension. (ex. [-1, -1, 5, 5] to only filter 5x5 filter sizes)
+    shape_format        : ""  - Mandatory if input_shape and/or filter_shape is present. Optional otherwise. Shape format of tensors as a string. (Ex. "NCHW", "NHWC").
     cudnn_version_start : 0   - Optional. Denotes the cudnn version after which the engine started having issues.
     cudnn_version_end   : -1  - Optional. Denotes the cudnn_version when the issue was fixed. "-1" denotes its an ongoing issue.
     arch                : ""  - Optional. Architectures where this kernel might be faulty.
```

include/cudnn_frontend_Errata.h

Lines changed: 184 additions & 1 deletion
```diff
@@ -47,9 +47,81 @@ load_from_config(json &json_handle, const std::string & errata_json) {
     return true;
 }
 
+/**
+ * @brief Checks the shape of an operation against the errata filter's blocked shape for kernel blocking
+ *
+ * @param op The operation's tensors to check
+ * @param shape_format The shape format of the tensor (NCHW vs NHWC)
+ * @param tensor_attr The cudnnBackendAttributeName_t of the tensor's shape we want to check
+ * @param blocked_shape The shape we want to filter out; -1 entries are wildcards
+ * @return true The passed-in operation shape matches the blocked shape
+ * @return false The passed-in operation shape does not match the blocked shape
+ */
+static bool
+check_shape(cudnnBackendDescriptor_t &op,
+            const std::string &shape_format,
+            cudnnBackendAttributeName_t tensor_attr,
+            const std::vector<int64_t> &blocked_shape) {
+
+    // Get backend descriptor to individual tensor to be able to get shape
+    ManagedOpaqueDescriptor tensor = make_shared_backend_pointer(CUDNN_BACKEND_TENSOR_DESCRIPTOR);
+    cudnnBackendDescriptor_t tensor_ = tensor->get_backend_descriptor();
+    int64_t count = 0;
+    cudnnStatus_t status = cudnnBackendGetAttribute(op,
+                                                    tensor_attr,
+                                                    CUDNN_TYPE_BACKEND_DESCRIPTOR,
+                                                    1,
+                                                    &count,
+                                                    &tensor_);
+    if (status != CUDNN_STATUS_SUCCESS) {
+#ifndef NV_CUDNN_DISABLE_EXCEPTION
+        throw cudnnException(
+            std::string("Error getting attribute. cudnn_status: " + to_string(status)).c_str(), status);
+#endif
+    }
+
+    // Get tensor dims
+    std::array<int64_t, 5> tensor_dims;
+    status = cudnnBackendGetAttribute(tensor_,
+                                      CUDNN_ATTR_TENSOR_DIMENSIONS,
+                                      CUDNN_TYPE_INT64,
+                                      5,
+                                      &count,
+                                      tensor_dims.data());
+    if (status != CUDNN_STATUS_SUCCESS) {
+#ifndef NV_CUDNN_DISABLE_EXCEPTION
+        throw cudnnException(
+            std::string("Error getting attribute. cudnn_status: " + to_string(status)).c_str(), status);
+#endif
+    }
+    // tensor_dims is 1 indexed
+    int64_t first_dim = tensor_dims[1];  // batch size for input/output tensor, output channels for filter tensor
+    int64_t blocked_first_dim = blocked_shape[0];
+
+    // Defaults to true because -1 is a wildcard (don't filter on that dimension). If a later dimension blocks, the comparison will still be correct.
+    bool blocked = (blocked_first_dim != -1) ? (first_dim == blocked_first_dim) : true;
+
+    // Check the shape format to extract the right dimension. Filter shape will always be "NCHW" for convenience.
+    int64_t channels = (shape_format == "NCHW") ? tensor_dims[2] : tensor_dims[4];  // channels
+    int64_t blocked_channels = (shape_format == "NCHW") ? blocked_shape[1] : blocked_shape[3];
+    blocked = (blocked_channels != -1) ? (blocked && channels == blocked_channels) : true;
+
+    int64_t height = (shape_format == "NCHW") ? tensor_dims[3] : tensor_dims[2];
+    int64_t blocked_height = (shape_format == "NCHW") ? blocked_shape[2] : blocked_shape[1];
+    blocked = (blocked_height != -1) ? (blocked && height == blocked_height) : true;
+
+    int64_t width = (shape_format == "NCHW") ? tensor_dims[4] : tensor_dims[3];
+    int64_t blocked_width = (shape_format == "NCHW") ? blocked_shape[3] : blocked_shape[2];
+    blocked = (blocked_width != -1) ? (blocked && width == blocked_width) : true;
+
+    return blocked;
+}
+
 template <typename T>
 static bool
-check_rule(const json &json_handle, const std::string & executionPlanTag,
+check_rule(const json &json_handle, const std::string &executionPlanTag,
           cudnnHandle_t handle, T fn) {
    std::string operation = json_handle["operation"];
    int64_t engine = json_handle["engine"];
```
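The `check_shape()` helper above treats `-1` entries in the blocked shape as wildcards. A minimal standalone sketch of that matching idea follows (not the library code itself, which reads the dimensions through the backend API and folds the result into `blocked`):

```cpp
#include <array>
#include <cstdint>
#include <vector>

// Simplified illustration of the wildcard matching performed by check_shape():
// a -1 entry in the 4-entry blocked shape matches any value in that dimension.
static bool matches_blocked_shape(const std::array<int64_t, 4> &dims,
                                  const std::vector<int64_t> &blocked_shape) {
    for (size_t i = 0; i < dims.size(); ++i) {
        if (blocked_shape[i] != -1 && dims[i] != blocked_shape[i]) {
            return false;  // a concrete entry disagrees, so this rule does not block
        }
    }
    return true;  // every non-wildcard entry matched
}

// matches_blocked_shape({64, 32, 128, 128}, {-1, -1, 128, 128}) -> true  (blocks any N, C at 128x128)
// matches_blocked_shape({64, 32, 256, 256}, {-1, -1, 128, 128}) -> false
```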
```diff
@@ -75,7 +147,98 @@ check_rule(const json &json_handle, const std::string & executionPlanTag,
                 (executionPlanTag.find(kv) != std::string::npos);
         }
     }
+    blocked = blocked && fn();
+    return blocked;
+
+    CUDNN_FRONTEND_UNUSED(handle);
+}
+
+// Overload for check_rule to take in an operation graph for shape filtering
+template <typename T>
+static bool
+check_rule(const json &json_handle, const std::string &executionPlanTag,
+           cudnnHandle_t handle, T fn, const OperationGraph &opGraph) {
+    std::string operation = json_handle["operation"];
+    int64_t engine = json_handle["engine"];
+    uint64_t cudnn_start = 0;
+    uint64_t cudnn_end = std::numeric_limits<uint64_t>::max();
+    if (json_handle.contains("cudnn_version_start")) {
+        cudnn_start = json_handle["cudnn_version_start"];
+    }
+    if (json_handle.contains("cudnn_version_end")) {
+        cudnn_end = json_handle["cudnn_version_end"];
+    }
+    std::string tag_prefix = operation + "_eng" + std::to_string(engine) + "_";
+    std::string mod_tag = executionPlanTag + "_";
+    bool blocked =
+        tag_prefix.size() <= mod_tag.size() &&
+        std::equal(tag_prefix.begin(), tag_prefix.end(), mod_tag.begin()) &&
+        CUDNN_VERSION >= cudnn_start &&
+        CUDNN_VERSION < cudnn_end;
+
+    if (blocked && json_handle.contains("knob")) {  // Short circuit if operation and engine do not match
+        for (auto &kv : json_handle["knob"]) {
+            blocked = blocked &&
+                (executionPlanTag.find(kv) != std::string::npos);
+        }
+    }
+
+    if (blocked && json_handle.contains("input_shape")) {  // Check if user wants to block kernel for specific input shape
+        if (!json_handle.contains("shape_format")) {
+            std::string message = "ERROR: Please set a shape format (e.g. shape_format: \"NCHW\") for errata filters using input/kernel shape";
+#ifndef NV_CUDNN_DISABLE_EXCEPTION
+            throw cudnnException(message.c_str(), CUDNN_STATUS_BAD_PARAM);
+#endif
+            getLogger() << message << std::endl;
+            return blocked;
+        }
+
+        std::array<ManagedOpaqueDescriptor, MAX_OPGRAPH_OPS> ops = opGraph.getOps();
+        std::array<cudnnBackendDescriptor_t, MAX_OPGRAPH_OPS> ops_;
+        for (unsigned int i = 0; i < opGraph.getOpCount(); i++) {
+            ops_[i] = ops[i]->get_backend_descriptor();
+        }
+
+        std::string shape_format = json_handle["shape_format"];
+        std::vector<int64_t> blocked_shape = json_handle["input_shape"];
+
+        // Forward conv operation
+        if (operation == "ConvFwd") {
+            blocked = blocked && check_shape(ops_[0], shape_format, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_X, blocked_shape);
+
+        // Operation is conv wgrad
+        } else if (operation == "ConvBwdFilter") {
+            blocked = blocked && check_shape(ops_[0], shape_format, CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_X, blocked_shape);
+
+        // Operation is conv dgrad
+        } else if (operation == "ConvBwdData") {
+            blocked = blocked && check_shape(ops_[0], shape_format, CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DX, blocked_shape);
+        }
+    }
+
+    if (blocked && json_handle.contains("filter_shape")) {  // Check if user wants to block kernel for specific filter shape
+        std::array<ManagedOpaqueDescriptor, 50> ops = opGraph.getOps();
+        std::array<cudnnBackendDescriptor_t, 50> ops_;
+        for (unsigned int i = 0; i < opGraph.getOpCount(); i++) {
+            ops_[i] = ops[i]->get_backend_descriptor();
+        }
+
+        std::vector<int64_t> blocked_shape = json_handle["filter_shape"];
+
+        // Forward conv operation
+        if (operation == "ConvFwd") {
+            // Filter format is always [output channels, input channels, height, width], so we hardcode "NCHW" to match and not repeat code
+            blocked = blocked && check_shape(ops_[0], "NCHW", CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_W, blocked_shape);
+
+        // Operation is conv wgrad
+        } else if (operation == "ConvBwdFilter") {
+            blocked = blocked && check_shape(ops_[0], "NCHW", CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DW, blocked_shape);
+
+        // Operation is conv dgrad
+        } else if (operation == "ConvBwdData") {
+            blocked = blocked && check_shape(ops_[0], "NCHW", CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_W, blocked_shape);
+        }
+    }
 
     blocked = blocked && fn();
     return blocked;
```

```diff
@@ -102,4 +265,24 @@ check_errata(const json &json_handle, const std::string & executionPlanTag,
     return false;
 }
 
+// Overload. Takes an initialized json handle, an execution plan tag, and an operation graph, and checks whether the
+// conditions for running it are satisfied. Returns true if the given executionPlanTag + operation graph
+// is faulty.
+template <typename T>
+static bool
+check_errata(const json &json_handle, const std::string & executionPlanTag,
+             cudnnHandle_t handle, const OperationGraph &opGraph, T fn) {
+
+    cudnn_frontend::getLogger() << "[cudnn_frontend] " << "Verifying " << executionPlanTag;
+    for (auto const &rule : json_handle["rules"]) {
+        if (check_rule<T>(rule, executionPlanTag, handle, fn, opGraph)) {
+            cudnn_frontend::getLogger() << ". Blocking." << std::endl;
+            return true;
+        }
+    }
+
+    cudnn_frontend::getLogger() << ". Passed." << std::endl;
+    return false;
+}
+
 }
```
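A hedged sketch of calling the new `check_errata` overload directly; in practice the frontend's plan-selection helpers invoke it. The helper name `plan_is_blocked` and the tag format are illustrative; the errata JSON, cuDNN handle, and built operation graph are assumed to come from earlier setup.

```cpp
#include <cudnn_frontend.h>
#include <nlohmann/json.hpp>
#include <string>

// Sketch: returns true if the errata JSON blocks the given execution plan tag for this
// operation graph. The extra predicate returns true so the decision rests on the JSON rules.
bool plan_is_blocked(const nlohmann::json &errata,
                     const std::string &plan_tag,   // e.g. a "<op>_eng<id>_<knobs>" style tag
                     cudnnHandle_t handle,
                     const cudnn_frontend::OperationGraph &op_graph) {
    auto allow_all = []() { return true; };
    return cudnn_frontend::check_errata(errata, plan_tag, handle, op_graph, allow_all);
}
```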

include/cudnn_frontend_MatMulDesc.h

Lines changed: 28 additions & 0 deletions
```diff
@@ -68,6 +68,8 @@ class MatMulDesc_v8 : public BackendDescriptor {
     operator=(MatMulDesc_v8 const &) = delete;
 
     cudnnDataType_t compute_type = CUDNN_DATA_FLOAT;
+    bool isPadded = false;
+    double paddingValue = 0.0;
 };
 
 ////
@@ -93,6 +95,14 @@ class MatMulDescBuilder_v8 {
         return setComputeType(data_type_);
     }
 
+    //! Set padding value for matmul descriptor
+    auto
+    setPaddingValue(double paddingValue) -> MatMulDescBuilder_v8 & {
+        m_matMulDesc.isPadded = true;
+        m_matMulDesc.paddingValue = paddingValue;
+        return *this;
+    }
+
     //! constructs the MatMulDesc_v8 by calling the cudnn API
     //! Throws the appropriate error message
     MatMulDesc_v8 &&
@@ -118,6 +128,24 @@ class MatMulDescBuilder_v8 {
             return std::move(m_matMulDesc);
         }
 
+#if (CUDNN_VERSION >= 8900)
+        // Setting padding value if matmul desc is padded
+        if (m_matMulDesc.isPadded) {
+            status = cudnnBackendSetAttribute(m_matMulDesc.pointer->get_backend_descriptor(),
+                                              CUDNN_ATTR_MATMUL_PADDING_VALUE,
+                                              CUDNN_TYPE_DOUBLE,
+                                              1,
+                                              &m_matMulDesc.paddingValue);
+            if (status != CUDNN_STATUS_SUCCESS) {
+                set_error_and_throw_exception(
+                    &m_matMulDesc,
+                    status,
+                    "CUDNN_BACKEND_MATMUL_DESCRIPTOR: SetAttribute CUDNN_ATTR_MATMUL_PADDING_VALUE Failed");
+                return std::move(m_matMulDesc);
+            }
+        }
+#endif
+
         // Finalizing the descriptor
         status = cudnnBackendFinalize(m_matMulDesc.pointer->get_backend_descriptor());
         if (status != CUDNN_STATUS_SUCCESS) {
```
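A minimal sketch of setting the new padding value through the builder (cuDNN >= 8.9), assuming the usual `MatMulDescBuilder` alias for `MatMulDescBuilder_v8`; the compute type and padding value are illustrative.

```cpp
#include <cudnn_frontend.h>
#include <iostream>

// Sketch: build a matmul descriptor that carries a padding value (cuDNN >= 8.9).
void build_padded_matmul_desc() {
#if (CUDNN_VERSION >= 8900)
    auto matmul_desc = cudnn_frontend::MatMulDescBuilder()
                           .setComputeType(CUDNN_DATA_FLOAT)
                           .setPaddingValue(0.0)  // value assumed for padded regions of the operands
                           .build();
    std::cout << matmul_desc.describe() << std::endl;
#endif
}
```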

include/cudnn_frontend_Operation.h

Lines changed: 16 additions & 4 deletions
```diff
@@ -564,12 +564,18 @@ class OperationBuilder_v8 {
                 m_operation.operationTag = "Identity";
                 break;
 #endif
+#if (CUDNN_VERSION >= 8900)
+            case CUDNN_POINTWISE_RECIPROCAL:
+                m_operation.operationTag = "Reciprocal";
+                break;
+#endif
 #ifndef NO_DEFAULT_IN_SWITCH
-            default:
+            default:
                 m_operation.operationTag = "UNKNOWN_POINTWISE_OPERATION";
                 break;
 #endif
         }
+
 
         status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(),
                                           CUDNN_ATTR_OPERATION_POINTWISE_PW_DESCRIPTOR,
@@ -2018,8 +2024,11 @@ class OperationBuilder_v8 {
             m_operation.feature_vector.push_back(yTensor_strA[i]);  // n, c, (g), d, h, w
         }
 
-        int64_t alpha_as_int = *reinterpret_cast<int64_t *>(&m_operation.alpha_d);
-        int64_t beta_as_int = *reinterpret_cast<int64_t *>(&m_operation.beta_d);
+        int64_t alpha_as_int;
+        int64_t beta_as_int;
+        std::memcpy((void *)&alpha_as_int, (void *)(&m_operation.alpha_s), sizeof(int64_t));
+        std::memcpy((void *)&beta_as_int, (void *)(&m_operation.beta_s), sizeof(int64_t));
+
 
         m_operation.feature_vector.push_back(alpha_as_int);
         m_operation.feature_vector.push_back(beta_as_int);
@@ -2729,6 +2738,9 @@ class OperationBuilder_v8 {
 #endif
 #if (CUDNN_VERSION >= 8500)
             (m_operation.pointwise_mode == CUDNN_POINTWISE_ERF) ||
+#endif
+#if (CUDNN_VERSION >= 8900)
+            (m_operation.pointwise_mode == CUDNN_POINTWISE_RECIPROCAL) ||
 #endif
             (m_operation.pointwise_mode == CUDNN_POINTWISE_MIN) ||
             (m_operation.pointwise_mode == CUDNN_POINTWISE_MAX) ||
@@ -2758,7 +2770,7 @@
             (m_operation.pointwise_mode == CUDNN_POINTWISE_GELU_BWD) ||
 #if (CUDNN_VERSION >= 8500)
             (m_operation.pointwise_mode == CUDNN_POINTWISE_GELU_APPROX_TANH_BWD) ||
-#endif
+#endif
             (m_operation.pointwise_mode == CUDNN_POINTWISE_SOFTPLUS_BWD) ||
             (m_operation.pointwise_mode == CUDNN_POINTWISE_SWISH_BWD));
```
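The second hunk above replaces a type-punned `reinterpret_cast` with `std::memcpy`, which is the well-defined way to reinterpret a double's bits as an integer and avoids the strict-aliasing warning the commit message mentions. A standalone illustration:

```cpp
#include <cstdint>
#include <cstring>

// Reading a double through an int64_t* breaks strict aliasing; copying the bytes with
// std::memcpy has defined behavior and typically compiles to the same single move.
static int64_t bits_of(double value) {
    static_assert(sizeof(double) == sizeof(int64_t), "assumes a 64-bit double");
    int64_t bits = 0;
    std::memcpy(&bits, &value, sizeof(bits));
    return bits;
}
```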

include/cudnn_frontend_OperationGraph.h

Lines changed: 17 additions & 4 deletions
```diff
@@ -36,6 +36,9 @@
 #include "cudnn_frontend_Operation.h"
 #include "cudnn_frontend_utils.h"
 
+// Compile-time constant for the max number of ops in an op graph
+constexpr int64_t MAX_OPGRAPH_OPS = 50;
+
 namespace cudnn_frontend {
 
 ///
@@ -100,14 +103,24 @@ class OperationGraph_v8 : public BackendDescriptor {
         return opGraphTag;
     }
 
+    bool
+    setFeatureVector(feature_vector_t fv) {
+        feature_vectors.push_back(fv);
+        return true;
+    }
+
     feature_vector_t
     getFeatureVector() const {
-        if (numOps != 1) {
-            return {}; /// We do not support multiop opGraph at this point of time.
-        } else {
+        if (feature_vectors.size() != 0) {
             return feature_vectors[0];
+        } else {
+            return {};
         }
+    }
 
+    const std::array<ManagedOpaqueDescriptor, MAX_OPGRAPH_OPS> &
+    getOps() const {
+        return ops;
     }
 
     private:
@@ -117,7 +130,7 @@ class OperationGraph_v8 : public BackendDescriptor {
     operator=(OperationGraph_v8 const &) = delete;
 
     cudnnHandle_t handle = nullptr;
-    std::array<ManagedOpaqueDescriptor, 50> ops{};
+    std::array<ManagedOpaqueDescriptor, MAX_OPGRAPH_OPS> ops{};
     int64_t numOps = -1;
     std::string opGraphTag = "";
     std::vector<feature_vector_t> feature_vectors;
```
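A hedged sketch of supplying a feature vector by hand through the new `setFeatureVector()`, which `getFeatureVector()` then returns. It assumes `feature_vector_t` is the frontend's vector-of-int64 alias and that `op_graph` was already built; the values are illustrative.

```cpp
#include <cudnn_frontend.h>

// Sketch: attach a manually built feature vector to an operation graph so that
// getFeatureVector() (and anything keyed on it, such as plan caching) can use it.
// Real code would pack dims, strides, and alpha/beta bits as Operation.h does.
void tag_graph(cudnn_frontend::OperationGraph &op_graph) {
    cudnn_frontend::feature_vector_t fv = {1, 64, 56, 56, /* ... */ 0};
    op_graph.setFeatureVector(fv);
    auto stored = op_graph.getFeatureVector();  // now returns the vector set above
    (void)stored;
}
```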

include/cudnn_frontend_PointWiseDesc.h

Lines changed: 5 additions & 1 deletion
```diff
@@ -131,8 +131,12 @@ class PointWiseDesc_v8 : public BackendDescriptor {
             case CUDNN_POINTWISE_BINARY_SELECT:
                 return 4;
 #endif
+#if (CUDNN_VERSION >= 8900)
+            case CUDNN_POINTWISE_RECIPROCAL:
+                return 2;
+#endif
 #ifndef NO_DEFAULT_IN_SWITCH
-            default:
+            default:
                 return -1;
 #endif
         }
```
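A minimal sketch of creating a descriptor for the new reciprocal mode (cuDNN >= 8.9), which the updated port-count switch above reports as a two-port op (one input, one output). Builder and setter names follow the existing pointwise builder pattern and should be checked against your frontend version.

```cpp
#include <cudnn_frontend.h>
#include <iostream>

// Sketch: a pointwise descriptor in the new reciprocal mode (y = 1/x), cuDNN >= 8.9.
void build_reciprocal_desc() {
#if (CUDNN_VERSION >= 8900)
    auto pw_desc = cudnn_frontend::PointWiseDescBuilder()
                       .setMode(CUDNN_POINTWISE_RECIPROCAL)
                       .setComputeType(CUDNN_DATA_FLOAT)  // assumed setter; older versions call this setMathPrecision
                       .build();
    std::cout << pw_desc.describe() << std::endl;
#endif
}
```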
