
Commit e7f6439

v0.9 release (#44)
[Enhancement] Add ability to filter by shape of tensors in the errata filter.
[Enhancement] Add ability to set a feature vector on an opGraph manually.
[Enhancement] Add support for the CUDNN_POINTWISE_RECIPROCAL pointwise operation.
[Enhancement] Add an option to limit the number of kernels benchmarked in find-plan.
[Bug Fix] Fixed the "Scale Bias Conv BNGenstats" test case where the sum and square-sum channel dimensions were incorrect.
[Bug Fix] Fixed a compiler error ("dereferencing type-punned pointer will break strict-aliasing rules") seen with certain compilers while type-casting floating-point alpha/beta to int64_t.
[Bug Fix] Waived the "ConvScaleBiasAct_int8" sample for V100 because of lack of int8 support.
[Samples] Updated the Fused MHA sample to use plan caching.
[Samples] Added BF16/FP16/FP8 Flash Attention Fprop/Bprop samples.

Co-authored-by: Anerudhan Gopal <[email protected]>
Parent: 1e32f72

23 files changed: +4384 −188 lines

README.md

Lines changed: 3 additions & 0 deletions
```diff
@@ -84,6 +84,9 @@ Errata filter gives the cuDNN team an opportunity to block certain faulty kernel
     operation           : ""  - Mandatory. Stringified version of the operation graph.
     engine              : ""  - Mandatory. Stringified version of the engine ID.
     knob                : ""  - Optional. Stringified version of the knob. If specified only the engineConfig for the engine matching the knobs will be blocked. Else, all possible combination of knobs for the engine will be blocked.
+    input_shape         : []  - Optional. Array of input shape for kernel (ex. [64, 32, 128, 128]) to be filtered out. Use -1 if you don't want to filter that dimension. (ex. [-1, -1, 128, 128] to only filter HxW for NCHW format)
+    filter_shape        : []  - Optional. Array of kernel/filter shape for kernel (ex. [32, 32, 5, 5]) to be filtered out. Use -1 if you don't want to filter that dimension. (ex. [-1, -1, 5, 5] to only filter 5x5 filter sizes)
+    shape_format        : ""  - Mandatory if input_shape and/or filter_shape is present. Optional otherwise. Shape format of tensors as a string. (Ex. "NCHW", "NHWC").
     cudnn_version_start : 0   - Optional. Denotes the cudnn version after which the engine started having issues.
     cudnn_version_end   : -1  - Optional. Denotes the cudnn_version when the issue was fixed. "-1" denotes its an ongoing issue.
     arch                : ""  - Optional. Architectures where this kernel might be faulty.
```

include/cudnn_frontend_Errata.h

Lines changed: 184 additions & 1 deletion
```diff
@@ -47,9 +47,81 @@ load_from_config(json &json_handle, const std::string & errata_json) {
     return true;
 }
 
+/**
+ * @brief Checks the shape of an operation against the errata filter's blocked shape for kernel blocking
+ *
+ * @param op The operation's tensors to check
+ * @param shape_format The shape format of the tensor (NCHW vs NHWC)
+ * @param tensor_attr The cudnnBackendAttributeName_t of the tensor's shape we want to check
+ * @param blocked_shape The shape we want to filter out; -1 entries are wildcards
+ * @return true The passed-in operation shape matches the blocked shape
+ * @return false The passed-in operation shape does not match the blocked shape
+ */
+static bool
+check_shape(cudnnBackendDescriptor_t &op,
+            const std::string &shape_format,
+            cudnnBackendAttributeName_t tensor_attr,
+            const std::vector<int64_t> &blocked_shape) {
+
+    // Get backend descriptor to individual tensor to be able to get shape
+    ManagedOpaqueDescriptor tensor = make_shared_backend_pointer(CUDNN_BACKEND_TENSOR_DESCRIPTOR);
+    cudnnBackendDescriptor_t tensor_ = tensor->get_backend_descriptor();
+    int64_t count = 0;
+    cudnnStatus_t status = cudnnBackendGetAttribute(op,
+                                                    tensor_attr,
+                                                    CUDNN_TYPE_BACKEND_DESCRIPTOR,
+                                                    1,
+                                                    &count,
+                                                    &tensor_);
+    if (status != CUDNN_STATUS_SUCCESS) {
+#ifndef NV_CUDNN_DISABLE_EXCEPTION
+        throw cudnnException(
+            std::string("Error getting attribute. cudnn_status: " + to_string(status)).c_str(), status);
+#endif
+    }
+
+    // Get tensor dims
+    std::array<int64_t, 5> tensor_dims;
+    status = cudnnBackendGetAttribute(tensor_,
+                                      CUDNN_ATTR_TENSOR_DIMENSIONS,
+                                      CUDNN_TYPE_INT64,
+                                      5,
+                                      &count,
+                                      tensor_dims.data());
+    if (status != CUDNN_STATUS_SUCCESS) {
+#ifndef NV_CUDNN_DISABLE_EXCEPTION
+        throw cudnnException(
+            std::string("Error getting attribute. cudnn_status: " + to_string(status)).c_str(), status);
+#endif
+    }
+    // tensor_dims is 1 indexed
+    int64_t first_dim = tensor_dims[1];  // batch size for input/output tensor, output channels for filter tensor
+    int64_t blocked_first_dim = blocked_shape[0];
+
+    // Defaults to true because -1 is a wildcard (don't filter on that dimension). If a later dimension blocks, the comparison will still be correct.
+    bool blocked = (blocked_first_dim != -1) ? (first_dim == blocked_first_dim) : true;
+
+    // Check the shape format to extract the right dimension. Filter shape will always be "NCHW" for convenience.
+    int64_t channels = (shape_format == "NCHW") ? tensor_dims[2] : tensor_dims[4];  // channels
+    int64_t blocked_channels = (shape_format == "NCHW") ? blocked_shape[1] : blocked_shape[3];
+    blocked = (blocked_channels != -1) ? (blocked && channels == blocked_channels) : true;
+
+    int64_t height = (shape_format == "NCHW") ? tensor_dims[3] : tensor_dims[2];
+    int64_t blocked_height = (shape_format == "NCHW") ? blocked_shape[2] : blocked_shape[1];
+    blocked = (blocked_height != -1) ? (blocked && height == blocked_height) : true;
+
+    int64_t width = (shape_format == "NCHW") ? tensor_dims[4] : tensor_dims[3];
+    int64_t blocked_width = (shape_format == "NCHW") ? blocked_shape[3] : blocked_shape[2];
+    blocked = (blocked_width != -1) ? (blocked && width == blocked_width) : true;
+
+    return blocked;
+}
+
 template <typename T>
 static bool
-check_rule(const json &json_handle, const std::string & executionPlanTag,
+check_rule(const json &json_handle, const std::string &executionPlanTag,
           cudnnHandle_t handle, T fn) {
    std::string operation = json_handle["operation"];
    int64_t engine = json_handle["engine"];
```
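The `check_shape()` helper above treats `-1` entries in the blocked shape as wildcards. A minimal standalone sketch of that matching idea follows (not the library code itself, which reads the dimensions through the backend API and folds the result into `blocked`):

```cpp
#include <array>
#include <cstdint>
#include <vector>

// Simplified illustration of the wildcard matching performed by check_shape():
// a -1 entry in the 4-entry blocked shape matches any value in that dimension.
static bool matches_blocked_shape(const std::array<int64_t, 4> &dims,
                                  const std::vector<int64_t> &blocked_shape) {
    for (size_t i = 0; i < dims.size(); ++i) {
        if (blocked_shape[i] != -1 && dims[i] != blocked_shape[i]) {
            return false;  // a concrete entry disagrees, so this rule does not block
        }
    }
    return true;  // every non-wildcard entry matched
}

// matches_blocked_shape({64, 32, 128, 128}, {-1, -1, 128, 128}) -> true  (blocks any N, C at 128x128)
// matches_blocked_shape({64, 32, 256, 256}, {-1, -1, 128, 128}) -> false
```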
```diff
@@ -75,7 +147,98 @@ check_rule(const json &json_handle, const std::string & executionPlanTag,
                 (executionPlanTag.find(kv) != std::string::npos);
         }
     }
+    blocked = blocked && fn();
+    return blocked;
+
+    CUDNN_FRONTEND_UNUSED(handle);
+}
+
+// Overload for check_rule to take in an operation graph for shape filtering
+template <typename T>
+static bool
+check_rule(const json &json_handle, const std::string &executionPlanTag,
+           cudnnHandle_t handle, T fn, const OperationGraph &opGraph) {
+    std::string operation = json_handle["operation"];
+    int64_t engine = json_handle["engine"];
+    uint64_t cudnn_start = 0;
+    uint64_t cudnn_end = std::numeric_limits<uint64_t>::max();
+    if (json_handle.contains("cudnn_version_start")) {
+        cudnn_start = json_handle["cudnn_version_start"];
+    }
+    if (json_handle.contains("cudnn_version_end")) {
+        cudnn_end = json_handle["cudnn_version_end"];
+    }
+    std::string tag_prefix = operation + "_eng" + std::to_string(engine) + "_";
+    std::string mod_tag = executionPlanTag + "_";
+    bool blocked =
+        tag_prefix.size() <= mod_tag.size() &&
+        std::equal(tag_prefix.begin(), tag_prefix.end(), mod_tag.begin()) &&
+        CUDNN_VERSION >= cudnn_start &&
+        CUDNN_VERSION < cudnn_end;
+
+    if (blocked && json_handle.contains("knob")) {  // Short circuit if operation and engine do not match
+        for (auto &kv : json_handle["knob"]) {
+            blocked = blocked &&
+                (executionPlanTag.find(kv) != std::string::npos);
+        }
+    }
+
+    if (blocked && json_handle.contains("input_shape")) {  // Check if user wants to block kernel for specific input shape
+        if (!json_handle.contains("shape_format")) {
+            std::string message = "ERROR: Please set a shape format (e.g. shape_format: \"NCHW\") for errata filters using input/kernel shape";
+#ifndef NV_CUDNN_DISABLE_EXCEPTION
+            throw cudnnException(message.c_str(), CUDNN_STATUS_BAD_PARAM);
+#endif
+            getLogger() << message << std::endl;
+            return blocked;
+        }
+
+        std::array<ManagedOpaqueDescriptor, MAX_OPGRAPH_OPS> ops = opGraph.getOps();
+        std::array<cudnnBackendDescriptor_t, MAX_OPGRAPH_OPS> ops_;
+        for (unsigned int i = 0; i < opGraph.getOpCount(); i++) {
+            ops_[i] = ops[i]->get_backend_descriptor();
+        }
+
+        std::string shape_format = json_handle["shape_format"];
+        std::vector<int64_t> blocked_shape = json_handle["input_shape"];
+
+        // Forward conv operation
+        if (operation == "ConvFwd") {
+            blocked = blocked && check_shape(ops_[0], shape_format, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_X, blocked_shape);
+
+        // Operation is conv wgrad
+        } else if (operation == "ConvBwdFilter") {
+            blocked = blocked && check_shape(ops_[0], shape_format, CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_X, blocked_shape);
+
+        // Operation is conv dgrad
+        } else if (operation == "ConvBwdData") {
+            blocked = blocked && check_shape(ops_[0], shape_format, CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DX, blocked_shape);
+        }
+    }
+
+    if (blocked && json_handle.contains("filter_shape")) {  // Check if user wants to block kernel for specific filter shape
+        std::array<ManagedOpaqueDescriptor, 50> ops = opGraph.getOps();
+        std::array<cudnnBackendDescriptor_t, 50> ops_;
+        for (unsigned int i = 0; i < opGraph.getOpCount(); i++) {
+            ops_[i] = ops[i]->get_backend_descriptor();
+        }
+
+        std::vector<int64_t> blocked_shape = json_handle["filter_shape"];
+
+        // Forward conv operation
+        if (operation == "ConvFwd") {
+            // Filter format is always [output channels, input channels, height, width], so we hardcode "NCHW" to match and not repeat code
+            blocked = blocked && check_shape(ops_[0], "NCHW", CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_W, blocked_shape);
+
+        // Operation is conv wgrad
+        } else if (operation == "ConvBwdFilter") {
+            blocked = blocked && check_shape(ops_[0], "NCHW", CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DW, blocked_shape);
+
+        // Operation is conv dgrad
+        } else if (operation == "ConvBwdData") {
+            blocked = blocked && check_shape(ops_[0], "NCHW", CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_W, blocked_shape);
+        }
+    }
 
     blocked = blocked && fn();
     return blocked;
```

```diff
@@ -102,4 +265,24 @@ check_errata(const json &json_handle, const std::string & executionPlanTag,
     return false;
 }
 
+// Overload. Takes an initialized json handle, an execution plan tag, and an operation graph, and checks whether the
+// conditions for running it are satisfied. Returns true if the given executionPlanTag + operation graph
+// is faulty.
+template <typename T>
+static bool
+check_errata(const json &json_handle, const std::string & executionPlanTag,
+             cudnnHandle_t handle, const OperationGraph &opGraph, T fn) {
+
+    cudnn_frontend::getLogger() << "[cudnn_frontend] " << "Verifying " << executionPlanTag;
+    for (auto const &rule : json_handle["rules"]) {
+        if (check_rule<T>(rule, executionPlanTag, handle, fn, opGraph)) {
+            cudnn_frontend::getLogger() << ". Blocking." << std::endl;
+            return true;
+        }
+    }
+
+    cudnn_frontend::getLogger() << ". Passed." << std::endl;
+    return false;
+}
+
 }
```
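A hedged sketch of calling the new `check_errata` overload directly; in practice the frontend's plan-selection helpers invoke it. The helper name `plan_is_blocked` and the tag format are illustrative; the errata JSON, cuDNN handle, and built operation graph are assumed to come from earlier setup.

```cpp
#include <cudnn_frontend.h>
#include <nlohmann/json.hpp>
#include <string>

// Sketch: returns true if the errata JSON blocks the given execution plan tag for this
// operation graph. The extra predicate returns true so the decision rests on the JSON rules.
bool plan_is_blocked(const nlohmann::json &errata,
                     const std::string &plan_tag,   // e.g. a "<op>_eng<id>_<knobs>" style tag
                     cudnnHandle_t handle,
                     const cudnn_frontend::OperationGraph &op_graph) {
    auto allow_all = []() { return true; };
    return cudnn_frontend::check_errata(errata, plan_tag, handle, op_graph, allow_all);
}
```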

include/cudnn_frontend_MatMulDesc.h

Lines changed: 28 additions & 0 deletions
```diff
@@ -68,6 +68,8 @@ class MatMulDesc_v8 : public BackendDescriptor {
     operator=(MatMulDesc_v8 const &) = delete;
 
     cudnnDataType_t compute_type = CUDNN_DATA_FLOAT;
+    bool isPadded = false;
+    double paddingValue = 0.0;
 };
 
 ////
@@ -93,6 +95,14 @@ class MatMulDescBuilder_v8 {
         return setComputeType(data_type_);
     }
 
+    //! Set padding value for matmul descriptor
+    auto
+    setPaddingValue(double paddingValue) -> MatMulDescBuilder_v8 & {
+        m_matMulDesc.isPadded = true;
+        m_matMulDesc.paddingValue = paddingValue;
+        return *this;
+    }
+
     //! constructs the MatMulDesc_v8 by calling the cudnn API
     //! Throws the appropriate error message
     MatMulDesc_v8 &&
@@ -118,6 +128,24 @@ class MatMulDescBuilder_v8 {
             return std::move(m_matMulDesc);
         }
 
+#if (CUDNN_VERSION >= 8900)
+        // Setting padding value if matmul desc is padded
+        if (m_matMulDesc.isPadded) {
+            status = cudnnBackendSetAttribute(m_matMulDesc.pointer->get_backend_descriptor(),
+                                              CUDNN_ATTR_MATMUL_PADDING_VALUE,
+                                              CUDNN_TYPE_DOUBLE,
+                                              1,
+                                              &m_matMulDesc.paddingValue);
+            if (status != CUDNN_STATUS_SUCCESS) {
+                set_error_and_throw_exception(
+                    &m_matMulDesc,
+                    status,
+                    "CUDNN_BACKEND_MATMUL_DESCRIPTOR: SetAttribute CUDNN_ATTR_MATMUL_PADDING_VALUE Failed");
+                return std::move(m_matMulDesc);
+            }
+        }
+#endif
+
         // Finalizing the descriptor
         status = cudnnBackendFinalize(m_matMulDesc.pointer->get_backend_descriptor());
         if (status != CUDNN_STATUS_SUCCESS) {
```
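A minimal sketch of setting the new padding value through the builder (cuDNN >= 8.9), assuming the usual `MatMulDescBuilder` alias for `MatMulDescBuilder_v8`; the compute type and padding value are illustrative.

```cpp
#include <cudnn_frontend.h>
#include <iostream>

// Sketch: build a matmul descriptor that carries a padding value (cuDNN >= 8.9).
void build_padded_matmul_desc() {
#if (CUDNN_VERSION >= 8900)
    auto matmul_desc = cudnn_frontend::MatMulDescBuilder()
                           .setComputeType(CUDNN_DATA_FLOAT)
                           .setPaddingValue(0.0)  // value assumed for padded regions of the operands
                           .build();
    std::cout << matmul_desc.describe() << std::endl;
#endif
}
```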

include/cudnn_frontend_Operation.h

Lines changed: 16 additions & 4 deletions
```diff
@@ -564,12 +564,18 @@ class OperationBuilder_v8 {
                 m_operation.operationTag = "Identity";
                 break;
 #endif
+#if (CUDNN_VERSION >= 8900)
+            case CUDNN_POINTWISE_RECIPROCAL:
+                m_operation.operationTag = "Reciprocal";
+                break;
+#endif
 #ifndef NO_DEFAULT_IN_SWITCH
-            default:
+            default:
                 m_operation.operationTag = "UNKNOWN_POINTWISE_OPERATION";
                 break;
 #endif
         }
+
 
         status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(),
                                           CUDNN_ATTR_OPERATION_POINTWISE_PW_DESCRIPTOR,
@@ -2018,8 +2024,11 @@ class OperationBuilder_v8 {
             m_operation.feature_vector.push_back(yTensor_strA[i]);  // n, c, (g), d, h, w
         }
 
-        int64_t alpha_as_int = *reinterpret_cast<int64_t *>(&m_operation.alpha_d);
-        int64_t beta_as_int = *reinterpret_cast<int64_t *>(&m_operation.beta_d);
+        int64_t alpha_as_int;
+        int64_t beta_as_int;
+        std::memcpy((void *)&alpha_as_int, (void *)(&m_operation.alpha_s), sizeof(int64_t));
+        std::memcpy((void *)&beta_as_int, (void *)(&m_operation.beta_s), sizeof(int64_t));
+
 
         m_operation.feature_vector.push_back(alpha_as_int);
         m_operation.feature_vector.push_back(beta_as_int);
@@ -2729,6 +2738,9 @@ class OperationBuilder_v8 {
 #endif
 #if (CUDNN_VERSION >= 8500)
             (m_operation.pointwise_mode == CUDNN_POINTWISE_ERF) ||
+#endif
+#if (CUDNN_VERSION >= 8900)
+            (m_operation.pointwise_mode == CUDNN_POINTWISE_RECIPROCAL) ||
 #endif
             (m_operation.pointwise_mode == CUDNN_POINTWISE_MIN) ||
             (m_operation.pointwise_mode == CUDNN_POINTWISE_MAX) ||
@@ -2758,7 +2770,7 @@
             (m_operation.pointwise_mode == CUDNN_POINTWISE_GELU_BWD) ||
 #if (CUDNN_VERSION >= 8500)
             (m_operation.pointwise_mode == CUDNN_POINTWISE_GELU_APPROX_TANH_BWD) ||
-#endif
+#endif
             (m_operation.pointwise_mode == CUDNN_POINTWISE_SOFTPLUS_BWD) ||
             (m_operation.pointwise_mode == CUDNN_POINTWISE_SWISH_BWD));
```
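The second hunk above replaces a type-punned `reinterpret_cast` with `std::memcpy`, which is the well-defined way to reinterpret a double's bits as an integer and avoids the strict-aliasing warning the commit message mentions. A standalone illustration:

```cpp
#include <cstdint>
#include <cstring>

// Reading a double through an int64_t* breaks strict aliasing; copying the bytes with
// std::memcpy has defined behavior and typically compiles to the same single move.
static int64_t bits_of(double value) {
    static_assert(sizeof(double) == sizeof(int64_t), "assumes a 64-bit double");
    int64_t bits = 0;
    std::memcpy(&bits, &value, sizeof(bits));
    return bits;
}
```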

include/cudnn_frontend_OperationGraph.h

Lines changed: 17 additions & 4 deletions
```diff
@@ -36,6 +36,9 @@
 #include "cudnn_frontend_Operation.h"
 #include "cudnn_frontend_utils.h"
 
+// Compile-time constant for the max number of ops in an op graph
+constexpr int64_t MAX_OPGRAPH_OPS = 50;
+
 namespace cudnn_frontend {
 
 ///
@@ -100,14 +103,24 @@ class OperationGraph_v8 : public BackendDescriptor {
         return opGraphTag;
     }
 
+    bool
+    setFeatureVector(feature_vector_t fv) {
+        feature_vectors.push_back(fv);
+        return true;
+    }
+
     feature_vector_t
     getFeatureVector() const {
-        if (numOps != 1) {
-            return {}; /// We do not support multiop opGraph at this point of time.
-        } else {
+        if (feature_vectors.size() != 0) {
             return feature_vectors[0];
+        } else {
+            return {};
         }
+    }
 
+    const std::array<ManagedOpaqueDescriptor, MAX_OPGRAPH_OPS> &
+    getOps() const {
+        return ops;
     }
 
     private:
@@ -117,7 +130,7 @@ class OperationGraph_v8 : public BackendDescriptor {
     operator=(OperationGraph_v8 const &) = delete;
 
     cudnnHandle_t handle = nullptr;
-    std::array<ManagedOpaqueDescriptor, 50> ops{};
+    std::array<ManagedOpaqueDescriptor, MAX_OPGRAPH_OPS> ops{};
     int64_t numOps = -1;
     std::string opGraphTag = "";
     std::vector<feature_vector_t> feature_vectors;
```
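A hedged sketch of supplying a feature vector by hand through the new `setFeatureVector()`, which `getFeatureVector()` then returns. It assumes `feature_vector_t` is the frontend's vector-of-int64 alias and that `op_graph` was already built; the values are illustrative.

```cpp
#include <cudnn_frontend.h>

// Sketch: attach a manually built feature vector to an operation graph so that
// getFeatureVector() (and anything keyed on it, such as plan caching) can use it.
// Real code would pack dims, strides, and alpha/beta bits as Operation.h does.
void tag_graph(cudnn_frontend::OperationGraph &op_graph) {
    cudnn_frontend::feature_vector_t fv = {1, 64, 56, 56, /* ... */ 0};
    op_graph.setFeatureVector(fv);
    auto stored = op_graph.getFeatureVector();  // now returns the vector set above
    (void)stored;
}
```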

include/cudnn_frontend_PointWiseDesc.h

Lines changed: 5 additions & 1 deletion
```diff
@@ -131,8 +131,12 @@ class PointWiseDesc_v8 : public BackendDescriptor {
             case CUDNN_POINTWISE_BINARY_SELECT:
                 return 4;
 #endif
+#if (CUDNN_VERSION >= 8900)
+            case CUDNN_POINTWISE_RECIPROCAL:
+                return 2;
+#endif
 #ifndef NO_DEFAULT_IN_SWITCH
-            default:
+            default:
                 return -1;
 #endif
         }
```
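A minimal sketch of creating a descriptor for the new reciprocal mode (cuDNN >= 8.9), which the updated port-count switch above reports as a two-port op (one input, one output). Builder and setter names follow the existing pointwise builder pattern and should be checked against your frontend version.

```cpp
#include <cudnn_frontend.h>
#include <iostream>

// Sketch: a pointwise descriptor in the new reciprocal mode (y = 1/x), cuDNN >= 8.9.
void build_reciprocal_desc() {
#if (CUDNN_VERSION >= 8900)
    auto pw_desc = cudnn_frontend::PointWiseDescBuilder()
                       .setMode(CUDNN_POINTWISE_RECIPROCAL)
                       .setComputeType(CUDNN_DATA_FLOAT)  // assumed setter; older versions call this setMathPrecision
                       .build();
    std::cout << pw_desc.describe() << std::endl;
#endif
}
```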
