Skip to content

Commit 8801fd7

Browse files
authored
# v1.11.0 release (#136)
## cudnn frontend v1.11 release notes cudnn frontend v1.11 is the preferred cudnn frontend version for cudnn version 9.8.0 and above. With cuDNN frontend v1.11, the minimum supported cudnn version is 9.0.0. ## New API - cudnn frontend v1.11 adds a flexible score modifier to the python SDPA API. Samples showcasing soft capping of the attention scores and an arrow mask are available in the [cudnn_frontend/test/python/test_flexible_sdpa.py](https://github.com/NVIDIA/cuDNN-frontend/blob/main/cudnn_frontend/test/python/test_flexible_sdpa.py) file. A sample usage of the score modifier is shown below: ``` score_mod=partial( custom_mask, mod_tensor=mod_tensor, neg_inf=neg_inf_tensor, seq_len_q=seq_len_q, seq_len_kv=seq_len_kv, ) ``` - The Concatenate operation merges two or more tensors into one, along the specified axis. The user may also specify an in-place merge. ``` std::shared_ptr<Tensor_attributes> concatenate(std::vector<std::shared_ptr<Tensor_attributes>>, Concatenate_attributes); ``` - pip wheels compatible with the windows x86_64 architecture are now available on [pypi](https://pypi.org/project/nvidia-cudnn-frontend/). - The sdpa paged attention API now supports a ragged Q tensor when used with cudnn version 9.7.0 and above. ## Improvements - Users can now pass the CMake flag `-DCMAKE_CXX_FLAGS="-DNV_CUDNN_FRONTEND_DISABLE_LOGGING"` to disable logging in the cuDNN frontend. - Added a new sample to showcase native cudagraph creation from cudnn for the sdpa bprop operation. Fixed a bug when using the update_cuda_graph API to update the cuda graph for the sdpa bprop operation. ## Bug Fixes - Fixed a memory leak in the test harness for some legacy tests that use ragged tensors. - Fixed a bug introduced in the benchmarking script that prevented the sdpa cudnn operation from being executed; the cause was that the `use_padding_mask` attribute had been made mandatory for the sdpa operation, and this has been fixed as well. 
- Updated the paged attention sample so that it does not cause an illegal memory access when the dimensions of the tensors in the sample are changed. - Updated the DgradDReluBNBwdWeight sample to perform the correct operation for the dgrad + drelu fusion.
1 parent 5040925 commit 8801fd7

36 files changed

+2276
-372
lines changed

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
cmake_minimum_required(VERSION 3.17)
22

3-
project(cudnn_frontend VERSION 1.10.0)
3+
project(cudnn_frontend VERSION 1.11.0)
44

55
option(CUDNN_FRONTEND_SKIP_JSON_LIB "Defines whether FE should not include nlohmann/json.hpp." OFF)
66
option(CUDNN_FRONTEND_BUILD_SAMPLES "Defines if samples are built or not." ON)

include/cudnn_frontend.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
* The cuDNN Frontend API is a C++ header-only library that demonstrates how to use the cuDNN C backend API. The cuDNN C
3030
* backend API is documented in the cuDNN developer guide.
3131
*
32-
* \section Need Why use Frontend API
32+
* \section Why use Frontend API
3333
*
3434
* Consider the following code snippet which showcases cudnnBackendTensor creation using the backend API and its
3535
* equivalent front-end API code. Many among the backend constructs follow similar pattern.

include/cudnn_frontend/graph_interface.h

Lines changed: 69 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#pragma once
22

33
#include <unordered_map>
4+
#include <string>
45

56
#include "../cudnn_frontend_version.h"
67
#include "node/batchnorm.h"
@@ -25,6 +26,7 @@
2526
#include "node/sdpa_fp8_bwd.h"
2627
#include "node/block_scale_quantize.h"
2728
#include "node/block_scale_dequantize.h"
29+
#include "node/concatenate.h"
2830

2931
#include "backend/backend_descriptor.h"
3032
#include "plans.h"
@@ -293,8 +295,8 @@ class Graph : public ICudnn, public INode {
293295
CHECK_CUDNN_FRONTEND_ERROR(collect_tensors_in_workspace_subtree(workspace_modifications, workspace_offset));
294296

295297
for (auto const &[uid, data] : workspace_modifications) {
296-
(void)uid;
297298
const auto &[operation_type, offset, vec_data] = data;
299+
uid_to_device_ptrs[uid] = static_cast<char *>(workspace) + offset;
298300

299301
// 0 means memcpy
300302
if (operation_type == 0) {
@@ -304,7 +306,6 @@ class Graph : public ICudnn, public INode {
304306
vec_data.data(),
305307
vec_data.size() * sizeof(float),
306308
cudaMemcpyHostToDevice));
307-
uid_to_device_ptrs[uid] = static_cast<char *>(workspace) + offset;
308309
}
309310
// 1 means memset
310311
else if (operation_type == 1) {
@@ -322,12 +323,19 @@ class Graph : public ICudnn, public INode {
322323

323324
CHECK_CUDA_ERROR(detail::cuda_graph_add_memset_node_set_params(current_node, &params));
324325
}
325-
// Other values do not correspond to cuda APIs
326+
// Other values do not correspond to CUDA graph nodes
327+
else {
328+
continue;
329+
}
326330

327-
CHECK_CUDA_ERROR(detail::cuda_graph_node_get_dependent_nodes(current_node, nullptr, &num_root_nodes));
331+
size_t num_dependent_nodes;
332+
CHECK_CUDA_ERROR(detail::cuda_graph_node_get_dependent_nodes(current_node, nullptr, &num_dependent_nodes));
328333
RETURN_CUDNN_FRONTEND_ERROR_IF(
329-
num_root_nodes != 1, error_code_t::INVALID_VALUE, "cudnn_cuda_graph should have exactly 1 root node.");
330-
CHECK_CUDA_ERROR(detail::cuda_graph_node_get_dependent_nodes(current_node, &current_node, &num_root_nodes));
334+
num_dependent_nodes != 1,
335+
error_code_t::INVALID_VALUE,
336+
"Each node of cudnn_cuda_graph before the backend graph node should have exactly 1 dependent node.");
337+
CHECK_CUDA_ERROR(
338+
detail::cuda_graph_node_get_dependent_nodes(current_node, &current_node, &num_dependent_nodes));
331339
}
332340

333341
// Make sure device pointer is provided for all uids expected for this plan
@@ -357,7 +365,10 @@ class Graph : public ICudnn, public INode {
357365
error_code_t::CUDNN_BACKEND_API_FAILED,
358366
"Failed to create variant pack's backend descriptor.");
359367

360-
CHECK_CUDNN_FRONTEND_ERROR(create_variant_pack(variant_pack_descriptor, device_ptrs, uids, workspace));
368+
// offset workspace by the already used fe graph workspace
369+
// this is where cudnn backend can start using workspace for its execution plans
370+
void *cudnn_workspace = static_cast<char *>(workspace) + fe_workspace_size;
371+
CHECK_CUDNN_FRONTEND_ERROR(create_variant_pack(variant_pack_descriptor, device_ptrs, uids, cudnn_workspace));
361372

362373
int64_t candidate = plans.candidate;
363374
CHECK_CUDNN_FRONTEND_ERROR(plans.is_plan_index_executable(candidate));
@@ -367,8 +378,9 @@ class Graph : public ICudnn, public INode {
367378
backend_cuda_graph));
368379

369380
// There should be nothing after the backend graph
370-
CHECK_CUDA_ERROR(detail::cuda_graph_node_get_dependent_nodes(current_node, nullptr, &num_root_nodes));
371-
RETURN_CUDNN_FRONTEND_ERROR_IF(num_root_nodes != 0,
381+
size_t num_dependent_nodes;
382+
CHECK_CUDA_ERROR(detail::cuda_graph_node_get_dependent_nodes(current_node, nullptr, &num_dependent_nodes));
383+
RETURN_CUDNN_FRONTEND_ERROR_IF(num_dependent_nodes != 0,
372384
error_code_t::INVALID_VALUE,
373385
"cudnn_cuda_graph should have no graph nodes after the backend graph node.");
374386

@@ -431,8 +443,8 @@ class Graph : public ICudnn, public INode {
431443
CHECK_CUDNN_FRONTEND_ERROR(collect_tensors_in_workspace_subtree(workspace_modifications, workspace_offset));
432444

433445
for (auto const &[uid, data] : workspace_modifications) {
434-
(void)uid;
435446
const auto &[operation_type, offset, vec_data] = data;
447+
uid_to_device_ptrs[uid] = static_cast<char *>(workspace) + offset;
436448

437449
cudaGraphNode_t node = nullptr;
438450

@@ -446,7 +458,6 @@ class Graph : public ICudnn, public INode {
446458
vec_data.data(),
447459
vec_data.size() * sizeof(float),
448460
cudaMemcpyHostToDevice));
449-
uid_to_device_ptrs[uid] = static_cast<char *>(workspace) + offset;
450461
}
451462
// 1 means memset
452463
else if (operation_type == 1) {
@@ -465,7 +476,10 @@ class Graph : public ICudnn, public INode {
465476
CHECK_CUDA_ERROR(detail::cuda_graph_add_memset_node(
466477
&node, cudnn_cuda_graph, &last_node, last_node != nullptr, &params));
467478
}
468-
// Other values do not correspond to cuda APIs
479+
// Other values do not correspond to CUDA graph nodes
480+
else {
481+
continue;
482+
}
469483

470484
last_node = node;
471485
}
@@ -495,7 +509,11 @@ class Graph : public ICudnn, public INode {
495509
RETURN_CUDNN_FRONTEND_ERROR_IF(variant_pack_descriptor.get_status() != CUDNN_STATUS_SUCCESS,
496510
error_code_t::CUDNN_BACKEND_API_FAILED,
497511
"Failed to create variant pack's backend descriptor.");
498-
CHECK_CUDNN_FRONTEND_ERROR(create_variant_pack(variant_pack_descriptor, device_ptrs, uids, workspace));
512+
513+
// offset workspace by the already used fe graph workspace
514+
// this is where cudnn backend can start using workspace for its execution plans
515+
void *cudnn_workspace = static_cast<char *>(workspace) + fe_workspace_size;
516+
CHECK_CUDNN_FRONTEND_ERROR(create_variant_pack(variant_pack_descriptor, device_ptrs, uids, cudnn_workspace));
499517

500518
// Get the plan candidate. It only makes to sense to make cuda graph after execution plan has been built.
501519
// And in that case the candidate would have been set.
@@ -1019,6 +1037,9 @@ class Graph : public ICudnn, public INode {
10191037
std::shared_ptr<Tensor_attributes>,
10201038
Block_scale_dequantize_attributes);
10211039

1040+
std::shared_ptr<Tensor_attributes> concatenate(std::vector<std::shared_ptr<Tensor_attributes>>,
1041+
Concatenate_attributes);
1042+
10221043
[[deprecated]] std::array<std::shared_ptr<Tensor_attributes>, 2>
10231044
scaled_dot_product_flash_attention(std::shared_ptr<Tensor_attributes> q,
10241045
std::shared_ptr<Tensor_attributes> k,
@@ -1168,11 +1189,23 @@ class Graph : public ICudnn, public INode {
11681189
short_node["inputs"] = {};
11691190
short_node["outputs"] = {};
11701191

1192+
auto node_name = sub_node["tag"].get<std::string>();
1193+
auto i = 0;
11711194
// Process node inputs
11721195
for (const auto &input : sub_node["inputs"]) {
1173-
// Extract port_name and tensor_name
1174-
auto port_name = input[0].get<std::string>();
1175-
auto tensor_info = input[1];
1196+
std::string port_name;
1197+
json tensor_info;
1198+
1199+
if (node_name == "CONCATENATE") {
1200+
// Extract port_name and tensor_name
1201+
port_name = std::to_string(i);
1202+
tensor_info = input;
1203+
i++;
1204+
} else {
1205+
// Extract port_name and tensor_name
1206+
port_name = input[0].get<std::string>();
1207+
tensor_info = input[1];
1208+
}
11761209

11771210
if (tensor_info.is_null()) {
11781211
continue;
@@ -2161,10 +2194,29 @@ Graph::block_scale_dequantize(std::shared_ptr<Tensor_attributes> x,
21612194
return Y;
21622195
}
21632196

2197+
inline std::shared_ptr<Tensor_attributes>
2198+
Graph::concatenate(std::vector<std::shared_ptr<Tensor_attributes>> x, Concatenate_attributes attributes) {
2199+
if (attributes.name.empty()) {
2200+
attributes.name += std::to_string(sub_nodes.size());
2201+
}
2202+
2203+
// Set outputs
2204+
auto Y = attributes.outputs[Concatenate_attributes::output_names::Y] = output_tensor(attributes.name + "::Y");
2205+
2206+
// Set inputs
2207+
for (auto &element : x) {
2208+
attributes.inputs.push_back(element);
2209+
}
2210+
2211+
sub_nodes.emplace_back(std::make_unique<ConcatenateNode>(std::move(attributes), context));
2212+
2213+
return Y;
2214+
}
2215+
21642216
static inline std::ostream &
21652217
operator<<(std::ostream &os, Graph const &graph) {
21662218
os << graph.print();
21672219
return os;
21682220
}
21692221

2170-
} // namespace cudnn_frontend::graph
2222+
} // namespace cudnn_frontend::graph

0 commit comments

Comments
 (0)