Skip to content

Add samples for SYCL-Graph extension #2643

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 10 commits into
base: development
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 66 additions & 0 deletions DirectProgramming/C++SYCL/SYCL-Graph/common/aspect_queries.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
//==============================================================
// Copyright © 2025 Codeplay Software
//
// SPDX-License-Identifier: MIT
// =============================================================

#pragma once

#include <sycl/sycl.hpp>

#include <cstdlib>
#include <iostream>
#include <string>

// Checks that `dev` supports every aspect used across the full set of
// SYCL-Graph samples: whole-graph support (ext_oneapi_graph), limited graph
// support (ext_oneapi_limited_graph) and shared USM allocations.
// All missing aspects are accumulated and reported on stderr before exiting
// with a non-zero status, so the user sees every problem in a single run.
inline void ensure_full_aspects_support(const sycl::device &dev) {
  std::string error_msg;

  if (!dev.has(sycl::aspect::ext_oneapi_graph)) {
    error_msg += "Error: The device does NOT support ext_oneapi_graph. \n";
  }
  if (!dev.has(sycl::aspect::ext_oneapi_limited_graph)) {
    error_msg +=
        "Error: The device does NOT support ext_oneapi_limited_graph. \n";
  }
  if (!dev.has(sycl::aspect::usm_shared_allocations)) {
    error_msg +=
        "Error: The device does NOT support usm_shared_allocations. \n";
  }

  if (!error_msg.empty()) {
    std::cerr << error_msg;
    std::exit(1);
  }
}

// Checks that `dev` supports the minimum aspects required by the samples:
// limited graph support (ext_oneapi_limited_graph) and shared USM
// allocations. Unlike ensure_full_aspects_support, full ext_oneapi_graph
// support is not required here. All missing aspects are reported on stderr
// before exiting with a non-zero status.
inline void ensure_required_aspects_support(const sycl::device &dev) {
  std::string error_msg;

  if (!dev.has(sycl::aspect::ext_oneapi_limited_graph)) {
    error_msg +=
        "Error: The device does NOT support ext_oneapi_limited_graph. \n";
  }
  if (!dev.has(sycl::aspect::usm_shared_allocations)) {
    error_msg +=
        "Error: The device does NOT support usm_shared_allocations. \n";
  }

  if (!error_msg.empty()) {
    std::cerr << error_msg;
    std::exit(1);
  }
}

// Checks that `dev` supports the full graph feature set (ext_oneapi_graph,
// which includes explicit graph update); exits with a non-zero status and an
// error message on stderr if not.
inline void ensure_full_graph_support(const sycl::device &dev) {
  if (!dev.has(sycl::aspect::ext_oneapi_graph)) {
    std::cerr << "Error: The device does NOT support ext_oneapi_graph.\n";
    std::exit(1);
  }
}

// Checks that `dev` supports at least the limited graph feature set
// (ext_oneapi_limited_graph); exits with a non-zero status and an error
// message on stderr if not.
inline void ensure_graph_support(const sycl::device &dev) {
  if (!dev.has(sycl::aspect::ext_oneapi_limited_graph)) {
    std::cerr
        << "Error: The device does NOT support ext_oneapi_limited_graph.\n";
    std::exit(1);
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
############################################################################
## Copyright © 2025 Codeplay Software
##
## SPDX-License-Identifier: MIT
############################################################################

# ------------------------------------------------
# Detect available backends
# ------------------------------------------------
# `sycl-ls | grep -q <name>` exits 0 when the backend appears in the device
# list; the leading `!` inverts that, so the captured RESULT_VARIABLE is
# 1 (truthy to CMake) when the backend is present and 0 (falsy) otherwise.
execute_process(
  COMMAND bash -c "! sycl-ls | grep -q cuda"
  RESULT_VARIABLE CUDA_BACKEND_AVAILABLE)
execute_process(
  COMMAND bash -c "! sycl-ls | grep -q hip"
  RESULT_VARIABLE HIP_BACKEND_AVAILABLE)
execute_process(
  COMMAND bash -c "! sycl-ls | grep -q 'opencl\\|level_zero'"
  RESULT_VARIABLE SPIR_BACKEND_AVAILABLE)

set(ENABLE_CUDA ${CUDA_BACKEND_AVAILABLE} CACHE BOOL "Build with CUDA target")
set(ENABLE_HIP ${HIP_BACKEND_AVAILABLE} CACHE BOOL "Build with HIP target")
set(ENABLE_SPIR ${SPIR_BACKEND_AVAILABLE} CACHE BOOL "Build with spir64 target")
# Kept as a CMake list and joined with "," only when composing the final
# flags, so the first enabled target does not produce a stray leading comma.
set(SYCL_TARGETS "")

# ------------------------------------------------
# Configure CUDA target
# ------------------------------------------------
if(${ENABLE_CUDA})
  list(APPEND SYCL_TARGETS "nvptx64-nvidia-cuda")
  set(DEFAULT_CUDA_COMPUTE_CAPABILITY "50")
  # The compute capability is a string value such as "80"; cache it as
  # STRING, not BOOL.
  set(CUDA_COMPUTE_CAPABILITY "" CACHE STRING
    "CUDA architecture (compute capability), e.g. 80 for sm_80. Default value is auto-configured using nvidia-smi.")
  # Auto-configure if not specified by user
  if("${CUDA_COMPUTE_CAPABILITY}" STREQUAL "")
    execute_process(
      COMMAND bash -c "which nvidia-smi >/dev/null && nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | tr -d '.'"
      OUTPUT_VARIABLE CUDA_COMPUTE_CAPABILITY
      OUTPUT_STRIP_TRAILING_WHITESPACE)
  endif()
  # Warn and fall back to the default if not specified and auto-configuration
  # failed. A plain (non-cache) set() is required here: the cache entry
  # already exists, so `set(... CACHE ...)` without FORCE would not update it.
  if("${CUDA_COMPUTE_CAPABILITY}" STREQUAL "")
    message(WARNING "Failed to autoconfigure CUDA_COMPUTE_CAPABILITY using nvidia-smi. Will default to sm_${DEFAULT_CUDA_COMPUTE_CAPABILITY}")
    set(CUDA_COMPUTE_CAPABILITY ${DEFAULT_CUDA_COMPUTE_CAPABILITY})
  else()
    message(STATUS "Enabled SYCL target CUDA with Compute Capability sm_${CUDA_COMPUTE_CAPABILITY}")
  endif()
endif()

# ------------------------------------------------
# Configure HIP target
# ------------------------------------------------
if(${ENABLE_HIP})
  list(APPEND SYCL_TARGETS "amdgcn-amd-amdhsa")
  set(DEFAULT_HIP_GFX_ARCH "gfx906")
  # The gfx arch is a string value such as "gfx90a"; cache it as STRING.
  set(HIP_GFX_ARCH "" CACHE STRING
    "HIP architecture tag, e.g. gfx90a. Default value is auto-configured using rocminfo.")
  # Auto-configure if not specified by user. (This must test HIP_GFX_ARCH;
  # the previous check of CUDA_COMPUTE_CAPABILITY was a copy/paste bug.)
  if("${HIP_GFX_ARCH}" STREQUAL "")
    execute_process(
      COMMAND bash -c "which rocminfo >/dev/null && rocminfo | grep -o 'gfx[0-9]*' | head -n 1"
      OUTPUT_VARIABLE HIP_GFX_ARCH
      OUTPUT_STRIP_TRAILING_WHITESPACE)
  endif()
  # Warn and fall back to the default if not specified and auto-configuration
  # failed (plain set() — see the CUDA note above about cache entries).
  if("${HIP_GFX_ARCH}" STREQUAL "")
    message(WARNING "Failed to autoconfigure HIP_GFX_ARCH using rocminfo. Will default to ${DEFAULT_HIP_GFX_ARCH}")
    set(HIP_GFX_ARCH ${DEFAULT_HIP_GFX_ARCH})
  else()
    message(STATUS "Enabled SYCL target HIP with gfx arch ${HIP_GFX_ARCH}")
  endif()
endif()

# ------------------------------------------------
# Configure spir64 target
# ------------------------------------------------
if(${ENABLE_SPIR})
  list(APPEND SYCL_TARGETS "spir64")
  message(STATUS "Enabled SYCL target spir64")
endif()

# ------------------------------------------------
# Configure the complete SYCL flags
# ------------------------------------------------
list(JOIN SYCL_TARGETS "," SYCL_TARGETS_JOINED)
set(SYCL_FLAGS -fsycl -fsycl-targets=${SYCL_TARGETS_JOINED})
if(${ENABLE_CUDA})
  list(APPEND SYCL_FLAGS -Xsycl-target-backend=nvptx64-nvidia-cuda --offload-arch=sm_${CUDA_COMPUTE_CAPABILITY})
endif()
if(${ENABLE_HIP})
  list(APPEND SYCL_FLAGS -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=${HIP_GFX_ARCH})
endif()
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
############################################################################
## Copyright © 2025 Codeplay Software
##
## SPDX-License-Identifier: MIT
############################################################################

# 3.13 is the minimum because target_link_options() was added in CMake 3.13.
cmake_minimum_required(VERSION 3.13)
project(SYCL-Graph-Samples)

# Set global flags: require C++17 with no fallback to an older standard.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Configure SYCL
include("${CMAKE_SOURCE_DIR}/../common/cmake/ConfigureSYCL.cmake")

# Output directory for executables
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR} CACHE PATH "" FORCE)

# Find all .cpp files in the src/ directory. CONFIGURE_DEPENDS re-runs the
# glob at build time so newly added samples are picked up without a manual
# re-configure.
file(GLOB SAMPLE_SOURCES CONFIGURE_DEPENDS "${CMAKE_SOURCE_DIR}/src/*.cpp")

# Add one executable per sample source file
foreach(SOURCE_FILE ${SAMPLE_SOURCES})
  # Executable name is the source file name without the extension
  get_filename_component(EXE_NAME ${SOURCE_FILE} NAME_WE)

  # Create executable
  add_executable(${EXE_NAME} ${SOURCE_FILE})

  # Add the SYCL compile and link flags produced by ConfigureSYCL.cmake
  target_compile_options(${EXE_NAME} PUBLIC ${SYCL_FLAGS})
  target_link_options(${EXE_NAME} PUBLIC ${SYCL_FLAGS})
endforeach()
108 changes: 108 additions & 0 deletions DirectProgramming/C++SYCL/SYCL-Graph/diamond-dependency/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# Diamond Dependency Sample

Code example demonstrating the usage of [`sycl_ext_oneapi_graph`](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc) extension with queue recording API.

| Area | Description
|:--- |:---
| What you will learn | How to use the SYCL-Graph extension to optimize kernel execution.
| Time to complete | 15 minutes


## Purpose

This code example shows how to record commands submitted to a SYCL `queue` into a `command_graph` object. Once recording is complete, the graph is finalized, which creates a new `command_graph` object in `graph_state::executable` that is ready for submission. Lastly, the executable graph is submitted as a whole to a queue with the new `ext_oneapi_graph()` function, and it can be replayed as many times as needed.



## Prerequisites
| Optimized for | Description
|:--- |:---
| OS | Linux* Ubuntu* <br>Windows* 10, 11
| Hardware | Intel GPU
| Software | Intel® oneAPI DPC++/C++ Compiler


## Key Implementation Details

Key SYCL* concepts demonstrated in the code sample include using the command graph extension with buffers and accessors. This sample uses the queue recording API.

>**Note**: For comprehensive information about oneAPI programming, see the *[Intel® oneAPI Programming Guide](https://www.intel.com/content/www/us/en/docs/oneapi/programming-guide/current/overview.html)*. (Use search or the table of contents to find relevant information quickly.)


## Set Environment Variables

When working with the command-line interface (CLI), you should configure the oneAPI toolkits using environment variables. Set up your CLI environment by sourcing the `setvars` script every time you open a new terminal window. This practice ensures that your compiler, libraries, and tools are ready for development.

> **Note**: You can use [Modulefiles scripts](https://www.intel.com/content/www/us/en/docs/oneapi/programming-guide/current/use-modulefiles-with-linux.html) to set up your development environment. The modulefiles scripts work with all Linux shells.

> **Note**: If you want only specific components or versions of those components, use a [setvars config file](https://www.intel.com/content/www/us/en/docs/oneapi/programming-guide/current/use-a-config-file-for-setvars-sh-on-linux-or-macos.html) to set up your development environment.


## Build the `Diamond Dependency` Sample for CPU and GPU

> **Note**: If you have not already done so, set up your CLI
> environment by sourcing the `setvars` script in the root of your oneAPI installation.
>
> Linux*:
> - For system wide installations: `. /opt/intel/oneapi/setvars.sh`
> - For private installations: ` . ~/intel/oneapi/setvars.sh`
> - For non-POSIX shells, like csh, use the following command: `bash -c 'source <install-dir>/setvars.sh ; exec csh'`
>
> Windows*:
> - `C:\Program Files (x86)\Intel\oneAPI\setvars.bat`
> - Windows PowerShell*, use the following command: `cmd.exe "/K" '"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" && powershell'`
>
> For more information on configuring environment variables or if you have a Unified Directory Layout, see
*[Use the setvars and oneapi-vars Scripts with Linux*](https://www.intel.com/content/www/us/en/docs/oneapi/programming-guide/current/use-the-setvars-script-with-linux-or-macos.html)* or *[Use the setvars and oneapi-vars Scripts with Windows*](https://www.intel.com/content/www/us/en/docs/oneapi/programming-guide/current/use-the-setvars-script-with-windows.html)*.

### On Linux*

The project uses a standard CMake build configuration system. Ensure the SYCL compiler is used by the configuration either by setting the environment variable `CXX=<compiler>` or passing the configuration flag
`-DCMAKE_CXX_COMPILER=<compiler>` where `<compiler>` is your SYCL compiler's
executable (for example Intel `icpx` or LLVM `clang++`).

1. Change to the sample directory.
2. Build the program.
```
mkdir -p build && cd build
cmake .. -DCMAKE_CXX_COMPILER=<compiler>
cmake --build .
```

The CMake configuration automatically detects the available SYCL backends and
enables the SPIR/CUDA/HIP targets for the device code, including the corresponding
architecture flags. If desired, these auto-configured cmake options may be overridden
with the following ones:

| OPTION | VALUE
|:--- |:---
| ENABLE_SPIR | ON or OFF
| ENABLE_CUDA | ON or OFF
| ENABLE_HIP | ON or OFF
| CUDA_COMPUTE_CAPABILITY | Integer, e.g. `70` meaning capability 7.0 (arch `sm_70`)
| HIP_GFX_ARCH | String, e.g. `gfx1030`

#### Troubleshooting

If an error occurs, you can get more details by running `make` with
the `VERBOSE=1` argument:
```
make VERBOSE=1
```
If you receive an error message, troubleshoot the problem using the **Diagnostics Utility for Intel® oneAPI Toolkits**. The diagnostic utility provides configuration and system checks to help find missing dependencies, permissions errors, and other issues. See the *[Diagnostics Utility for Intel® oneAPI Toolkits User Guide](https://www.intel.com/content/www/us/en/docs/oneapi/user-guide-diagnostic-utility/current/overview.html)* for more information on using the utility.


## Run the `Diamond Dependency` Sample

### On Linux

1. Run the program.
```
./diamondDependency
```

## License

Code samples are licensed under the MIT license. See [License.txt](https://github.com/oneapi-src/oneAPI-samples/blob/master/License.txt) for details.

Third-party program Licenses can be found here: [third-party-programs.txt](https://github.com/oneapi-src/oneAPI-samples/blob/master/third-party-programs.txt).
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
//==============================================================
// Copyright © 2025 Codeplay Software
//
// SPDX-License-Identifier: MIT
// =============================================================

#include "../../common/aspect_queries.hpp"

#include <sycl/sycl.hpp>

namespace sycl_ext = sycl::ext::oneapi::experimental;
using namespace sycl;

// Diamond-dependency sample: records four kernels into a SYCL command_graph
// via the queue recording API, finalizes the graph, then submits it to the
// queue for execution as a single operation.
int main() {
  constexpr size_t Size = 1024; // elements per buffer

  queue Queue{};

  // Exit with an error message if the device lacks graph support.
  ensure_graph_support(Queue.get_device());

  std::vector<int> DataA(Size), DataB(Size), DataC(Size);

  // Lifetime of buffers must exceed the lifetime of graphs they are used in.
  // Write-back on buffer destruction is disabled; the host vectors are not
  // read after the graph executes.
  buffer<int> BufferA{DataA.data(), range<1>{Size}};
  BufferA.set_write_back(false);
  buffer<int> BufferB{DataB.data(), range<1>{Size}};
  BufferB.set_write_back(false);
  buffer<int> BufferC{DataC.data(), range<1>{Size}};
  BufferC.set_write_back(false);

  {
    // New object representing graph of command-groups. The
    // assume_buffer_outlives_graph property is a promise that the buffers
    // above outlive the graph (they do: the graph lives in this inner scope).
    sycl_ext::command_graph Graph(
        Queue.get_context(), Queue.get_device(),
        {sycl_ext::property::graph::assume_buffer_outlives_graph{}});

    // `Queue` will be put in the recording state where commands are recorded to
    // `Graph` rather than submitted for execution immediately.
    Graph.begin_recording(Queue);

    // Record commands to `Graph` with the following diamond topology; edges
    // come from the buffer accessors (A/B/C label the buffer that links each
    // pair of kernels).
    //
    //          increment_kernel
    //            /          \
    //        A->/            \<-A
    //          /              \
    //    add_kernel       subtract_kernel
    //          \              /
    //        B->\            /<-C
    //            \          /
    //          decrement_kernel

    // Root node: A[i]++.
    Queue.submit([&](handler &CGH) {
      auto Pdata = BufferA.get_access<access::mode::read_write>(CGH);
      CGH.parallel_for<class Increment_kernel>(
          range<1>(Size), [=](item<1> Id) { Pdata[Id]++; });
    });

    // B[i] += A[i] — depends on increment_kernel through BufferA.
    Queue.submit([&](handler &CGH) {
      auto Pdata1 = BufferA.get_access<access::mode::read>(CGH);
      auto Pdata2 = BufferB.get_access<access::mode::read_write>(CGH);
      CGH.parallel_for<class Add_kernel>(
          range<1>(Size), [=](item<1> Id) { Pdata2[Id] += Pdata1[Id]; });
    });

    // C[i] -= A[i] — also depends on increment_kernel through BufferA.
    Queue.submit([&](handler &CGH) {
      auto Pdata1 = BufferA.get_access<access::mode::read>(CGH);
      auto Pdata2 = BufferC.get_access<access::mode::read_write>(CGH);
      CGH.parallel_for<class Subtract_kernel>(
          range<1>(Size), [=](item<1> Id) { Pdata2[Id] -= Pdata1[Id]; });
    });

    // B[i]--, C[i]-- — joins both branches through BufferB and BufferC.
    Queue.submit([&](handler &CGH) {
      auto Pdata1 = BufferB.get_access<access::mode::read_write>(CGH);
      auto Pdata2 = BufferC.get_access<access::mode::read_write>(CGH);
      CGH.parallel_for<class Decrement_kernel>(range<1>(Size), [=](item<1> Id) {
        Pdata1[Id]--;
        Pdata2[Id]--;
      });
    });

    // `Queue` will be returned to the executing state where commands are
    // submitted immediately for execution.
    Graph.end_recording();

    // Finalize the modifiable graph to create an executable graph that can be
    // submitted for execution.
    auto Exec_graph = Graph.finalize();

    // Execute graph and wait for it to complete.
    Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(Exec_graph); })
        .wait();
  }

  return 0;
}
Loading