Skip to content

Add samples for SYCL-Graph extension #2643

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 10 commits into
base: development
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 66 additions & 0 deletions DirectProgramming/C++SYCL/SYCL-Graph/common/aspect_queries.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
//==============================================================
// Copyright © 2025 Codeplay Software
//
// SPDX-License-Identifier: MIT
// =============================================================

#pragma once

#include <sycl/sycl.hpp>

#include <cstdlib>
#include <iostream>
#include <string>

// Checks that `dev` supports every aspect used across the full set of
// SYCL-Graph samples: whole-graph support (ext_oneapi_graph), limited graph
// support (ext_oneapi_limited_graph) and shared USM allocations.
// All missing aspects are accumulated and reported on stderr before exiting
// with a non-zero status, so the user sees every problem in a single run.
inline void ensure_full_aspects_support(const sycl::device &dev) {
  std::string error_msg;

  if (!dev.has(sycl::aspect::ext_oneapi_graph)) {
    error_msg += "Error: The device does NOT support ext_oneapi_graph. \n";
  }
  if (!dev.has(sycl::aspect::ext_oneapi_limited_graph)) {
    error_msg +=
        "Error: The device does NOT support ext_oneapi_limited_graph. \n";
  }
  if (!dev.has(sycl::aspect::usm_shared_allocations)) {
    error_msg +=
        "Error: The device does NOT support usm_shared_allocations. \n";
  }

  if (!error_msg.empty()) {
    std::cerr << error_msg;
    std::exit(1);
  }
}

// Checks that `dev` supports the minimum aspects required by the samples:
// limited graph support (ext_oneapi_limited_graph) and shared USM
// allocations. Unlike ensure_full_aspects_support, full ext_oneapi_graph
// support is not required here. All missing aspects are reported on stderr
// before exiting with a non-zero status.
inline void ensure_required_aspects_support(const sycl::device &dev) {
  std::string error_msg;

  if (!dev.has(sycl::aspect::ext_oneapi_limited_graph)) {
    error_msg +=
        "Error: The device does NOT support ext_oneapi_limited_graph. \n";
  }
  if (!dev.has(sycl::aspect::usm_shared_allocations)) {
    error_msg +=
        "Error: The device does NOT support usm_shared_allocations. \n";
  }

  if (!error_msg.empty()) {
    std::cerr << error_msg;
    std::exit(1);
  }
}

// Checks that `dev` supports the full graph feature set (ext_oneapi_graph,
// which includes explicit graph update); exits with a non-zero status and an
// error message on stderr if not.
inline void ensure_full_graph_support(const sycl::device &dev) {
  if (!dev.has(sycl::aspect::ext_oneapi_graph)) {
    std::cerr << "Error: The device does NOT support ext_oneapi_graph.\n";
    std::exit(1);
  }
}

// Checks that `dev` supports at least the limited graph feature set
// (ext_oneapi_limited_graph); exits with a non-zero status and an error
// message on stderr if not.
inline void ensure_graph_support(const sycl::device &dev) {
  if (!dev.has(sycl::aspect::ext_oneapi_limited_graph)) {
    std::cerr
        << "Error: The device does NOT support ext_oneapi_limited_graph.\n";
    std::exit(1);
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
############################################################################
## Copyright © 2025 Codeplay Software
##
## SPDX-License-Identifier: MIT
############################################################################

# ------------------------------------------------
# Detect available backends
# ------------------------------------------------
# `sycl-ls | grep -q <name>` exits 0 when the backend appears in the device
# list; the leading `!` inverts that, so the captured RESULT_VARIABLE is
# 1 (truthy to CMake) when the backend is present and 0 (falsy) otherwise.
execute_process(
  COMMAND bash -c "! sycl-ls | grep -q cuda"
  RESULT_VARIABLE CUDA_BACKEND_AVAILABLE)
execute_process(
  COMMAND bash -c "! sycl-ls | grep -q hip"
  RESULT_VARIABLE HIP_BACKEND_AVAILABLE)
execute_process(
  COMMAND bash -c "! sycl-ls | grep -q 'opencl\\|level_zero'"
  RESULT_VARIABLE SPIR_BACKEND_AVAILABLE)

set(ENABLE_CUDA ${CUDA_BACKEND_AVAILABLE} CACHE BOOL "Build with CUDA target")
set(ENABLE_HIP ${HIP_BACKEND_AVAILABLE} CACHE BOOL "Build with HIP target")
set(ENABLE_SPIR ${SPIR_BACKEND_AVAILABLE} CACHE BOOL "Build with spir64 target")
# Kept as a CMake list and joined with "," only when composing the final
# flags, so the first enabled target does not produce a stray leading comma.
set(SYCL_TARGETS "")

# ------------------------------------------------
# Configure CUDA target
# ------------------------------------------------
if(${ENABLE_CUDA})
  list(APPEND SYCL_TARGETS "nvptx64-nvidia-cuda")
  set(DEFAULT_CUDA_COMPUTE_CAPABILITY "50")
  # The compute capability is a string value such as "80"; cache it as
  # STRING, not BOOL.
  set(CUDA_COMPUTE_CAPABILITY "" CACHE STRING
    "CUDA architecture (compute capability), e.g. 80 for sm_80. Default value is auto-configured using nvidia-smi.")
  # Auto-configure if not specified by user
  if("${CUDA_COMPUTE_CAPABILITY}" STREQUAL "")
    execute_process(
      COMMAND bash -c "which nvidia-smi >/dev/null && nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1 | tr -d '.'"
      OUTPUT_VARIABLE CUDA_COMPUTE_CAPABILITY
      OUTPUT_STRIP_TRAILING_WHITESPACE)
  endif()
  # Warn and fall back to the default if not specified and auto-configuration
  # failed. A plain (non-cache) set() is required here: the cache entry
  # already exists, so `set(... CACHE ...)` without FORCE would not update it.
  if("${CUDA_COMPUTE_CAPABILITY}" STREQUAL "")
    message(WARNING "Failed to autoconfigure CUDA_COMPUTE_CAPABILITY using nvidia-smi. Will default to sm_${DEFAULT_CUDA_COMPUTE_CAPABILITY}")
    set(CUDA_COMPUTE_CAPABILITY ${DEFAULT_CUDA_COMPUTE_CAPABILITY})
  else()
    message(STATUS "Enabled SYCL target CUDA with Compute Capability sm_${CUDA_COMPUTE_CAPABILITY}")
  endif()
endif()

# ------------------------------------------------
# Configure HIP target
# ------------------------------------------------
if(${ENABLE_HIP})
  list(APPEND SYCL_TARGETS "amdgcn-amd-amdhsa")
  set(DEFAULT_HIP_GFX_ARCH "gfx906")
  # The gfx arch is a string value such as "gfx90a"; cache it as STRING.
  set(HIP_GFX_ARCH "" CACHE STRING
    "HIP architecture tag, e.g. gfx90a. Default value is auto-configured using rocminfo.")
  # Auto-configure if not specified by user. (This must test HIP_GFX_ARCH;
  # the previous check of CUDA_COMPUTE_CAPABILITY was a copy/paste bug.)
  if("${HIP_GFX_ARCH}" STREQUAL "")
    execute_process(
      COMMAND bash -c "which rocminfo >/dev/null && rocminfo | grep -o 'gfx[0-9]*' | head -n 1"
      OUTPUT_VARIABLE HIP_GFX_ARCH
      OUTPUT_STRIP_TRAILING_WHITESPACE)
  endif()
  # Warn and fall back to the default if not specified and auto-configuration
  # failed (plain set() — see the CUDA note above about cache entries).
  if("${HIP_GFX_ARCH}" STREQUAL "")
    message(WARNING "Failed to autoconfigure HIP_GFX_ARCH using rocminfo. Will default to ${DEFAULT_HIP_GFX_ARCH}")
    set(HIP_GFX_ARCH ${DEFAULT_HIP_GFX_ARCH})
  else()
    message(STATUS "Enabled SYCL target HIP with gfx arch ${HIP_GFX_ARCH}")
  endif()
endif()

# ------------------------------------------------
# Configure spir64 target
# ------------------------------------------------
if(${ENABLE_SPIR})
  list(APPEND SYCL_TARGETS "spir64")
  message(STATUS "Enabled SYCL target spir64")
endif()

# ------------------------------------------------
# Configure the complete SYCL flags
# ------------------------------------------------
list(JOIN SYCL_TARGETS "," SYCL_TARGETS_JOINED)
set(SYCL_FLAGS -fsycl -fsycl-targets=${SYCL_TARGETS_JOINED})
if(${ENABLE_CUDA})
  list(APPEND SYCL_FLAGS -Xsycl-target-backend=nvptx64-nvidia-cuda --offload-arch=sm_${CUDA_COMPUTE_CAPABILITY})
endif()
if(${ENABLE_HIP})
  list(APPEND SYCL_FLAGS -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=${HIP_GFX_ARCH})
endif()
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
############################################################################
## Copyright © 2025 Codeplay Software
##
## SPDX-License-Identifier: MIT
############################################################################

# 3.13 is the minimum because target_link_options() was added in CMake 3.13.
cmake_minimum_required(VERSION 3.13)
project(SYCL-Graph-Samples)

# Set global flags: require C++17 with no fallback to an older standard.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Configure SYCL
include("${CMAKE_SOURCE_DIR}/../common/cmake/ConfigureSYCL.cmake")

# Output directory for executables
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR} CACHE PATH "" FORCE)

# Find all .cpp files in the src/ directory. CONFIGURE_DEPENDS re-runs the
# glob at build time so newly added samples are picked up without a manual
# re-configure.
file(GLOB SAMPLE_SOURCES CONFIGURE_DEPENDS "${CMAKE_SOURCE_DIR}/src/*.cpp")

# Add one executable per sample source file
foreach(SOURCE_FILE ${SAMPLE_SOURCES})
  # Executable name is the source file name without the extension
  get_filename_component(EXE_NAME ${SOURCE_FILE} NAME_WE)

  # Create executable
  add_executable(${EXE_NAME} ${SOURCE_FILE})

  # Add the SYCL compile and link flags produced by ConfigureSYCL.cmake
  target_compile_options(${EXE_NAME} PUBLIC ${SYCL_FLAGS})
  target_link_options(${EXE_NAME} PUBLIC ${SYCL_FLAGS})
endforeach()
108 changes: 108 additions & 0 deletions DirectProgramming/C++SYCL/SYCL-Graph/diamond-dependency/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# Diamond Dependency Sample

Code example demonstrating the usage of [`sycl_ext_oneapi_graph`](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc) extension with queue recording API.

| Area | Description
|:--- |:---
| What you will learn | How to use the SYCL-Graph extension to optimize kernel execution.
| Time to complete | 15 minutes


## Purpose

This code example shows how to record commands submitted to a SYCL `queue` into a `command_graph` object. Once recording is complete, the graph is finalized, which creates a new `command_graph` object in `graph_state::executable` that is ready for submission. Lastly, the executable graph is submitted as a whole to a queue with the new `ext_oneapi_graph()` function, and it can be replayed as many times as needed.



## Prerequisites
| Optimized for | Description
|:--- |:---
| OS | Linux* Ubuntu* <br>Windows* 10, 11
| Hardware | Intel GPU
| Software | Intel® oneAPI DPC++/C++ Compiler


## Key Implementation Details

Key SYCL* concepts demonstrated in the code sample include using the command graph extension with buffers and accessors. This sample uses the queue recording API.

>**Note**: For comprehensive information about oneAPI programming, see the *[Intel® oneAPI Programming Guide](https://www.intel.com/content/www/us/en/docs/oneapi/programming-guide/current/overview.html)*. (Use search or the table of contents to find relevant information quickly.)


## Set Environment Variables

When working with the command-line interface (CLI), you should configure the oneAPI toolkits using environment variables. Set up your CLI environment by sourcing the `setvars` script every time you open a new terminal window. This practice ensures that your compiler, libraries, and tools are ready for development.

> **Note**: You can use [Modulefiles scripts](https://www.intel.com/content/www/us/en/docs/oneapi/programming-guide/current/use-modulefiles-with-linux.html) to set up your development environment. The modulefiles scripts work with all Linux shells.

> **Note**: If you want only specific components or versions of those components, use a [setvars config file](https://www.intel.com/content/www/us/en/docs/oneapi/programming-guide/current/use-a-config-file-for-setvars-sh-on-linux-or-macos.html) to set up your development environment.


## Build the `Diamond Dependency` Sample for CPU and GPU

> **Note**: If you have not already done so, set up your CLI
> environment by sourcing the `setvars` script in the root of your oneAPI installation.
>
> Linux*:
> - For system wide installations: `. /opt/intel/oneapi/setvars.sh`
> - For private installations: ` . ~/intel/oneapi/setvars.sh`
> - For non-POSIX shells, like csh, use the following command: `bash -c 'source <install-dir>/setvars.sh ; exec csh'`
>
> Windows*:
> - `C:\Program Files (x86)\Intel\oneAPI\setvars.bat`
> - Windows PowerShell*, use the following command: `cmd.exe "/K" '"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" && powershell'`
>
> For more information on configuring environment variables or if you have a Unified Directory Layout, see
*[Use the setvars and oneapi-vars Scripts with Linux*](https://www.intel.com/content/www/us/en/docs/oneapi/programming-guide/current/use-the-setvars-script-with-linux-or-macos.html)* or *[Use the setvars and oneapi-vars Scripts with Windows*](https://www.intel.com/content/www/us/en/docs/oneapi/programming-guide/current/use-the-setvars-script-with-windows.html)*.

### On Linux*

The project uses a standard CMake build configuration system. Ensure the SYCL compiler is used by the configuration either by setting the environment variable `CXX=<compiler>` or passing the configuration flag
`-DCMAKE_CXX_COMPILER=<compiler>` where `<compiler>` is your SYCL compiler's
executable (for example Intel `icpx` or LLVM `clang++`).

1. Change to the sample directory.
2. Build the program.
```
mkdir -p build && cd build
cmake .. -DCMAKE_CXX_COMPILER=<compiler>
cmake --build .
```

The CMake configuration automatically detects the available SYCL backends and
enables the SPIR/CUDA/HIP targets for the device code, including the corresponding
architecture flags. If desired, these auto-configured cmake options may be overridden
with the following ones:

| OPTION | VALUE
|:--- |:---
| ENABLE_SPIR | ON or OFF
| ENABLE_CUDA | ON or OFF
| ENABLE_HIP | ON or OFF
| CUDA_COMPUTE_CAPABILITY | Integer, e.g. `70` meaning capability 7.0 (arch `sm_70`)
| HIP_GFX_ARCH | String, e.g. `gfx1030`

#### Troubleshooting

If an error occurs, you can get more details by running `make` with
the `VERBOSE=1` argument:
```
make VERBOSE=1
```
If you receive an error message, troubleshoot the problem using the **Diagnostics Utility for Intel® oneAPI Toolkits**. The diagnostic utility provides configuration and system checks to help find missing dependencies, permissions errors, and other issues. See the *[Diagnostics Utility for Intel® oneAPI Toolkits User Guide](https://www.intel.com/content/www/us/en/docs/oneapi/user-guide-diagnostic-utility/current/overview.html)* for more information on using the utility.


## Run the `Diamond Dependency` Sample

### On Linux

1. Run the program.
```
./diamondDependency
```

## License

Code samples are licensed under the MIT license. See [License.txt](https://github.com/oneapi-src/oneAPI-samples/blob/master/License.txt) for details.

Third-party program Licenses can be found here: [third-party-programs.txt](https://github.com/oneapi-src/oneAPI-samples/blob/master/third-party-programs.txt).
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
//==============================================================
// Copyright © 2025 Codeplay Software
//
// SPDX-License-Identifier: MIT
// =============================================================

#include "../../common/aspect_queries.hpp"

#include <sycl/sycl.hpp>

namespace sycl_ext = sycl::ext::oneapi::experimental;
using namespace sycl;

// Diamond-dependency sample: records four kernels into a SYCL command_graph
// via the queue recording API, finalizes the graph, then submits it to the
// queue for execution as a single operation.
int main() {
  constexpr size_t Size = 1024; // elements per buffer

  queue Queue{};

  // Exit with an error message if the device lacks graph support.
  ensure_graph_support(Queue.get_device());

  std::vector<int> DataA(Size), DataB(Size), DataC(Size);

  // Lifetime of buffers must exceed the lifetime of graphs they are used in.
  // Write-back on buffer destruction is disabled; the host vectors are not
  // read after the graph executes.
  buffer<int> BufferA{DataA.data(), range<1>{Size}};
  BufferA.set_write_back(false);
  buffer<int> BufferB{DataB.data(), range<1>{Size}};
  BufferB.set_write_back(false);
  buffer<int> BufferC{DataC.data(), range<1>{Size}};
  BufferC.set_write_back(false);

  {
    // New object representing graph of command-groups. The
    // assume_buffer_outlives_graph property is a promise that the buffers
    // above outlive the graph (they do: the graph lives in this inner scope).
    sycl_ext::command_graph Graph(
        Queue.get_context(), Queue.get_device(),
        {sycl_ext::property::graph::assume_buffer_outlives_graph{}});

    // `Queue` will be put in the recording state where commands are recorded to
    // `Graph` rather than submitted for execution immediately.
    Graph.begin_recording(Queue);

    // Record commands to `Graph` with the following diamond topology; edges
    // come from the buffer accessors (A/B/C label the buffer that links each
    // pair of kernels).
    //
    //          increment_kernel
    //            /          \
    //        A->/            \<-A
    //          /              \
    //    add_kernel       subtract_kernel
    //          \              /
    //        B->\            /<-C
    //            \          /
    //          decrement_kernel

    // Root node: A[i]++.
    Queue.submit([&](handler &CGH) {
      auto Pdata = BufferA.get_access<access::mode::read_write>(CGH);
      CGH.parallel_for<class Increment_kernel>(
          range<1>(Size), [=](item<1> Id) { Pdata[Id]++; });
    });

    // B[i] += A[i] — depends on increment_kernel through BufferA.
    Queue.submit([&](handler &CGH) {
      auto Pdata1 = BufferA.get_access<access::mode::read>(CGH);
      auto Pdata2 = BufferB.get_access<access::mode::read_write>(CGH);
      CGH.parallel_for<class Add_kernel>(
          range<1>(Size), [=](item<1> Id) { Pdata2[Id] += Pdata1[Id]; });
    });

    // C[i] -= A[i] — also depends on increment_kernel through BufferA.
    Queue.submit([&](handler &CGH) {
      auto Pdata1 = BufferA.get_access<access::mode::read>(CGH);
      auto Pdata2 = BufferC.get_access<access::mode::read_write>(CGH);
      CGH.parallel_for<class Subtract_kernel>(
          range<1>(Size), [=](item<1> Id) { Pdata2[Id] -= Pdata1[Id]; });
    });

    // B[i]--, C[i]-- — joins both branches through BufferB and BufferC.
    Queue.submit([&](handler &CGH) {
      auto Pdata1 = BufferB.get_access<access::mode::read_write>(CGH);
      auto Pdata2 = BufferC.get_access<access::mode::read_write>(CGH);
      CGH.parallel_for<class Decrement_kernel>(range<1>(Size), [=](item<1> Id) {
        Pdata1[Id]--;
        Pdata2[Id]--;
      });
    });

    // `Queue` will be returned to the executing state where commands are
    // submitted immediately for execution.
    Graph.end_recording();

    // Finalize the modifiable graph to create an executable graph that can be
    // submitted for execution.
    auto Exec_graph = Graph.finalize();

    // Execute graph and wait for it to complete.
    Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(Exec_graph); })
        .wait();
  }

  return 0;
}
Loading