diff --git a/.github/pre-commit/spelling_allowlist.txt b/.github/pre-commit/spelling_allowlist.txt index 2a7d9cc41fc..4d9f8072267 100644 --- a/.github/pre-commit/spelling_allowlist.txt +++ b/.github/pre-commit/spelling_allowlist.txt @@ -10,6 +10,7 @@ AquSim Asynchronous BFGS Bloch +BlueField Braket CLA CLI @@ -23,6 +24,7 @@ CUDA Cartesian CentOS Conda +ConnectX CuPy DCO DGX @@ -32,6 +34,7 @@ Deutsch Devcontainer Doxygen Exponentiating +FPGA FPGAs Fermion Fock @@ -49,6 +52,8 @@ Hadamard Hadamards Hamiltonian Hamiltonians +Hololink +Holoscan Homebrew IQM InfiniBand @@ -75,9 +80,11 @@ Max-Cut Miniconda MyST NGC +NIC NVIDIA NVLink NVQIR +NVQLink OPX OQC ORCA @@ -117,7 +124,9 @@ QuEra QuTiP Quake Quantinuum +RDMA RHEL +RPC RSA RSH SDK @@ -139,6 +148,7 @@ Toshiko UCCSD VQE Vazirani +Verilog WSL Xcode Zener @@ -254,6 +264,7 @@ functors grovers hadamard hamiltonian +handoff heisenberg homogenous iff @@ -294,6 +305,7 @@ namespaces natively normalization nullary +nvcc observables optimizer optimizers @@ -306,6 +318,7 @@ parameterization performant photonic photonics +pipelined precompute precomputed prepend @@ -325,13 +338,16 @@ qudits qumode qumodes reStructuredText +realtime reconfigurable reproducibility +reusability runtime runtimes rvalue scalability scalable +schemas selectable sexualized shifter @@ -365,6 +381,7 @@ superpositions symplectic tablegen templated +testability toolchain toolchains toolset diff --git a/.github/workflows/publishing.yml b/.github/workflows/publishing.yml index 9a16f892dbf..3730f7f505f 100644 --- a/.github/workflows/publishing.yml +++ b/.github/workflows/publishing.yml @@ -645,6 +645,27 @@ jobs: retention-days: 1 if-no-files-found: error + cudaq_realtime_installers: + name: CUDA-Q Realtime installer + if: github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success' + needs: assets + permissions: + contents: read + packages: read + strategy: + matrix: + platform: [amd64, arm64] + cuda_version: ['12.6', '13.0'] + 
fail-fast: false + uses: ./.github/workflows/realtime_prebuilt_binaries.yml + secrets: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} + DOCKERHUB_READONLY_TOKEN: ${{ secrets.DOCKERHUB_READONLY_TOKEN }} + with: + platform: ${{ matrix.platform }} + cuda_version: ${{ matrix.cuda_version }} + environment: ghcr-deployment + cudaq_wheels: name: CUDA-Q Python wheels if: ${{ toJson(fromJson(needs.assets.outputs.python_wheels).info_files) != '[]' }} @@ -1394,7 +1415,7 @@ jobs: create_release: name: CUDA-Q Release - needs: [assets, cudaq_images, cudaq_installers, cudaq_wheels, cudaq_metapackages, macos_artifacts, metapackage_validation_macos] + needs: [assets, cudaq_images, cudaq_installers, cudaq_wheels, cudaq_metapackages, macos_artifacts, metapackage_validation_macos, cudaq_realtime_installers] if: needs.assets.outputs.release_title && inputs.github_commit == '' && inputs.nvidia_mgpu_commit == '' runs-on: ubuntu-latest @@ -1421,6 +1442,12 @@ jobs: name: ${{ needs.cudaq_metapackages.outputs.artifact_name }} path: metapackages + - name: Download CUDA-Q Realtime installer + uses: actions/download-artifact@v4 + with: + pattern: 'install_cuda_quantum_realtime_*' + path: installers + # The python wheels are uploaded as a release asset, but not pushed to anywhere else. # Note that PyPI packages cannot be updated once pushed; # - We could upload wheels to test-pypi when creating a release. 
@@ -1468,7 +1495,7 @@ jobs: clean_up: name: Clean up - needs: [assets, cudaq_images, cudaq_installers, cudaq_wheels, cudaq_metapackages, macos_artifacts, image_validation, installer_validation, metapackage_validation_conda, metapackage_validation_macos, wheel_validation_piponly, create_release] + needs: [assets, cudaq_images, cudaq_installers, cudaq_wheels, cudaq_metapackages, macos_artifacts, cudaq_realtime_installers, image_validation, installer_validation, metapackage_validation_conda, metapackage_validation_macos, wheel_validation_piponly, create_release] # Force this job to run even when some of the dependencies above are skipped. if: always() && !cancelled() && needs.assets.result != 'skipped' && !contains(needs.*.result, 'failure') && !contains(needs.*.result, 'cancelled') runs-on: ubuntu-latest diff --git a/.github/workflows/realtime_ci.yml b/.github/workflows/realtime_ci.yml new file mode 100644 index 00000000000..e1d013b4e18 --- /dev/null +++ b/.github/workflows/realtime_ci.yml @@ -0,0 +1,195 @@ +on: + workflow_dispatch: + inputs: + cache_base: + required: false + type: string + description: 'The name of the branch to use as cache base.' 
+ default: main + push: + branches: + - "pull-request/[0-9]+" + paths: + - realtime/** # Only trigger on changes to the realtime directory + - .github/** # Also trigger on changes to GitHub workflows + merge_group: + types: + - checks_requested + +name: Realtime CI + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + build_installer: + name: Build CUDA Quantum Realtime assets + strategy: + matrix: + platform: [amd64, arm64] + cuda_version: ['12.6', '13.0'] + fail-fast: false + uses: ./.github/workflows/realtime_prebuilt_binaries.yml + secrets: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} + DOCKERHUB_READONLY_TOKEN: ${{ secrets.DOCKERHUB_READONLY_TOKEN }} + with: + platform: ${{ matrix.platform }} + cuda_version: ${{ matrix.cuda_version }} + + test_installer: + name: Test Installer + needs: build_installer + strategy: + matrix: + platform: [amd64, arm64] + cuda_version: ['12.6', '13.0'] + distro: ['ubuntu24.04', 'ubi9'] + fail-fast: false + runs-on: ${{ (contains(matrix.platform, 'arm') && 'linux-arm64-gpu-a100-latest-1') || 'linux-amd64-gpu-a100-latest-1' }} + permissions: + contents: read + packages: read + + container: + # Note: in this test, we need to build a CUDA kernel (to use with CUDAQ Realtime), hence we need the CUDA toolkit, not just the runtime. 
+ image: nvidia/cuda:${{ matrix.cuda_version }}.0-devel-${{ matrix.distro }} + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Download installer + uses: actions/download-artifact@v4 + with: + name: install_cuda_quantum_realtime_cu${{ matrix.cuda_version }}.${{ matrix.platform }} + path: ./installers + + - name: Install CUDA Quantum Realtime using installer + shell: bash + run: | + cd ./installers + # Find the installer file (assuming there's only one in the directory) + installer_file=$(ls install_cuda_quantum_realtime_*) + echo "Found installer: $installer_file" + # Make the installer executable and run it + chmod +x "$installer_file" + ./"$installer_file" --accept + + - name: Test CUDA Quantum Realtime installation + shell: bash + run: | + # Install cmake for building a test example that uses the installed CUDA Quantum Realtime + # Install cmake depending on distro + if [[ "${{ matrix.distro }}" == "ubi9" ]]; then + dnf install -y cmake + else + apt update && apt install -y --no-install-recommends cmake + fi + # Build the test example that uses the installed CUDA Quantum Realtime + cd realtime/examples/gpu_dispatch + mkdir -p build && cd build + cmake .. + make -j$(nproc) + ./dispatch_kernel + + build_realtime: + name: Build + strategy: + matrix: + platform: [amd64, arm64] + cuda_version: ['12.6', '13.0'] + fail-fast: false + runs-on: ${{ (contains(matrix.platform, 'arm') && 'linux-arm64-cpu8') || 'linux-amd64-cpu8' }} + permissions: + contents: read + packages: read + + steps: + + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Build CUDA Quantum Realtime + uses: ./.github/actions/run-in-docker + with: + # Use a base CUDA development image for compilation, i.e., no CUDA-Q dependencies pre-installed. 
+ image: nvidia/cuda:${{ matrix.cuda_version }}.0-devel-ubuntu24.04 + volume: ${{ github.workspace }}:/workspace + shell: bash + run: | + # Note: HSB requires cmake 3.20+ + apt update && apt install -y --no-install-recommends ca-certificates gpg wget + wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null + echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ noble main' | tee /etc/apt/sources.list.d/kitware.list >/dev/null + apt update && apt install -y kitware-archive-keyring + # Build + apt update && apt install -y --no-install-recommends cmake git ninja-build nvcomp + cd /workspace/realtime + bash scripts/install_dev_prerequisites.sh + # Build HSB (GPU RoCE transceiver and hololink_core) + export CUDA_NATIVE_ARCH=${{ (contains(matrix.cuda_version, '12') && '80-real;90') || '80-real;90-real;100f-real;110-real;120-real;100-virtual' }} + cd /workspace/ && git clone -b release-2.6.0-EA https://github.com/nvidia-holoscan/holoscan-sensor-bridge.git && cd holoscan-sensor-bridge + cmake -G Ninja -S /workspace/holoscan-sensor-bridge -B /workspace/holoscan-sensor-bridge/build -DCMAKE_BUILD_TYPE=Release -DHOLOLINK_BUILD_ONLY_NATIVE=OFF -DHOLOLINK_BUILD_PYTHON=OFF -DHOLOLINK_BUILD_TESTS=OFF -DHOLOLINK_BUILD_TOOLS=OFF -DHOLOLINK_BUILD_EXAMPLES=OFF -DHOLOLINK_BUILD_EMULATOR=OFF + cmake --build /workspace/holoscan-sensor-bridge/build --target roce_receiver gpu_roce_transceiver hololink_core + # Build CUDA-Q Realtime + cd /workspace/realtime + mkdir -p build && cd build + cmake .. 
-DCMAKE_BUILD_TYPE=Release -DCUDAQ_REALTIME_BUILD_TESTS=ON -DCUDAQ_REALTIME_ENABLE_HOLOLINK_TOOLS=ON -DHOLOSCAN_SENSOR_BRIDGE_SOURCE_DIR=/workspace/holoscan-sensor-bridge -DHOLOSCAN_SENSOR_BRIDGE_BUILD_DIR=/workspace/holoscan-sensor-bridge/build/ -DCMAKE_INSTALL_PREFIX=/workspace/installer + make -j$(nproc) install + + - name: 'Tar files' + run: cd ${{ github.workspace }}/realtime/build && tar czf ${{ github.workspace }}/build_artifacts.tgz * + + - name: Upload binary artifact + uses: actions/upload-artifact@v4 + with: + name: cuda-quantum-realtime-binaries-${{ matrix.platform }}-cu${{ matrix.cuda_version }} + path: build_artifacts.tgz + retention-days: 1 + if-no-files-found: error + + - name: Upload installer artifact + uses: actions/upload-artifact@v4 + with: + name: cuda-quantum-realtime-installer-${{ matrix.platform }}-cu${{ matrix.cuda_version }} + path: ${{ github.workspace }}/installer + retention-days: 1 + if-no-files-found: error + + test_realtime: + name: Test + needs: build_realtime + strategy: + matrix: + platform: [amd64, arm64] + cuda_version: ['12.6', '13.0'] + fail-fast: false + runs-on: ${{ (contains(matrix.platform, 'arm') && 'linux-arm64-gpu-a100-latest-1') || 'linux-amd64-gpu-a100-latest-1' }} + permissions: + contents: read + packages: read + + container: + # Use a base CUDA runtime image for testing + image: nvidia/cuda:${{ matrix.cuda_version }}.0-runtime-ubuntu24.04 + + steps: + + - name: Download binary artifact + uses: actions/download-artifact@v4 + with: + name: cuda-quantum-realtime-binaries-${{ matrix.platform }}-cu${{ matrix.cuda_version }} + path: ./realtime_build_artifacts + + - name: Extract files + run: mkdir -p /workspace/realtime/build && tar -xvzf ./realtime_build_artifacts/build_artifacts.tgz -C /workspace/realtime/build + + - name: Test CUDA Quantum Realtime + shell: bash + run: | + # Install cmake for ctest + apt update && apt install -y --no-install-recommends cmake + cd /workspace/realtime/build + ctest -V diff --git 
a/.github/workflows/realtime_prebuilt_binaries.yml b/.github/workflows/realtime_prebuilt_binaries.yml new file mode 100644 index 00000000000..27b0abfbe28 --- /dev/null +++ b/.github/workflows/realtime_prebuilt_binaries.yml @@ -0,0 +1,100 @@ +on: + workflow_call: + inputs: + platform: + type: string + required: false + default: linux/amd64 + platform_base_image: + required: false + type: string + default: amd64/almalinux:8 + description: The image to use as a base image when building the installer. + cuda_version: + required: true + type: string + description: The CUDA version used for the build (e.g. 12.3). + environment: + type: string + required: false + secrets: + DOCKERHUB_USERNAME: + required: true + DOCKERHUB_READONLY_TOKEN: + required: true + +name: Pre-built realtime binaries +jobs: + build_installer: + name: Build CUDA Quantum Realtime installers + runs-on: ${{ (contains(inputs.platform, 'arm') && 'linux-arm64-cpu32') || 'linux-amd64-cpu32' }} + permissions: + contents: read + packages: read + outputs: + image_hash: ghcr.io/nvidia/cuda-quantum-realtime-assets-${{ inputs.platform }}-cu${{ inputs.cuda_version }} + artifact_name: cuda-quantum-realtime-assets-${{ inputs.platform }}-cu${{ inputs.cuda_version }} + services: + registry: + image: registry:2 + ports: + - 5000:5000 + steps: + - name: Checkout repository + uses: actions/checkout@v6 + with: + submodules: true + + - name: Log in to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_READONLY_TOKEN }} + + - name: Login to GitHub CR + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ github.token }} + - name: Set up context for buildx + run: | + docker context create builder_context + + - name: Set up buildx runner + uses: docker/setup-buildx-action@v3 + with: + endpoint: builder_context + version: v0.19.0 + buildkitd-config: /etc/buildkit/buildkitd.toml # hard-coded to run 
on our runners + driver-opts: | + network=host + image=moby/buildkit:v0.19.0 + - name: Build assets + id: docker_build + uses: docker/build-push-action@v5 + with: + context: . + file: ./realtime/docker/assets.Dockerfile + build-args: | + base_image=${{ (contains(inputs.platform, 'arm') && 'ghcr.io/nvidia/arm64v8/almalinux:8') || 'ghcr.io/nvidia/amd64/almalinux:8' }} + cuda_version=${{ inputs.cuda_version }} + cuda_native_arg=${{ (contains(inputs.cuda_version, '12') && '80-real;90') || '80-real;90-real;100f-real;110-real;120-real;100-virtual' }} + tags: ghcr.io/nvidia/cuda-quantum-realtime-assets-${{ inputs.platform }}-cu${{ inputs.cuda_version }} + # labels: ${{ steps.metadata.outputs.labels }} + platforms: ${{ inputs.platform }} + # TODO: set up cache + # cache-from: | + # ${{ inputs.build_cache }} + # ${{ steps.config.outputs.additional_build_caches }} + # TODO: push to the cache registry + push: false + # Output the installer + outputs: type=local,dest=/tmp/install + - name: Upload installer + uses: actions/upload-artifact@v4 + with: + name: install_cuda_quantum_realtime_cu${{ inputs.cuda_version }}.${{ inputs.platform }} + path: /tmp/install + retention-days: 1 + if-no-files-found: error diff --git a/realtime/.clang-format b/realtime/.clang-format new file mode 100644 index 00000000000..4c6382a71ef --- /dev/null +++ b/realtime/.clang-format @@ -0,0 +1,12 @@ +BasedOnStyle: LLVM +AlwaysBreakTemplateDeclarations: Yes +IncludeCategories: + - Regex: '^<' + Priority: 4 + - Regex: '^"cudaq/' + Priority: 3 + - Regex: '^"(realtime|\.\.)/' + Priority: 2 + - Regex: '.*' + Priority: 1 +InsertNewlineAtEOF: Yes diff --git a/realtime/.gitignore b/realtime/.gitignore new file mode 100644 index 00000000000..ccec909e9d6 --- /dev/null +++ b/realtime/.gitignore @@ -0,0 +1,99 @@ +# Editor backup files +*~ + +# Patch files +*.orig +*.rej + +# Compiled Object files +*.slo +*.lo +*.o +*.obj +*.x +# Precompiled Headers +*.gch +*.pch + +# Compiled Dynamic libraries +*.so +*.dylib +*.dll 
+ +# Fortran module files +*.mod +*.smod + +# Compiled Static libraries +*.lai +*.la +*.a +*.lib + +**/Output/ +**/.lit*.txt + +# Executables +*.exe +*.out +*.app +**/out/ +/*build*/ +/*Build/ +/plugins/ +/other_library_builds/ +/.cproject +/.project +/.settings/ +**/*.jar +**/.ptp* +*.ab +/dist/ +/*egg*/ +/python/*egg* +/*tmp*/ +/wheelhouse/ +**/.ipynb_checkpoints +compile_commands.json +**/*.dat +**/.antlr +__pycache__/ + +# IDE files +.vscode/* +.theia/* + +# Container files +**/.docker/* + +# LSP files +.cache/* + +# LLVM/MLIR files +*.ll +*.bc + +# Build results +[Bb]in/ +[Oo]bj/ +*.bson +*.csv +*.bin +docs/sphinx/_doxygen +docs/sphinx/_mdgen +**/_build/* +**/_skbuild/* +_version.py + +# third party integrations +simulators/ +apps/ + +# macOS +.DS_Store + +# JetBrains IDE files +.idea + +# vim files +*.tmp diff --git a/realtime/CMakeLists.txt b/realtime/CMakeLists.txt new file mode 100644 index 00000000000..02fc9260afc --- /dev/null +++ b/realtime/CMakeLists.txt @@ -0,0 +1,125 @@ +# ============================================================================ # +# Copyright (c) 2025 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. # +# ============================================================================ # + +cmake_minimum_required(VERSION 3.22 FATAL_ERROR) + +include(FetchContent) + +# Set a default build type if none was specified. Must set this before +# project(). +set(CMAKE_BUILD_TYPE "Release" CACHE STRING + "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel") + +# Set a default install prefix if none was specified. 
+set(CMAKE_INSTALL_PREFIX "$ENV{HOME}/.cudaq_realtime" CACHE STRING + "Install path prefix, prepended onto install directories") + +# Project setup +# ============================================================================== + +# Check if core is built as a standalone project. +project(cudaq-realtime) +set(CUDAQ_REALTIME_STANDALONE_BUILD TRUE) + +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +# The following must go after `project(...)` +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED TRUE) +set(CMAKE_POSITION_INDEPENDENT_CODE TRUE) + +set(CUDAQ_REALTIME_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) +set(CUDAQ_REALTIME_INCLUDE_DIR ${CUDAQ_REALTIME_SOURCE_DIR}/include) + +# Add cmake directory to module path for custom Find modules +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") + +# Options +# ============================================================================== + +option(CUDAQ_REALTIME_BUILD_TESTS + "Generate build targets for the CUDAQ real-time unit tests" ON) +option(CUDAQ_REALTIME_BUILD_EXAMPLES + "Generate build targets for the CUDAQ real-time example programs" ON) +option(CUDAQ_REALTIME_ENABLE_HOLOLINK_TOOLS + "Build Hololink bridge/emulator/playback tools (requires hololink)." + OFF) + +# Check for CUDA Support (ref: cuda-quantum/CMakeLists.txt) +# ============================================================================== +include(CheckLanguage) +check_language(CUDA) +set(CUDA_FOUND FALSE) +# Generate -gencode arch=compute_XX,code=sm_XX for list of supported +# arch values. +# List should be sorted in increasing order. 
+function(CUDA_get_gencode_args out_args_string arch_values) + # allow the user to pass the list like a normal variable + set(arch_list ${arch_values} ${ARGN}) + set(out "") + foreach(arch IN LISTS arch_list) + set(out "${out} -gencode arch=compute_${arch},code=sm_${arch}") + endforeach(arch) + + # Repeat the last one as to ensure the generation of PTX for most + # recent virtual architecture for forward compatibility + list(GET arch_list -1 last_arch) + set(out "${out} -gencode arch=compute_${last_arch},code=compute_${last_arch}") + set(${out_args_string} ${out} PARENT_SCOPE) +endfunction() + +if(CMAKE_CUDA_COMPILER) + if (NOT CUDA_TARGET_ARCHS) + if (CUDAToolkit_VERSION VERSION_LESS 13.0) + # Ampere, Hopper + set(CUDA_TARGET_ARCHS "80;90") + else() + # Ampere, Hopper, Blackwell + set(CUDA_TARGET_ARCHS "80;90;100") + endif() + endif() + CUDA_get_gencode_args(CUDA_gencode_flags ${CUDA_TARGET_ARCHS}) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -shared -std=c++17 ${CUDA_gencode_flags} --compiler-options -fPIC") + + enable_language(CUDA) + set(CUDA_FOUND TRUE) + set(CMAKE_CUDA_STANDARD 17) + set(CMAKE_CUDA_STANDARD_REQUIRED TRUE) + find_package(CUDAToolkit REQUIRED) + message(STATUS "Cuda language found.") +endif() + +# External Dependencies +# ============================================================================== + +find_package(Threads REQUIRED) + +# Enable static linking of the C++ standard library to avoid dependency issues when distributing the library. 
+SET(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -static-libstdc++ -static-libgcc") +SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libstdc++ -static-libgcc") + +add_subdirectory(lib) + +if (CUDAQ_REALTIME_BUILD_EXAMPLES) + message(STATUS "RoCE/DOCA examples removed for RPC dispatch workflow.") +endif() + +if (CUDAQ_REALTIME_BUILD_TESTS) + add_custom_target(CudaqRealtimeUnitTests) + include(CTest) + + add_custom_target(run_tests + COMMAND ${CMAKE_COMMAND} -E env + PYTHONPATH="${CUDAQ_INSTALL_DIR}:${CMAKE_BINARY_DIR}/python" + ${CMAKE_CTEST_COMMAND} --output-on-failure + DEPENDS CudaqRealtimeUnitTests + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + ) + add_subdirectory(unittests) +endif() + diff --git a/realtime/README.md b/realtime/README.md new file mode 100644 index 00000000000..58297e2901a --- /dev/null +++ b/realtime/README.md @@ -0,0 +1,40 @@ +# CUDA-Q Realtime Library + +CUDA-Q Realtime is a library for tightly coupling GPU accelerated compute +to the control system of a quantum processor. + +It fulfills two primary responsibilities: + +1. It provides the low-level basis of realtime coprocessing +between FPGA and CPU-GPU systems. + +2. It provides the low latency networking stack of the NVQLink architecture, +enabling system integrators to achieve few-microsecond +data round trips between FPGA and GPU. + +## Getting Started + +To learn more about how to work with CUDA-Q Realtime, +please take a look at the [user guide](docs/user_guide.md). + +If you would like to install the latest iteration under development in this +repository and/or add your own modifications, take a look at [these +instructions](docs/building.md) about building CUDA-Q Realtime from source. 
+ +## Specifications + +The following specifications for the CUDA-Q Realtime message protocol +and host API are available: + +- [Message protocol](docs/cudaq_realtime_message_protocol.md) + +- [Realtime host API](docs/cudaq_realtime_host_api.md) + +## Feedback + +Please let us know your feedback and ideas for the CUDA-Q platform in the +[Discussions][cuda_quantum_discussions] tab of this repository, or file an +[issue][cuda_quantum_issues]. + +[cuda_quantum_discussions]: https://github.com/NVIDIA/cuda-quantum/discussions +[cuda_quantum_issues]: https://github.com/NVIDIA/cuda-quantum/issues diff --git a/realtime/docker/assets.Dockerfile b/realtime/docker/assets.Dockerfile new file mode 100644 index 00000000000..794a03e79b5 --- /dev/null +++ b/realtime/docker/assets.Dockerfile @@ -0,0 +1,115 @@ +# ============================================================================ # +# Copyright (c) 2026 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. # +# ============================================================================ # + +# This file builds the CUDA Quantum Realtime binaries from scratch such that +# they can be used on a range of Linux systems, provided the requirements documented in +# the data center installation guide are satisfied. +# +# Usage: +# Must be built from the repo root with: +# docker build -t ghcr.io/nvidia/cudaq-realtime-assets:amd64-cu12 -f realtime/docker/assets.Dockerfile . + +# [Operating System] +ARG base_image=amd64/almalinux:8 +FROM ${base_image} AS assets +SHELL ["/bin/bash", "-c"] +ARG cuda_version=13.0 +ENV CUDA_VERSION=${cuda_version} + +# When a dialogue box would be needed during install, assume default configurations. +# Set here to avoid setting it for all install commands. 
+# Given as arg to make sure that this value is only set during build but not in the launched container. +ARG DEBIAN_FRONTEND=noninteractive +RUN dnf install -y --nobest --setopt=install_weak_deps=False \ + 'dnf-command(config-manager)' && \ + dnf config-manager --enable powertools + +ADD scripts/configure_build.sh /cuda-quantum/scripts/configure_build.sh + +# [Prerequisites] +ARG PYTHON=python3.11 +RUN dnf install -y --nobest --setopt=install_weak_deps=False ${PYTHON} + +# [Build Dependencies] +RUN dnf install -y --nobest --setopt=install_weak_deps=False wget git unzip + +## [CUDA] +RUN source /cuda-quantum/scripts/configure_build.sh install-cuda +## [Compiler Toolchain] +RUN source /cuda-quantum/scripts/configure_build.sh install-gcc + +# [>ToolchainConfiguration] +ENV GCC_TOOLCHAIN="/opt/rh/gcc-toolset-11/root/usr/" +ENV CXX="${GCC_TOOLCHAIN}/bin/g++" +ENV CC="${GCC_TOOLCHAIN}/bin/gcc" +ENV CUDACXX=/usr/local/cuda/bin/nvcc +ENV CUDAHOSTCXX="${GCC_TOOLCHAIN}/bin/g++" +# [ **_NOTE:_** The above build instructions and tests only cover +the basic CUDA-Q Realtime library, e.g., the dispatch library +and host API; no networking transport layer is included. + +## Enable Holoscan Sensor Bridge Support + +[Holoscan Sensor Bridge](https://www.nvidia.com/en-us/technologies/holoscan-sensor-bridge/) +(`HSB`) provides a standard API and open-source software that +streams high-speed data directly to GPU memory through FPGA interfaces. + +CUDA-Q Realtime supports `HSB`, enabling users to build applications +for realtime coprocessing between FPGA and GPU systems. + +### Hardware Requirements + +- NVIDIA ConnectX-7/BlueField + +- FPGA + +### Software Requirements + +- `DOCA` version 3.3 with `gpunetio` + +Please refer to [the download page](https://developer.nvidia.com/doca-downloads) +to install `DOCA` for your system. + +> **_NOTE:_** Please make sure `doca-sdk-gpunetio` is installed along with `doca-all`. 
+ +### Build Holoscan Sensor Bridge + +To build CUDA-Q Realtime with `HSB`, first, one needs to compile the `HSB` code. + +After cloning `HSB` from [GitHub](https://github.com/nvidia-holoscan/holoscan-sensor-bridge/tree/release-2.6.0-EA), +build it with + +```bash +# HOLOLINK_DIR is the top-level directory of HSB source code +cmake -G Ninja -S "$HOLOLINK_DIR" -B build \ + -DCMAKE_BUILD_TYPE=Release \ + -DHOLOLINK_BUILD_ONLY_NATIVE=OFF \ + -DHOLOLINK_BUILD_PYTHON=OFF \ + -DHOLOLINK_BUILD_TESTS=OFF \ + -DHOLOLINK_BUILD_TOOLS=OFF \ + -DHOLOLINK_BUILD_EXAMPLES=OFF \ + -DHOLOLINK_BUILD_EMULATOR=OFF + +cmake --build build \ + --target roce_receiver gpu_roce_transceiver hololink_core +``` + +> **_NOTE:_** In order to compile Holoscan Sensor Bridge from source, +we need to install all of its dependencies. +Please refer to `HSB` [documentation](https://docs.nvidia.com/holoscan/sensor-bridge/latest/setup.html) +for more details. + + + +> **_NOTE:_** One can also use the Holoscan Sensor Bridge Docker [container](https://docs.nvidia.com/holoscan/sensor-bridge/latest/build.html) +to build CUDA-Q Realtime. + +### Build CUDA-Q Realtime with `HSB` + +To enable `HSB`, we can configure `cmake` when building CUDA-Q Realtime as follows: + +```bash +cmake -G Ninja -S "$CUDAQ_REALTIME_DIR" -B build \ + -DCMAKE_BUILD_TYPE=Release \ + -DCUDAQ_REALTIME_ENABLE_HOLOLINK_TOOLS=ON \ + -DHOLOSCAN_SENSOR_BRIDGE_SOURCE_DIR="$HOLOLINK_DIR" \ + -DHOLOSCAN_SENSOR_BRIDGE_BUILD_DIR="$HOLOLINK_DIR/build" +cmake --build build +``` + +The `$CUDAQ_REALTIME_DIR` directory is the `realtime` +sub-directory in CUDA-Q source tree. + +### Running the FPGA RPC dispatch test + +To run the end-to-end RPC dispatch testing between FPGA and GPU +using CUDA-Q Realtime and Holoscan Sensor Bridge, + +- Load the `HSB` bit-file into the FPGA. +The bit-file can be obtained from [here](https://github.com/nvidia-holoscan/holoscan-sensor-bridge/tree/release-2.6.0-EA). 
+ +- Run the test script (at `cuda-quantum/realtime/unittests/utils/hololink_test.sh`). +For example, + +```bash +bash hololink_test.sh --page-size 512 --device mlx5_0 --gpu 0 --bridge-ip 192.168.0.101 --fpga-ip 192.168.0.2 --unified +``` + +> **_NOTE:_** +> The command line arguments need to be adjusted based on the system setup: +> +> - `--device` is the `IB` device name that is connected to the FPGA. +> - `--gpu` is the GPU device Id that we want to run the RPC callback on. +> - `--fpga-ip` is the IP address of the `HSB` FPGA. +> - `--bridge-ip` is the IP address of the NIC on the host machine. + +Upon successful completion, the above script should print out the following: + +```text +=== Verification Summary === + ILA samples captured: 100 + tvalid=0 (idle): 0 + RPC responses: 100 + Non-RPC frames: 0 + Unique messages verified: 100 of 100 + Responses matched: 100 + Header errors: 0 + Payload errors: 0 + +=== PTP Round-Trip Latency === + Samples: 100 + Min: 3589 ns + Max: 6348 ns + Avg: 3872.0 ns + CSV written: ptp_latency.csv + RESULT: PASS + +=== Shutting down === +``` + +> **_NOTE:_** In the above test script, we execute a simple RPC dispatch tests, whereby +the FPGA sends data (array of bytes) to the GPU; the GPU performs a simple increment +by one calculation on each of the byte in the incoming array and returns the array. +We then validate the data and measure the round-trip latency then output +the report as shown above. + + + +> **_NOTE:_** One can also execute the whole build and execution using +> the validation script as follows: +> +> ```bash +> bash hololink_test.sh --page-size 512 --device mlx5_0 --gpu 0 --bridge-ip 192.168.0.101 --fpga-ip 192.168.0.2 --unified --build --hololink-dir $HOLOLINK_DIR --cuda-quantum-dir $CUDAQ_DIR +> ``` +> +> `$HOLOLINK_DIR` and `$CUDAQ_DIR` are the top-level source directory of Hololink +> and CUDA-Q accordingly. 
+> Please note that `$CUDAQ_DIR` here is the parent directory
+> that contains the `realtime` sub-directory. diff --git a/realtime/docs/cudaq_realtime_host_api.md b/realtime/docs/cudaq_realtime_host_api.md new file mode 100644 index 00000000000..303131ce606 --- /dev/null +++ b/realtime/docs/cudaq_realtime_host_api.md @@ -0,0 +1,1126 @@ +# CUDA-Q Realtime Host API + +This document explains the C host API for realtime dispatch, the RPC wire +protocol, and complete wiring examples. It is written for external partners +integrating CUDA-QX decoders with their own transport mechanisms. The API and +protocol are **transport-agnostic** and support multiple data transport options, +including NVIDIA Hololink (RDMA via ConnectX NICs), `libibverbs`, and proprietary +transport layers. Handlers can execute on GPU (via CUDA kernels) or CPU (via +host threads). Examples in this document use Hololink's 3-kernel workflow (RX +kernel/dispatch/TX kernel) for illustration, but the same principles apply to +other transport mechanisms. + +## What is Hololink? + +**Hololink** is NVIDIA's low-latency sensor bridge framework that enables +direct GPU memory access from external devices (FPGAs, sensors) over Ethernet +using RDMA (Remote Direct Memory Access) via ConnectX NICs. In the context of +quantum error correction, Hololink is one example of a transport mechanism that +connects the quantum control system (typically an FPGA) to GPU-based decoders.
+ +**Repository**: [`nvidia-holoscan`/`holoscan-sensor-bridge` (`nvqlink` branch)](https://github.com/nvidia-holoscan/holoscan-sensor-bridge/tree/nvqlink) + +Hololink handles: + +- **RX (Receive)**: RX kernel receives data from the FPGA directly +into GPU memory via RDMA +- **TX (Transmit)**: TX kernel sends results back +to the FPGA via RDMA +- **RDMA transport**: Zero-copy data movement using +ConnectX-7 NICs with GPUDirect support + +The CUDA-Q Realtime Host API provides the **middle component** +(dispatch kernel or thread) that sits between +the transport's RX and TX components, executing the actual decoder logic. + +## Transport Mechanisms + +The realtime dispatch API is designed to work with multiple transport mechanisms +that move data between the quantum control system (FPGA) and the decoder. The +transport mechanism handles getting RPC messages into RX ring buffer slots and +sending responses from TX ring buffer slots back to the FPGA. + +### Supported Transport Options + +**Hololink (GPU-based with GPUDirect)**: + +- Uses ConnectX-7 NICs with RDMA for zero-copy data movement +- RX and TX are persistent GPU kernels that directly access GPU memory +- Requires GPUDirect support +- Lowest latency option for GPU-based decoders + +**`libibverbs` (CPU-based)**: + +- Standard InfiniBand Verbs API for RDMA on the CPU +- RX and TX are host threads that poll CPU-accessible memory +- Works with CPU-based dispatchers +- Ring buffers reside in host memory (`cudaHostAlloc` or regular `malloc`) + +**Proprietary Transport Mechanisms**: + +- Custom implementations with or without GPUDirect support +- May use different networking technologies or memory transfer methods +- Must implement the ring buffer + flag protocol defined in this document +- Can target either GPU (with suitable memory access) or CPU execution + +The key requirement is that the transport mechanism implements the ring buffer +slot + flag protocol: writing RPC messages to RX slots and setting `rx_flags`,
+then reading TX slots after `tx_flags` are set. + +## The 3-Kernel Architecture (Hololink Example) {#three-kernel-architecture} + +The Hololink workflow separates concerns into three persistent GPU kernels that +communicate via shared ring buffers: + + +3-kernel architecture + +### Data Flow Summary + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
StepComponentAction
1-2FPGA → ConnectXDetection event data sent over Ethernet, RDMA writes to GPU memory
3RX KernelFrames detection events into RPC message, sets rx_flags[slot] (see Message completion note)
4-5Dispatch KernelPolls for ready slots, looks up handler by function_id, executes decoder
6Dispatch KernelWrites RPCResponse + correction, sets tx_flags[slot]
7-8TX KernelPolls for responses, triggers RDMA send back to FPGA
9ConnectX → FPGACorrection delivered to quantum controller
+ + + +### Why 3 Kernels? + +1. **Separation of concerns**: +Transport (RX/TX kernels) vs. compute (dispatch) are decoupled +2. **Reusability**: +Same dispatch kernel works with any decoder handler +3. **Testability**: +Dispatch kernel can be tested without Hololink hardware +4. **Flexibility**: +RX/TX kernels can be replaced with different transport mechanisms +5. **Transport independence**: +The protocol works with Hololink, `libibverbs`, or proprietary transports + +For use cases where latency is more important than transport independence, see +[Unified Dispatch Mode](#unified-dispatch-mode) which combines all three kernels +into one. + +## Unified Dispatch Mode + +The **unified dispatch mode** (`CUDAQ_KERNEL_UNIFIED`) is an alternative to the +3-kernel architecture that combines RDMA receive, RPC dispatch, and RDMA transmit +into a single GPU kernel. By eliminating the inter-kernel ring-buffer flag +handoff between RX, dispatch, and TX kernels, the unified kernel reduces +round-trip latency for simple (non-cooperative) RPC handlers. + +### Architecture + +In unified mode, a single GPU thread: + +1. Polls the `DOCA` completion queue (`CQ`) for an incoming RDMA message +2. Parses the `RPCHeader` from the receive buffer +3. Looks up and calls the registered handler in-place +4. Writes the `RPCResponse` header (overwriting the request header) +5. Sends the response via `DOCA` `BlueFlame` +6. Re-posts the receive work queue entry (`WQE`) + +The symmetric ring layout means the response overwrites the request in the same +buffer slot. `RPCHeader` fields (`request_id`, `ptp_timestamp`) are saved to +registers before the handler runs. + +### Transport-Agnostic API, Transport-Specific Implementation + +The dispatcher host API remains transport-agnostic. 
Unified mode introduces: + +- `CUDAQ_KERNEL_UNIFIED` -- a new `cudaq_kernel_type_t` enum value +- `cudaq_unified_launch_fn_t` -- a launch function type that receives an opaque + `void* transport_ctx` instead of ring-buffer pointers +- `cudaq_dispatcher_set_unified_launch()` -- wires the launch function and + transport context to the dispatcher + +The transport-specific details (`DOCA` `QP` handles, memory keys, ring buffer +addresses) are packed into an opaque struct (`hololink_doca_transport_ctx` for the +Hololink/`DOCA` implementation) and passed through the `void* transport_ctx` +pointer. A different transport could define its own context struct and launch +function, and the dispatcher would manage it identically. The bridge returns a +`cudaq_unified_dispatch_ctx_t` bundle containing the launch function pointer +and the opaque transport context, keeping the dispatcher API fully transport-agnostic. + +### When to Use Which Mode + +**3-kernel mode** (`CUDAQ_KERNEL_REGULAR` or `CUDAQ_KERNEL_COOPERATIVE`): + +- Transport-agnostic -- works with any transport that implements the ring-buffer + flag protocol + +- Required for cooperative handlers that use `grid.sync()` + +- Best choice when transport independence is a priority + +**Unified mode** (`CUDAQ_KERNEL_UNIFIED`): + +- Lowest latency for regular (non-cooperative) handlers +- Transport-specific kernel implementation (currently `DOCA`/Hololink) +- Single-thread, single-block kernel -- no inter-kernel synchronization overhead +- Not compatible with cooperative handlers or `CUDAQ_DISPATCH_GRAPH_LAUNCH` + +### Host API Extensions + +```cpp +typedef enum { + CUDAQ_KERNEL_REGULAR = 0, + CUDAQ_KERNEL_COOPERATIVE = 1, + CUDAQ_KERNEL_UNIFIED = 2 +} cudaq_kernel_type_t; + +typedef void (*cudaq_unified_launch_fn_t)( + void *transport_ctx, + cudaq_function_entry_t *function_table, size_t func_count, + volatile int *shutdown_flag, uint64_t *stats, + cudaStream_t stream); + +cudaq_status_t cudaq_dispatcher_set_unified_launch( 
+ cudaq_dispatcher_t *dispatcher, + cudaq_unified_launch_fn_t unified_launch_fn, + void *transport_ctx); +``` + +When `kernel_type == CUDAQ_KERNEL_UNIFIED`: + +- `cudaq_dispatcher_set_ringbuffer()` and `cudaq_dispatcher_set_launch_fn()` + are **not required** (the unified kernel handles transport internally) +- `cudaq_dispatcher_set_unified_launch()` **must** be called instead +- `num_slots` and `slot_size` in the configuration may be zero +- All other wiring (`set_function_table`, `set_control`) remains the same + +### Wiring Example (Unified Mode with Hololink) + +```cpp +// Pack DOCA transport handles +hololink_doca_transport_ctx ctx; +ctx.gpu_dev_qp = hololink_get_gpu_dev_qp(transceiver); +ctx.rx_ring_data = hololink_get_rx_ring_data_addr(transceiver); +ctx.rx_ring_stride_sz = hololink_get_page_size(transceiver); +ctx.rx_ring_mkey = htonl(hololink_get_rkey(transceiver)); +ctx.rx_ring_stride_num = hololink_get_num_pages(transceiver); +ctx.frame_size = frame_size; + +// Configure dispatcher for unified mode +cudaq_dispatcher_config_t config{}; +config.device_id = gpu_id; +config.kernel_type = CUDAQ_KERNEL_UNIFIED; +config.dispatch_mode = CUDAQ_DISPATCH_DEVICE_CALL; + +cudaq_dispatcher_create(manager, &config, &dispatcher); +cudaq_dispatcher_set_unified_launch( + dispatcher, &hololink_launch_unified_dispatch, &ctx); +cudaq_dispatcher_set_function_table(dispatcher, &table); +cudaq_dispatcher_set_control(dispatcher, d_shutdown_flag, d_stats); +cudaq_dispatcher_start(dispatcher); +``` + +## What This API Does (In One Paragraph) + +The host API wires a dispatcher (GPU kernel or CPU thread) to shared ring buffers. +The transport mechanism (e.g., Hololink RX/TX kernels, `libibverbs` threads, or +proprietary transport) places incoming RPC messages into RX slots and retrieves +responses from TX slots. +The dispatcher polls RX flags (see Message completion note), looks up a +handler by `function_id`, executes it on the GPU, and writes a response into the +same slot. 
`Hololink`'s RX/TX kernels handle device I/O; the dispatch kernel sits +in the middle and runs the decoder handler. + +## Scope + +- C host API in `cudaq_realtime.h` +- RPC messaging protocol (header + payload + response) + +## Terms and Components + +- **Ring buffer**: +Fixed-size slots holding RPC messages (see Message completion note). +Each slot has an RX flag and a TX flag. +- **RX flag**: +Nonzero means a slot is ready to be processed. +- **TX flag**: +Nonzero means a response is ready to send. +- **Dispatcher**: +Component that processes RPC messages (GPU kernel or CPU thread). +- **Handler**: +Function registered in the function table that processes specific message types. +- **Function table**: +Array of handler function pointers + IDs + schemas. + +## Schema Data Structures + +Each handler registered in the function table includes a schema that describes +its argument and result types. + +### Type Descriptors + +```cpp +// Standardized payload type identifiers +typedef enum { + CUDAQ_TYPE_UINT8 = 0x10, + CUDAQ_TYPE_INT32 = 0x11, + CUDAQ_TYPE_INT64 = 0x12, + CUDAQ_TYPE_FLOAT32 = 0x13, + CUDAQ_TYPE_FLOAT64 = 0x14, + CUDAQ_TYPE_ARRAY_UINT8 = 0x20, + CUDAQ_TYPE_ARRAY_INT32 = 0x21, + CUDAQ_TYPE_ARRAY_FLOAT32 = 0x22, + CUDAQ_TYPE_ARRAY_FLOAT64 = 0x23, + CUDAQ_TYPE_BIT_PACKED = 0x30 // Bit-packed data (LSB-first) +} cudaq_payload_type_t; + +struct cudaq_type_desc_t { + uint8_t type_id; // cudaq_payload_type_t value + uint8_t reserved[3]; + uint32_t size_bytes; // Total size in bytes + uint32_t num_elements; // Interpretation depends on type_id +}; +``` + +The `num_elements` field interpretation: + +- **Scalar types** (`CUDAQ_TYPE_UINT8`, `CUDAQ_TYPE_INT32`, etc.): +unused, set to 1 +- **Array types** (`CUDAQ_TYPE_ARRAY_*`): number of array elements +- **CUDAQ_TYPE_BIT_PACKED**: number of bits (not bytes) + +### Handler Schema + +```cpp +struct cudaq_handler_schema_t { + uint8_t num_args; // Number of input arguments + uint8_t num_results; // Number of return 
values
+  uint16_t reserved;
+
+  cudaq_type_desc_t args[8];     // Argument type descriptors
+  cudaq_type_desc_t results[4];  // Result type descriptors
+};
+```
+
+Limits:
+
+- Maximum 8 arguments per handler
+- Maximum 4 results per handler
+- Total payload size must fit in slot: `slot_size - sizeof(RPCHeader)`
+
+## RPC Messaging Protocol
+
+Each RX ring buffer slot contains an RPC request. The dispatcher writes the
+response to the corresponding TX ring buffer slot.
+
+```text
+RX Slot: | RPCHeader | request payload bytes |
+TX Slot: | RPCResponse | response payload bytes |
+```
+
+Payload encoding details (type system, multi-argument encoding, bit-packing,
+and QEC-specific examples) are defined in [cudaq_realtime_message_protocol.md](cudaq_realtime_message_protocol.md).
+
+Magic values (little-endian 32-bit):
+
+- `RPC_MAGIC_REQUEST = 0x43555152` (`'CUQR'`)
+- `RPC_MAGIC_RESPONSE = 0x43555153` (`'CUQS'`)
+
+```cpp
+// Wire format (byte layout must match dispatch_kernel_launch.h)
+struct RPCHeader {
+  uint32_t magic;         // RPC_MAGIC_REQUEST
+  uint32_t function_id;   // fnv1a_hash("handler_name")
+  uint32_t arg_len;       // payload bytes following this header
+  uint32_t request_id;    // caller-assigned ID, echoed in the response
+  uint64_t ptp_timestamp; // PTP send timestamp (set by sender; 0 if unused)
+};
+
+struct RPCResponse {
+  uint32_t magic;         // RPC_MAGIC_RESPONSE
+  int32_t status;         // 0 = success
+  uint32_t result_len;    // bytes of response payload
+  uint32_t request_id;    // echoed from RPCHeader::request_id
+  uint64_t ptp_timestamp; // echoed from RPCHeader::ptp_timestamp
+};
+```
+
+Both structs are 24 bytes, packed with no padding. See `cudaq_realtime_message_protocol.md`
+for `request_id` and `ptp_timestamp` semantics.
+
+Payload conventions:
+
+- **Request payload**:
+argument data as specified by handler schema.
+- **Response payload**:
+result data as specified by handler schema.
+- **Size limit**:
+payload must fit in one slot. 
`max_payload_bytes = slot_size - sizeof(RPCHeader)`. +- **Multi-argument encoding**: +arguments concatenated in schema order (see message protocol doc). + +## Host API Overview + +Header: `realtime/include/cudaq/realtime/daemon/dispatcher/cudaq_realtime.h` + +## Manager and Dispatcher Topology + +The manager is a lightweight owner for one or more dispatchers. Each dispatcher +is configured independently (e.g., `vp_id`, `kernel_type`, `dispatch_mode`) and +can target different workloads. + + +Manager and dispatcher topology + +## Host API Functions + + +Function usage: + +cudaq_dispatch_manager_create creates +the top-level manager that owns dispatchers. + +Parameters: + +- `out_mgr`: receives the created manager handle. + +Call this once near program startup and keep the manager alive for the +lifetime of the dispatch subsystem. + +cudaq_dispatch_manager_destroy +releases the manager and any internal resources. + +Parameters: + +- `mgr`: manager handle to destroy. + +Call this after all dispatchers have been destroyed and the program is +shutting down. + +cudaq_dispatcher_create allocates a +dispatcher instance and validates the configuration. + +Parameters: + +- `mgr`: owning manager. +- `config`: filled `cudaq_dispatcher_config_t` with: + - `device_id` (default 0): selects the CUDA device for the dispatcher + - `num_blocks` (default 1) + - `threads_per_block` (default 32) + - `num_slots` (required) + - `slot_size` (required) + - `vp_id` (default 0): tags a dispatcher to a transport channel. + Queue pair selection and NIC port/IP binding are configured + in Hololink, not in this API. 
+ - `kernel_type` (default `CUDAQ_KERNEL_REGULAR`) + - `CUDAQ_KERNEL_REGULAR`: standard kernel launch + - `CUDAQ_KERNEL_COOPERATIVE`: cooperative launch (`grid.sync()` capable) + - `CUDAQ_KERNEL_UNIFIED`: single-kernel dispatch with integrated transport + (see [Unified Dispatch Mode](#unified-dispatch-mode)) + - `dispatch_mode` (default `CUDAQ_DISPATCH_DEVICE_CALL`) + - `CUDAQ_DISPATCH_DEVICE_CALL`: direct `__device__` handler call (lowest latency) + - `CUDAQ_DISPATCH_GRAPH_LAUNCH`: CUDA graph launch from device code + (requires `sm_90+`, Hopper or later GPUs) +- `out_dispatcher`: receives the created dispatcher handle. + +Call this before wiring ring buffers, function tables, or control state. + +cudaq_dispatcher_destroy releases a dispatcher +after it has been stopped. + +Parameters: + +- `dispatcher`: dispatcher handle to destroy. + +Call this when the dispatcher is no longer needed. + +cudaq_dispatcher_set_ringbuffer provides +the RX/TX flag and data pointers the dispatch kernel will poll +and use for request/response slots. + +Parameters: + +- `dispatcher`: dispatcher handle. +- `ringbuffer`: `cudaq_ringbuffer_t` with: + - `rx_flags`: device-visible pointer to RX flags. + - `tx_flags`: device-visible pointer to TX flags. + - `rx_data`: device-visible pointer to RX slot data (request payloads). + - `tx_data`: device-visible pointer to TX slot data (response payloads). + - `rx_stride_sz`: size in bytes of each RX slot. + - `tx_stride_sz`: size in bytes of each TX slot. + +Call this before `cudaq_dispatcher_start`, after allocating mapped host memory +or device memory for the ring buffers. + +cudaq_dispatcher_set_function_table supplies +the function table containing handler pointers, IDs, and schemas. + +Parameters: + +- `dispatcher`: dispatcher handle. +- `table`: `cudaq_function_table_t` with: + - `entries`: device pointer to array of `cudaq_function_entry_t`. + - `count`: number of entries in the table. 
+ +```cpp +// Unified function table entry with schema +struct cudaq_function_entry_t { + union { + void* device_fn_ptr; // for CUDAQ_DISPATCH_DEVICE_CALL + cudaGraphExec_t graph_exec; // for CUDAQ_DISPATCH_GRAPH_LAUNCH + } handler; + + uint32_t function_id; + uint8_t dispatch_mode; // Per-handler dispatch mode + uint8_t reserved[3]; + + cudaq_handler_schema_t schema; // Handler interface schema +}; + +struct cudaq_function_table_t { + cudaq_function_entry_t* entries; // Device pointer to entry array + uint32_t count; // Number of entries +}; +``` + +Call this after initializing the device-side function table entries. +Each entry contains a handler pointer (or graph), function_id, dispatch mode, +and schema describing the handler's interface. + +Function ID semantics: + +- `function_id` is the 32-bit **`FNV-1a` hash** of the handler name string. +- The handler name is the string you hash when populating entries; +there is no separate runtime registration call. +- If no entry matches, the dispatcher clears the slot without a response. +- Suggested: use stable, human-readable handler names (e.g., `"mock_decode"`). + +cudaq_dispatcher_set_control supplies +the shutdown flag and stats buffer the dispatch kernel uses +for termination and bookkeeping. + +Parameters: + +- `dispatcher`: dispatcher handle. +- `shutdown_flag`: device-visible flag used to signal shutdown. +- `stats`: device-visible stats buffer. + +Call this before starting the dispatcher; both buffers must remain valid for +the dispatcher’s lifetime. + +cudaq_dispatcher_set_launch_fn provides +the host-side launch wrapper that invokes the dispatch kernel +with the correct grid/block dimensions. + +Parameters: + +- `dispatcher`: dispatcher handle. +- `launch_fn`: host launch function pointer. + +Call this once during setup. 
Typically you pass one of the provided launch functions: + +- `cudaq_launch_dispatch_kernel_regular` - for `CUDAQ_KERNEL_REGULAR` mode +- `cudaq_launch_dispatch_kernel_cooperative` - for `CUDAQ_KERNEL_COOPERATIVE` mode + +cudaq_dispatcher_start launches +the persistent dispatch kernel and begins processing slots. + +Parameters: + +- `dispatcher`: dispatcher handle. + +Call this only after ring buffers, function table, control buffers, and launch +function are set. + +cudaq_dispatcher_stop signals +the dispatch kernel to exit and waits for it to shut down. + +Parameters: + +- `dispatcher`: dispatcher handle. + +Call this during tear-down before destroying the dispatcher. + +cudaq_dispatcher_get_processed reads +the processed‑packet counter from the stats buffer +to support debugging or throughput tracking. + +Parameters: + +- `dispatcher`: dispatcher handle. +- `out_packets`: receives the processed packet count. + +### Occupancy Query and Eager Module Loading + +Before calling `cudaq_dispatcher_start`, call the appropriate occupancy query +to force eager loading of the dispatch kernel module. This avoids lazy-load +deadlocks when the dispatch kernel and transport kernels (e.g., Hololink RX/TX) +run as persistent kernels. + +cudaq_dispatch_kernel_query_occupancy returns the +maximum number of active blocks per multiprocessor for the **regular** dispatch +kernel. + +Parameters: + +- `out_blocks`: receives the max blocks per SM (or 0 on error). +- `threads_per_block`: block size used for the occupancy calculation. + +Returns `cudaSuccess` on success. Call this when `kernel_type` is +`CUDAQ_KERNEL_REGULAR`. + +cudaq_dispatch_kernel_cooperative_query_occupancy +returns the maximum number of active blocks per multiprocessor for the +**cooperative** dispatch kernel. + +Parameters: + +- `out_blocks`: receives the max blocks per SM (or 0 on error). +- `threads_per_block`: block size used for the occupancy calculation +(e.g., 128 for cooperative decoders). 
+
+Returns `cudaSuccess` on success. Call this when `kernel_type` is
+`CUDAQ_KERNEL_COOPERATIVE`. Use the same `threads_per_block` value that will
+be passed to the dispatcher configuration and launch function.
+
+Call the occupancy function that matches the dispatcher's `kernel_type` once
+before `cudaq_dispatcher_start`; the result can be used to size the dispatch
+grid (e.g., to reserve `SM`s for transport kernels).
+
+Lifetime/ownership:
+
+- All resources are assumed to live for the program lifetime.
+- The API does not take ownership of host-allocated memory.
+
+Threading:
+
+- Single-threaded host usage; create/wire/start/stop from one thread.
+
+Error handling:
+
+- All calls return `cudaq_status_t`.
+- `CUDAQ_ERR_INVALID_ARG` for missing pointers or invalid configuration.
+- `CUDAQ_ERR_CUDA` for CUDA API failures during start/stop.
+
+### Graph-Based Dispatch Functions
+
+The following functions are only available when using
+`CUDAQ_DISPATCH_GRAPH_LAUNCH` mode with `sm_90+` GPUs:
+
+cudaq_create_dispatch_graph_regular creates
+a graph-based dispatch context that enables device-side graph launching. 
+ +Parameters: + +- `rx_flags`: device-visible pointer to RX ring buffer flags +- `tx_flags`: device-visible pointer to TX ring buffer flags +- `rx_data`: device-visible pointer to RX slot data (request payloads) +- `tx_data`: device-visible pointer to TX slot data (response payloads) +- `rx_stride_sz`: size in bytes of each RX slot +- `tx_stride_sz`: size in bytes of each TX slot +- `function_table`: device pointer to function table entries +- `func_count`: number of function table entries +- `graph_io_ctx`: device pointer to a `GraphIOContext` struct for graph buffer communication +- `shutdown_flag`: device-visible shutdown flag +- `stats`: device-visible stats buffer +- `num_slots`: number of ring buffer slots +- `num_blocks`: grid size for dispatch kernel +- `threads_per_block`: block size for dispatch kernel +- `stream`: CUDA stream for graph operations +- `out_context`: receives the created graph context handle + +Returns `cudaSuccess` on success, or CUDA error code on failure. + +This function creates a graph containing the dispatch kernel, instantiates it +with `cudaGraphInstantiateFlagDeviceLaunch`, and uploads it to the device. +The resulting graph context enables device-side `cudaGraphLaunch()` calls +from within handlers. + +cudaq_launch_dispatch_graph launches +the dispatch graph to begin processing RPC messages. + +Parameters: + +- `context`: graph context handle from `cudaq_create_dispatch_graph_regular` +- `stream`: CUDA stream for graph launch + +Returns `cudaSuccess` on success, or CUDA error code on failure. + +Call this to start the persistent dispatch kernel. The kernel will continue +running until the shutdown flag is set. + +cudaq_destroy_dispatch_graph destroys +the graph context and releases all associated resources. + +Parameters: + +- `context`: graph context handle to destroy + +Returns `cudaSuccess` on success, or CUDA error code on failure. 
+ +Call this after the dispatch kernel has exited (shutdown flag was set) +to clean up graph resources. + +### Kernel Launch Helper Functions + +The following helper functions are provided for use with `cudaq_dispatcher_set_launch_fn()`: + +cudaq_launch_dispatch_kernel_regular launches +the dispatch kernel in regular (non-cooperative) mode. + +Parameters: + +- `rx_flags`: device-visible pointer to RX ring buffer flags +- `tx_flags`: device-visible pointer to TX ring buffer flags +- `rx_data`: device-visible pointer to RX slot data (request payloads) +- `tx_data`: device-visible pointer to TX slot data (response payloads) +- `rx_stride_sz`: size in bytes of each RX slot +- `tx_stride_sz`: size in bytes of each TX slot +- `function_table`: device pointer to function table entries +- `func_count`: number of function table entries +- `shutdown_flag`: device-visible shutdown flag +- `stats`: device-visible stats buffer +- `num_slots`: number of ring buffer slots +- `num_blocks`: grid size for dispatch kernel +- `threads_per_block`: block size for dispatch kernel +- `stream`: CUDA stream for kernel launch + +Use this when `kernel_type` is set to `CUDAQ_KERNEL_REGULAR` in the dispatcher configuration. + +cudaq_launch_dispatch_kernel_cooperative +launches the dispatch kernel in cooperative mode. + +Parameters: Same as `cudaq_launch_dispatch_kernel_regular`. + +Use this when `kernel_type` is set to `CUDAQ_KERNEL_COOPERATIVE` +in the dispatcher configuration. +This enables the dispatch kernel and handlers to use grid-wide synchronization +via `cooperative_groups::this_grid().sync()`. + +## Memory Layout and Ring Buffer Wiring + +Each slot is a fixed-size byte region: + +```text +| RPCHeader | payload bytes (arg_len) | unused padding (slot_size - header - payload) | +``` + +Unused padding is the remaining bytes in the fixed-size slot after the header +and payload. 
+ +Flags (both are `uint64_t` arrays of slot flags): + +- `rx_flags[slot]` is set by the producer to a non-zero value when a slot is ready. +- `tx_flags[slot]` is set by the dispatch kernel to a non-zero value +when the response is ready. + +Message completion note: +An RPC message may be delivered as multiple RDMA writes into a single slot. +Completion is signaled only after the final write (often an RDMA write with +immediate) sets `rx_flags[slot]` to a non-zero value. The dispatch kernel treats +the slot as complete only after the flag is set. + +In the NIC-free path, flags and data are allocated with +`cudaHostAllocMapped` so the device and host see the same memory. + +## Step-by-Step: Wiring the Host API (Minimal) + +```cpp +// Host API wiring +ASSERT_EQ(cudaq_dispatch_manager_create(&manager_), CUDAQ_OK); +cudaq_dispatcher_config_t config{}; +config.device_id = 0; +config.num_blocks = 1; +config.threads_per_block = 32; +config.num_slots = static_cast(num_slots_); +config.slot_size = static_cast(slot_size_); +config.vp_id = 0; +config.kernel_type = CUDAQ_KERNEL_REGULAR; +config.dispatch_mode = CUDAQ_DISPATCH_DEVICE_CALL; + +ASSERT_EQ(cudaq_dispatcher_create(manager_, &config, &dispatcher_), CUDAQ_OK); + +cudaq_ringbuffer_t ringbuffer{}; +ringbuffer.rx_flags = rx_flags_; +ringbuffer.tx_flags = tx_flags_; +ringbuffer.rx_data = rx_data_; +ringbuffer.tx_data = tx_data_; +ringbuffer.rx_stride_sz = slot_size_; +ringbuffer.tx_stride_sz = slot_size_; +ASSERT_EQ(cudaq_dispatcher_set_ringbuffer(dispatcher_, &ringbuffer), CUDAQ_OK); + +// Allocate and initialize function table entries +cudaq_function_entry_t* d_entries; +cudaMalloc(&d_entries, func_count_ * sizeof(cudaq_function_entry_t)); + +// Initialize entries on device (including schemas) +init_function_table<<<1, 1>>>(d_entries); +cudaDeviceSynchronize(); + +cudaq_function_table_t table{}; +table.entries = d_entries; +table.count = func_count_; +ASSERT_EQ(cudaq_dispatcher_set_function_table(dispatcher_, &table), 
CUDAQ_OK); + +ASSERT_EQ(cudaq_dispatcher_set_control(dispatcher_, d_shutdown_flag_, d_stats_), + CUDAQ_OK); + +ASSERT_EQ(cudaq_dispatcher_set_launch_fn( + dispatcher_, + &cudaq::qec::realtime::mock_decode_launch_dispatch_kernel), + CUDAQ_OK); + +ASSERT_EQ(cudaq_dispatcher_start(dispatcher_), CUDAQ_OK); +``` + +## Device Handler and Function ID + +```cpp +// The dispatcher uses function_id to find the handler +constexpr std::uint32_t MOCK_DECODE_FUNCTION_ID = + cudaq::realtime::fnv1a_hash("mock_decode"); + +/// @brief Initialize the device function table with schema +__global__ void init_function_table(cudaq_function_entry_t* entries) { + if (threadIdx.x == 0 && blockIdx.x == 0) { + // Entry 0: Mock decoder + entries[0].handler.device_fn_ptr = + reinterpret_cast(&cudaq::qec::realtime::mock_decode_rpc); + entries[0].function_id = MOCK_DECODE_FUNCTION_ID; + entries[0].dispatch_mode = CUDAQ_DISPATCH_DEVICE_CALL; + + // Schema: 1 arg (bit-packed detection events), 1 result (correction byte) + entries[0].schema.num_args = 1; + entries[0].schema.args[0] = {CUDAQ_TYPE_BIT_PACKED, {0}, 16, 128}; // 128 bits + entries[0].schema.num_results = 1; + entries[0].schema.results[0] = {CUDAQ_TYPE_UINT8, {0}, 1, 1}; + } +} +``` + +### Multi-Argument Handler Example + +```cpp +constexpr std::uint32_t ADVANCED_DECODE_FUNCTION_ID = + cudaq::realtime::fnv1a_hash("advanced_decode"); + +__global__ void init_advanced_handler(cudaq_function_entry_t* entries, + uint32_t index) { + if (threadIdx.x == 0 && blockIdx.x == 0) { + entries[index].handler.device_fn_ptr = + reinterpret_cast(&advanced_decode_rpc); + entries[index].function_id = ADVANCED_DECODE_FUNCTION_ID; + entries[index].dispatch_mode = CUDAQ_DISPATCH_DEVICE_CALL; + + // Schema: 2 args (detection events + calibration), 1 result + entries[index].schema.num_args = 2; + entries[index].schema.args[0] = {CUDAQ_TYPE_BIT_PACKED, {0}, 16, 128}; + entries[index].schema.args[1] = {CUDAQ_TYPE_ARRAY_FLOAT32, {0}, 64, 16}; // 16 floats + 
entries[index].schema.num_results = 1; + entries[index].schema.results[0] = {CUDAQ_TYPE_UINT8, {0}, 1, 1}; + } +} +``` + +## CUDA Graph Dispatch Mode + +The `CUDAQ_DISPATCH_GRAPH_LAUNCH` mode enables handlers to be executed as +pre-captured CUDA graphs launched from device code. +This is useful for complex multi-kernel workflows that benefit +from graph optimization and can reduce kernel launch overhead for sophisticated decoders. + +### Requirements + +- **GPU Architecture**: Compute capability 9.0 or higher (Hopper H100 or later) +- **CUDA Version**: CUDA 12.0+ with device-side graph launch support +- **Graph Setup**: Handler graphs must be captured and instantiated with `cudaGraphInstantiateFlagDeviceLaunch` + +### Graph-Based Dispatch API + +The API provides functions to properly wrap the dispatch kernel +in a graph context that enables device-side `cudaGraphLaunch()`: + +```cpp +// Opaque handle for graph-based dispatch context +typedef struct cudaq_dispatch_graph_context cudaq_dispatch_graph_context; + +// Create a graph-based dispatch context +cudaError_t cudaq_create_dispatch_graph_regular( + volatile uint64_t *rx_flags, volatile uint64_t *tx_flags, + uint8_t *rx_data, uint8_t *tx_data, + size_t rx_stride_sz, size_t tx_stride_sz, + cudaq_function_entry_t *function_table, size_t func_count, + void *graph_io_ctx, volatile int *shutdown_flag, uint64_t *stats, + size_t num_slots, uint32_t num_blocks, uint32_t threads_per_block, + cudaStream_t stream, cudaq_dispatch_graph_context **out_context); + +// Launch the dispatch graph +cudaError_t cudaq_launch_dispatch_graph(cudaq_dispatch_graph_context *context, + cudaStream_t stream); + +// Destroy the dispatch graph context +cudaError_t cudaq_destroy_dispatch_graph(cudaq_dispatch_graph_context *context); +``` + +### Graph Handler Setup Example + +```cpp +/// @brief Initialize function table with CUDA graph handler +__global__ void init_function_table_graph(cudaq_function_entry_t* entries) { + if (threadIdx.x == 0 && 
blockIdx.x == 0) {
+    entries[0].handler.graph_exec = /* pre-captured cudaGraphExec_t */;
+    entries[0].function_id = DECODE_FUNCTION_ID;
+    entries[0].dispatch_mode = CUDAQ_DISPATCH_GRAPH_LAUNCH;
+
+    // Schema: same as device call mode
+    entries[0].schema.num_args = 1;
+    entries[0].schema.args[0] = {CUDAQ_TYPE_BIT_PACKED, {0}, 16, 128};
+    entries[0].schema.num_results = 1;
+    entries[0].schema.results[0] = {CUDAQ_TYPE_UINT8, {0}, 1, 1};
+  }
+}
+```
+
+### Graph Capture and Instantiation
+
+Handler graphs must be captured and instantiated with the device launch flag:
+
+```cpp
+cudaStream_t capture_stream;
+cudaStreamCreate(&capture_stream);
+
+// Capture the decoder kernel(s) into a graph
+cudaGraph_t graph;
+cudaStreamBeginCapture(capture_stream, cudaStreamCaptureModeGlobal);
+decode_kernel<<<grid_dim, block_dim, 0, capture_stream>>>(args...);
+cudaStreamEndCapture(capture_stream, &graph);
+
+// Instantiate with device launch flag (required for device-side cudaGraphLaunch)
+cudaGraphExec_t graph_exec;
+cudaGraphInstantiateWithFlags(&graph_exec, graph,
+                              cudaGraphInstantiateFlagDeviceLaunch);
+
+// Upload graph to device
+cudaGraphUpload(graph_exec, capture_stream);
+cudaStreamSynchronize(capture_stream);
+cudaStreamDestroy(capture_stream);
+```
+
+### When to Use Graph Dispatch
+
+Use `CUDAQ_DISPATCH_GRAPH_LAUNCH` mode with the graph-based dispatch API
+when handlers need to launch CUDA graphs from device code.
+The graph-based dispatch API (`cudaq_create_dispatch_graph_regular()` + `cudaq_launch_dispatch_graph()`)
+wraps the dispatch kernel in a graph execution context, enabling device-side `cudaGraphLaunch()`
+calls from within handlers. 
+
+### Graph vs Device Call Dispatch
+
+**Device Call Mode** (`CUDAQ_DISPATCH_DEVICE_CALL`):
+
+- Lowest latency for simple handlers
+- Direct `__device__` function call from dispatcher
+- Suitable for lightweight decoders and data transformations
+- No special hardware requirements
+
+**Graph Launch Mode** (`CUDAQ_DISPATCH_GRAPH_LAUNCH`):
+
+- Enables complex multi-kernel workflows
+- Benefits from CUDA graph optimizations
+- Requires `sm_90+` hardware (Hopper or later)
+- Higher setup overhead but can reduce per-invocation latency for complex pipelines
+
+## Building and Sending an RPC Message
+
+Adapted from `test_realtime_decoding.cu` (the actual test uses a library helper,
+`setup_mock_decode_function_table`, that performs equivalent setup via
+`cudaMemcpy`):
+
+Note: this host-side snippet emulates what the external device/FPGA would do
+when populating RX slots in a Hololink deployment.
+
+```cpp
+/// @brief Write detection events to RX buffer in RPC format.
+void write_rpc_request(std::size_t slot, const std::vector<uint8_t>& measurements) {
+  uint8_t* slot_data = const_cast<uint8_t*>(rx_data_host_) + slot * slot_size_;
+
+  // Write RPCHeader
+  cudaq::realtime::RPCHeader* header =
+      reinterpret_cast<cudaq::realtime::RPCHeader*>(slot_data);
+  header->magic = cudaq::realtime::RPC_MAGIC_REQUEST;
+  header->function_id = MOCK_DECODE_FUNCTION_ID;
+  header->arg_len = static_cast<uint32_t>(measurements.size());
+  header->request_id = static_cast<uint32_t>(slot);
+  header->ptp_timestamp = 0;  // Set by FPGA in production; 0 for NIC-free tests
+
+  // Write measurement data after header
+  memcpy(slot_data + sizeof(cudaq::realtime::RPCHeader),
+         measurements.data(), measurements.size());
+}
+```
+
+## Reading the Response
+
+Note: this host-side snippet emulates what the external device/FPGA would do
+when consuming TX slots in a Hololink deployment.
+
+```cpp
+/// @brief Read response from TX buffer.
+/// Responses are written by the dispatch kernel to the TX ring buffer; read from tx_data, not rx_data. 
+bool read_rpc_response(std::size_t slot, uint8_t& correction,
+                       std::int32_t* status_out = nullptr,
+                       std::uint32_t* result_len_out = nullptr,
+                       std::uint32_t* request_id_out = nullptr,
+                       std::uint64_t* ptp_timestamp_out = nullptr) {
+  __sync_synchronize();
+  const uint8_t* slot_data =
+      const_cast<const uint8_t*>(tx_data_host_) + slot * slot_size_;
+
+  // Read RPCResponse
+  const cudaq::realtime::RPCResponse* response =
+      reinterpret_cast<const cudaq::realtime::RPCResponse*>(slot_data);
+
+  if (response->magic != cudaq::realtime::RPC_MAGIC_RESPONSE) {
+    return false;
+  }
+
+  if (status_out)
+    *status_out = response->status;
+  if (result_len_out)
+    *result_len_out = response->result_len;
+  if (request_id_out)
+    *request_id_out = response->request_id;
+  if (ptp_timestamp_out)
+    *ptp_timestamp_out = response->ptp_timestamp;
+
+  if (response->status != 0) {
+    return false;
+  }
+
+  // Read correction data after response header
+  correction = *(slot_data + sizeof(cudaq::realtime::RPCResponse));
+  return true;
+}
+```
+
+## Schema-Driven Argument Parsing
+
+The dispatcher uses the handler schema to interpret the `typeless` payload bytes.
+This example shows conceptual parsing logic:
+
+```cpp
+__device__ void parse_args_from_payload(
+    const uint8_t* payload,
+    const cudaq_handler_schema_t& schema,
+    void** arg_ptrs) {
+
+  uint32_t offset = 0;
+
+  for (uint8_t i = 0; i < schema.num_args; i++) {
+    arg_ptrs[i] = const_cast<uint8_t*>(payload + offset);
+    offset += schema.args[i].size_bytes;
+  }
+}
+
+__device__ void dispatch_with_schema(
+    uint8_t* slot_data,
+    const cudaq_function_entry_t& entry) {
+
+  RPCHeader* hdr = reinterpret_cast<RPCHeader*>(slot_data);
+  uint8_t* payload = slot_data + sizeof(RPCHeader);
+
+  // Parse arguments using schema
+  void* arg_ptrs[8];
+  parse_args_from_payload(payload, entry.schema, arg_ptrs);
+
+  // Call handler with parsed arguments
+  if (entry.dispatch_mode == CUDAQ_DISPATCH_DEVICE_CALL) {
+    auto handler = reinterpret_cast<void (*)(void**, uint8_t, void*)>(
+        entry.handler.device_fn_ptr);
+    handler(arg_ptrs, entry.schema.num_args, /* result buffer */ nullptr);
+  }
+  // ... graph launch path uses same parsed args
+}
+```
+
+For multi-argument payloads, arguments are **concatenated in schema order**:
+
+```text
+| RPCHeader | arg0_bytes | arg1_bytes | arg2_bytes | ... |
+            ^            ^            ^
+            offset=0     offset=16    offset=80
+```
+
+The schema specifies the size of each argument, allowing the dispatcher to
+compute offsets.
+
+## Hololink 3-Kernel Workflow (Primary)
+
+See the [3-Kernel Architecture](#three-kernel-architecture) diagram above for
+the complete data flow. The key integration points are:
+
+**Ring buffer handoff (RX → Dispatch)**:
+
+```cpp
+// Hololink RX kernel sets this after writing detection event data
+rx_flags[slot] = device_ptr_to_slot_data;
+```
+
+**Ring buffer handoff (Dispatch → TX)**:
+
+```cpp
+// Dispatch kernel sets this after writing RPCResponse
+tx_flags[slot] = device_ptr_to_slot_data;
+```
+
+**Latency path**: The critical path is:
+
+1.
RDMA write completes → RX kernel signals → Dispatch polls and processes → +TX kernel polls and sends → RDMA read completes + +All three kernels are **persistent** (launched once, run indefinitely), so +there is no kernel launch overhead in the hot path. + +## NIC-Free Testing (No Hololink / No ConnectX-7) + +Emulate RX/TX with mapped host memory: + +- `cuda-quantum` host API test: + - `realtime/unittests/test_dispatch_kernel.cu` + +## Troubleshooting + +- **Timeout waiting for TX**: ensure the RX flag points to device-mapped memory. +- **Invalid `arg`**: check `slot_size`, `num_slots`, function table pointers. +- **CUDA errors**: verify `device_id`, and that CUDA is initialized. diff --git a/realtime/docs/cudaq_realtime_message_protocol.md b/realtime/docs/cudaq_realtime_message_protocol.md new file mode 100644 index 00000000000..953027aa444 --- /dev/null +++ b/realtime/docs/cudaq_realtime_message_protocol.md @@ -0,0 +1,429 @@ +# CUDA-Q Realtime Messaging Protocol + +This document defines the RPC (Remote Procedure Call) payload encoding used by +the realtime dispatch kernel for processing data and returning results. It complements +[cudaq_realtime_host_api.md](cudaq_realtime_host_api.md), +which focuses on wiring and API usage. + +## Scope + +- RPC header/response wire format +- `PTP` timestamp propagation for latency measurement +- Payload encoding and type system +- Schema contract and payload interpretation +- Function dispatch semantics + +Note: This protocol is hardware-agnostic. While the companion document +[cudaq_realtime_host_api.md](cudaq_realtime_host_api.md) provides +implementation details for both GPU and CPU-based dispatchers, +the wire format and encoding rules specified here apply universally. 
+ +## RPC Header / Response + +Each ring-buffer slot is interpreted as: + +```text +| RPCHeader | payload bytes (arg_len) | unused padding (slot_size - header - payload) | +``` + +```cpp +struct RPCHeader { + uint32_t magic; // RPC_MAGIC_REQUEST + uint32_t function_id; // fnv1a_hash("handler_name") + uint32_t arg_len; // payload bytes following this header + uint32_t request_id; // caller-assigned ID, echoed in the response + uint64_t ptp_timestamp; // PTP send timestamp (set by sender; 0 if unused) +}; + +struct RPCResponse { + uint32_t magic; // RPC_MAGIC_RESPONSE + int32_t status; // 0 = success + uint32_t result_len; // bytes of response payload + uint32_t request_id; // echoed from RPCHeader::request_id + uint64_t ptp_timestamp; // echoed from RPCHeader::ptp_timestamp +}; +``` + +Both `structs` are 24 bytes, packed with no padding. + +Magic values (little-endian 32-bit): + +- `RPC_MAGIC_REQUEST = 0x43555152` (`'CUQR'`) +- `RPC_MAGIC_RESPONSE = 0x43555153` (`'CUQS'`) + +## Request ID Semantics + +`request_id` is a caller-assigned opaque 32-bit value included in every request. +The dispatch kernel copies it verbatim into the corresponding `RPCResponse`. +The protocol does not interpret or constrain the value; its meaning is defined +by the application. + +Typical uses: + +- **Shot index**: The sender sets `request_id` to the shot number, enabling + out-of-order or pipelined verification of responses. +- **Sequence number**: Monotonically increasing counter for detecting lost or + duplicated messages. +- **Unused**: Set to 0 when not needed. The dispatcher echoes it regardless. + +The dispatcher echoes `request_id` in all dispatch paths (cooperative, +regular, and graph-launch). + +## `PTP` Timestamp Semantics + +`ptp_timestamp` is a 64-bit field carrying a Precision Time Protocol (`PTP`) +send timestamp. It enables end-to-end latency measurement from the moment a +message leaves the sender (e.g., FPGA) to the moment a response is produced. 
+ +The dispatch kernel copies `ptp_timestamp` verbatim from the incoming +`RPCHeader` into the corresponding `RPCResponse`. Individual RPC handlers do +not need to read, interpret, or propagate this field; it is handled entirely +by the dispatch infrastructure. + +Typical uses: + +- **FPGA-injected timestamp**: The FPGA writes the `PTP` time-of-day into + `ptp_timestamp` just before transmitting each message. The receiver + compares the echoed timestamp against the `PTP` clock at capture time to + compute round-trip latency. +- **Software timestamp**: A software sender (e.g., playback tool) may set the + field to a host-side `PTP` or monotonic clock value for profiling. +- **Unused**: Set to 0 when latency measurement is not needed. The dispatcher + echoes it regardless. + +The encoding is opaque to the protocol; the 64-bit value is echoed without +interpretation. By convention, the field carries a `PTP` time-of-day in +nanoseconds, but senders and receivers may agree on any encoding. + +The dispatcher echoes `ptp_timestamp` in all dispatch paths (cooperative, +regular, and graph-launch). + +## Function ID Semantics + +`function_id` selects which handler the dispatcher invokes for a given RPC +message. The dispatcher performs a lookup in the function table (array of +function pointers + IDs) and calls the matching entry. + +See [cudaq_realtime_host_api.md](cudaq_realtime_host_api.md) for function ID hashing, +handler naming, and function table registration details. + +## Schema and Payload Interpretation + +The RPC payload is **`typeless` on the wire**. The bytes following `RPCHeader` +are an opaque blob from the protocol's perspective. + +**Payload interpretation is defined by the handler schema**, which is registered +in the dispatcher's function table during setup (see [cudaq_realtime_host_api.md](cudaq_realtime_host_api.md)). 
+The schema specifies:
+
+- Number of arguments
+- Type and size of each argument
+- Number of return values
+- Type and size of each return value
+
+**Out-of-band contract**: The client (e.g., FPGA) firmware and dispatcher function
+table must agree on the schema for each `function_id`. Schema mismatches are detected
+during integration testing, not at runtime.
+
+For handlers with multiple arguments, the payload is a **concatenation** of
+argument data in schema order:
+
+```text
+| RPCHeader | arg0_bytes | arg1_bytes | arg2_bytes | ... |
+```
+
+The dispatcher uses the schema to determine where each argument begins and ends within
+the payload.
+
+### Type System
+
+Standardized payload type identifiers used in handler schemas:
+
+```cpp
+enum PayloadTypeID : uint8_t {
+  TYPE_UINT8 = 0x10,
+  TYPE_INT32 = 0x11,
+  TYPE_INT64 = 0x12,
+  TYPE_FLOAT32 = 0x13,
+  TYPE_FLOAT64 = 0x14,
+  TYPE_ARRAY_UINT8 = 0x20,
+  TYPE_ARRAY_INT32 = 0x21,
+  TYPE_ARRAY_FLOAT32 = 0x22,
+  TYPE_ARRAY_FLOAT64 = 0x23,
+  TYPE_BIT_PACKED = 0x30 // Bit-packed data (LSB-first)
+};
+```
+
+> **Note (review):** some examples in this document reference `TYPE_UINT32`,
+> which is not defined in this enum — confirm the intended identifier and keep
+> the enum and the examples in sync.
+
+Schema type descriptor (see [cudaq_realtime_host_api.md](cudaq_realtime_host_api.md)
+for full definition):
+
+```cpp
+struct cudaq_type_desc_t {
+  uint8_t type_id;       // PayloadTypeID value
+  uint8_t reserved[3];
+  uint32_t size_bytes;   // Total size in bytes
+  uint32_t num_elements; // Interpretation depends on type_id
+};
+```
+
+The `num_elements` field interpretation:
+
+- **Scalar types** (`TYPE_UINT8`, `TYPE_INT32`, etc.): unused, set to 1
+- **Array types** (`TYPE_ARRAY_*`): number of array elements
+- **TYPE_BIT_PACKED**: number of bits (not bytes)
+
+Note: For arbitrary binary data or vendor-specific formats, use `TYPE_ARRAY_UINT8`.
+ +Encoding rules: + +- All multi-byte integers: **little-endian** +- Floating-point: **IEEE 754** format +- Arrays: tightly packed elements (no padding) +- Bit-packed data: LSB-first within each byte, +`size_bytes = ceil(num_elements / 8)` + +## Payload Encoding + +The payload contains the argument data for the handler function. The encoding +depends on the argument types specified in the handler schema. + +### Single-Argument Payloads + +For handlers with one argument, the payload contains the argument data directly: + +```text +| RPCHeader | argument_bytes | +``` + +### Multi-Argument Payloads + +For handlers with multiple arguments, arguments are **concatenated in schema order** +with no padding or delimiters: + +```text +| RPCHeader | arg0_bytes | arg1_bytes | arg2_bytes | ... | +``` + +The schema specifies the size of each argument, +allowing the dispatcher to compute offsets. + +### Size Constraints + +The total payload must fit in a single ring-buffer slot: + +```text +total_size = sizeof(RPCHeader) + arg_len ≤ slot_size +max_payload_bytes = slot_size - sizeof(RPCHeader) +``` + +### Encoding Examples + +**Example 1: Handler with signature** `void process(int32_t count, float threshold)` + +Schema: + +- `arg0`: `TYPE_INT32`, 4 bytes +- `arg1`: `TYPE_FLOAT32`, 4 bytes + +Wire encoding: + +```text +Offset | Content +-------|-------- +0-23 | RPCHeader { magic, function_id, arg_len=8, request_id, ptp_timestamp } +24-27 | count (int32_t, little-endian) +28-31 | threshold (float, IEEE 754) +``` + +**Example 2: Handler with signature** +`void decode(const uint8_t* bits, uint32_t num_bits)` + +Schema: + +- `arg0`: `TYPE_BIT_PACKED`, `size_bytes=16`, `num_elements=128` +- `arg1`: `TYPE_UINT32`, `size_bytes=4`, `num_elements=1` + +Wire encoding: + +```text +Offset | Content +-------|-------- +0-23 | RPCHeader { magic, function_id, arg_len=20, request_id, ptp_timestamp } +24-39 | bits (bit-packed, LSB-first, 128 bits) +40-43 | num_bits=128 (uint32_t, little-endian) 
+``` + +### Bit-Packed Data Encoding + +For `TYPE_BIT_PACKED` arguments: + +- Bits are packed **LSB-first** within each byte +- Payload length: `size_bytes = ceil(num_elements / 8)` bytes +- The schema specifies both `size_bytes` (storage) +and `num_elements` (actual bit count) + +Example for 10 bits (`size_bytes=2`, `num_elements=10`): + +```text +bits: b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 +byte[0]: b0 b1 b2 b3 b4 b5 b6 b7 (LSB-first) +byte[1]: b8 b9 0 0 0 0 0 0 (unused bits set to zero) +``` + +The handler can use `num_elements` from the schema to determine how many bits +are valid, avoiding the need to pass bit count as a separate argument (though +some handlers may still choose to do so for flexibility). + +**Use case**: `TYPE_BIT_PACKED` is suitable for **binary measurements** where +each measurement result is 0 or 1 (1 bit per measurement). + +### Multi-Bit Measurement Encoding + +For applications requiring richer measurement data (e.g., soft readout, leakage +detection), use array types instead of `TYPE_BIT_PACKED`: + +**4-bit soft readout** (confidence values 0-15): + +Use `TYPE_ARRAY_UINT8` with custom packing (2 measurements per byte): + +- Schema: `TYPE_ARRAY_UINT8`, `size_bytes = ceil(num_measurements / 2)`, +`num_elements = num_measurements` +- Encoding: Low nibble = measurement[0], high nibble = measurement[1], etc. 
+ +**8-bit soft readout** (confidence values 0-255): + +Use `TYPE_ARRAY_UINT8` with one byte per measurement: + +- Schema: `TYPE_ARRAY_UINT8`, `size_bytes = num_measurements`, `num_elements = num_measurements` +- Encoding: byte[i] = measurement[i] + +**Floating-point confidence values**: + +Use `TYPE_ARRAY_FLOAT32`: + +- Schema: `TYPE_ARRAY_FLOAT32`, `size_bytes = num_measurements × 4`, +`num_elements = num_measurements` +- Encoding: IEEE 754 single-precision floats, tightly packed + +**Leakage/erasure-resolving readout** (values beyond binary): + +Use `TYPE_ARRAY_UINT8` or `TYPE_ARRAY_INT32` depending on +the range of measurement outcomes +(e.g., 0=ground, 1=excited, 2=leakage state). + +## [Response Encoding](#response-encoding) + +The response is written to the TX ring buffer slot (separate from the RX buffer +that contains the request): + +```text +| RPCResponse | result_bytes | +``` + +Like the request payload, the response payload encoding is **defined by the +handler schema**. The schema's `results[]` array specifies the type and size +of each return value. + +### Single-Result Response + +For handlers returning one value, the result is written directly after the +response header. + +**Example response** for a handler returning a single `uint8_t`: + +Schema: + +- `result0`: `TYPE_UINT8`, `size_bytes=1`, `num_elements=1` + +Wire encoding: + +```text +Offset | Content | Value (hex) +-------|--------------------------------------------|-------------- +0-3 | magic (RPC_MAGIC_RESPONSE) | 53 51 55 43 +4-7 | status (0 = success) | 00 00 00 00 +8-11 | result_len | 01 00 00 00 +12-15 | request_id (echoed from request) | XX XX XX XX +16-23 | ptp_timestamp (echoed from request) | XX XX XX XX XX XX XX XX +24 | result value (uint8_t) | 03 +25-... 
| unused padding | XX XX XX XX +``` + +### Multi-Result Response + +For handlers returning multiple values, results are **concatenated in schema order** +(same pattern as multi-argument requests): + +```text +| RPCResponse | result0_bytes | result1_bytes | ... | +``` + +**Example**: Handler returning correction (`uint8_t`) + confidence (`float`) + +Schema: + +- `result0`: `TYPE_UINT8`, `size_bytes=1`, `num_elements=1` +- `result1`: `TYPE_FLOAT32`, `size_bytes=4`, `num_elements=1` + +Wire encoding: + +```text +Offset | Content +-------|-------- +0-23 | RPCResponse { magic, status=0, result_len=5, request_id, ptp_timestamp } +24 | correction (uint8_t) +25-28 | confidence (float32, IEEE 754) +``` + +### Status Codes + +- `status = 0`: Success +- `status > 0`: Handler-specific error +- `status < 0`: Protocol-level error + +## QEC-Specific Usage Example + +This section shows how the realtime messaging protocol is used for quantum +error correction (QEC) decoding. This is one application of the protocol; +other use cases follow the same pattern. + +### QEC Terminology + +In QEC applications, the following terminology applies: + +- **Measurement result**: +Raw readout value from a QPU measurement (0 or 1 for binary readout) +- **Detection event**: +`XOR`'d measurement results as dictated by the parity check (stabilizer) matrix +- **Syndrome**: +The full history or set of detection events used by the decoder + +The decoder consumes detection events (often called "syndrome data" colloquially) +and produces corrections. 
+ +### QEC Decoder Handler + +Typical QEC decoder signature: + +```cpp +void qec_decode(const uint8_t* detection_events, uint32_t num_events, + uint8_t* correction); +``` + +Schema: + +- `arg0`: `TYPE_BIT_PACKED`, variable size (detection events, 1 bit per event) +- `arg1`: `TYPE_UINT32`, 4 bytes (number of detection events) +- `result0`: `TYPE_UINT8`, 1 byte (correction bit-packed) + +### Decoding Rounds + +For QEC applications, one RPC message typically corresponds to one **decoding round** +(one invocation of the decoder with a set of detection events). The boundaries of +each decoding round are determined by the quantum control system (e.g., FPGA) when +building RPC messages. + +Note: The term "shot" is often used in quantum computing to mean one full execution +of a quantum program (repeated `num_shots` times for statistics). In the context +of realtime decoding, we use "decoding round" to avoid confusion, as there may be +many RPC invocations during a single quantum program execution. diff --git a/realtime/docs/cudaq_realtime_network_interface.md b/realtime/docs/cudaq_realtime_network_interface.md new file mode 100644 index 00000000000..ff7eed9398b --- /dev/null +++ b/realtime/docs/cudaq_realtime_network_interface.md @@ -0,0 +1,250 @@ +# CUDA-Q Realtime Network Layer Provider Interface + +Using the [CUDA-Q Realtime host API](cudaq_realtime_host_api.md), one can +build an end-to-end RPC dispatch solution, as demonstrated in the +Hololink RDMA example with a simple increment RPC handler +(`realtime/unittests/utils`). + +In addition to building an end-to-end application based on specific networking software, +e.g., Hololink in the above example, we also provide a networking provider +wrapper interface, allowing one to build a networking-agnostic application. + +## Quick Start + +CUDA-Q Realtime networking interface consists of a set of APIs to +construct a real-time RPC dispatch solution in a networking-agnostic manner. 
+These APIs are backed by a provider plugin (as a shared library) +implementing the specific transport protocol. + +The basic APIs for the networking interface are: + +### Create the networking 'bridge' + +```cpp +/// @brief Create and initialize a transport bridge for the specified provider. +/// For the built-in Hololink provider, this loads the Hololink shared library +/// and initializes the transceiver with the provided `args`. For the EXTERNAL +/// provider, this loads the shared library specified by the +/// CUDAQ_REALTIME_BRIDGE_LIB environment variable and calls its create callback +/// to initialize the bridge. +cudaq_status_t +cudaq_bridge_create(cudaq_realtime_bridge_handle_t *out_bridge_handle, + cudaq_realtime_transport_provider_t provider, int argc, + char **argv); +``` + +This will initialize the networking layer context. The `cudaq_realtime_transport_provider_t` +enum specifies whether it is a builtin provider (e.g., Hololink) or an external one. +For the latter, it will perform dynamic loading to retrieve the +networking implementation. Arguments, e.g., networking information, can also be provided +to initialize the networking context. + +### Initialize a connection to the remote peer, e.g., a FPGA + +```cpp +/// @brief Connect the transport bridge. +cudaq_status_t cudaq_bridge_connect(cudaq_realtime_bridge_handle_t bridge); +``` + +### Retrieve the transport context information + +This context information can be either a ring buffer (for `cudaq_dispatcher_set_ringbuffer`) +or a unified context for (`cudaq_dispatcher_set_unified_launch`). + +```cpp +/// @brief Retrieve the transport context for the given bridge. +/// This could be a ring buffer or unified context. 
+cudaq_status_t cudaq_bridge_get_transport_context( + cudaq_realtime_bridge_handle_t bridge, + cudaq_realtime_transport_context_t context_type, void *out_context); +``` + +### Start the transport layer processing loop, i.e., ready to send and receive packages + +```cpp +/// @brief Launch the transport bridge's main processing loop (e.g. start +/// Hololink kernels). +cudaq_status_t cudaq_bridge_launch(cudaq_realtime_bridge_handle_t bridge); +``` + +Depending on the implementation, this could mean launching kernels/functions to +monitor the network stack (e.g., a socket, RDMA data, etc.) and fill up the RPC +header and payload accordingly as specified in the [message protocol](cudaq_realtime_message_protocol.md). + +### Terminate the connection to the remote peer + +```cpp +/// @brief Disconnect the transport bridge (e.g. stop Hololink kernels and +/// disconnect). +cudaq_status_t cudaq_bridge_disconnect(cudaq_realtime_bridge_handle_t bridge); +``` + +### Destroy the transport context + +```cpp +/// @brief Destroy the transport bridge and release all associated resources. +cudaq_status_t cudaq_bridge_destroy(cudaq_realtime_bridge_handle_t bridge); +``` + +An example of using this wrapper interface can be found at `realtime/unittests/bridge_interface/hololink/hololink_bridge.cpp`. + +## Extending CUDA-Q realtime with a custom networking interface + +This guide explains how to integrate a new networking provider +with CUDA-Q realtime via this interface. +The integration process involves creating a shared library implementing +the below `cudaq_realtime_bridge_interface_t` and provide a `cudaq_realtime_get_bridge_interface` +function to retrieve a static instance of this interface. + +```cpp +/// @brief Interface struct for transport layer providers. 
Each provider must
+/// implement this interface and provide a `getter` function
+/// (`cudaq_realtime_get_bridge_interface`) that returns a pointer to a
+/// statically allocated instance of this struct with the function pointers set
+/// to the provider's implementation.
+typedef struct {
+  int version;
+  cudaq_status_t (*create)(cudaq_realtime_bridge_handle_t *, int, char **);
+  cudaq_status_t (*destroy)(cudaq_realtime_bridge_handle_t);
+  cudaq_status_t (*get_transport_context)(cudaq_realtime_bridge_handle_t,
+                                          cudaq_realtime_transport_context_t,
+                                          void *);
+  cudaq_status_t (*connect)(cudaq_realtime_bridge_handle_t);
+  cudaq_status_t (*launch)(cudaq_realtime_bridge_handle_t);
+  cudaq_status_t (*disconnect)(cudaq_realtime_bridge_handle_t);
+
+} cudaq_realtime_bridge_interface_t;
+```
+
+At runtime, when a `CUDAQ_PROVIDER_EXTERNAL` is requested in `cudaq_bridge_create`,
+CUDA-Q will retrieve the environment variable `CUDAQ_REALTIME_BRIDGE_LIB`
+to locate the shared library implementing this interface
+to provide networking functionality.
+
+### Example
+
+Here's a template for implementing a networking interface wrapper class.
+
+```cpp
+#include "your_networking_stack.h" // replace with your networking stack headers
+#include "cudaq/realtime/daemon/bridge/bridge_interface.h"
+#include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h"
+
+// Custom data structure for your networking stack.
+// This will be encapsulated as an opaque `cudaq_realtime_bridge_handle_t`.
+struct ProviderNameNetworkContext {
+
+};
+
+/// Implementing the cudaq_realtime_bridge_interface_t functions
+extern "C" {
+static cudaq_status_t
+provider_name_bridge_create(cudaq_realtime_bridge_handle_t *handle, int argc,
+                            char **argv) {
+
+  // Create and initialize the networking handle
+  // This may take into account the arguments.
+  ProviderNameNetworkContext *ctx = new ProviderNameNetworkContext(...);
+
+  // Set the output handle to the created context (opaque to the caller)
+  *handle = ctx;
+
+  return CUDAQ_OK;
+}
+
+static cudaq_status_t
+provider_name_bridge_destroy(cudaq_realtime_bridge_handle_t handle) {
+  if (!handle)
+    return CUDAQ_ERR_INVALID_ARG;
+  ProviderNameNetworkContext *ctx =
+      reinterpret_cast<ProviderNameNetworkContext *>(handle);
+
+  // Add any clean-up actions (if not already handled in the `ProviderNameNetworkContext` destructor)
+
+  delete ctx;
+  return CUDAQ_OK;
+}
+
+static cudaq_status_t
+provider_name_bridge_get_transport_context(
+    cudaq_realtime_bridge_handle_t handle,
+    cudaq_realtime_transport_context_t context_type, void *out_context) {
+
+  if (!handle || !out_context)
+    return CUDAQ_ERR_INVALID_ARG;
+  ProviderNameNetworkContext *ctx =
+      reinterpret_cast<ProviderNameNetworkContext *>(handle);
+
+  // Populate the transport context
+  if (context_type == RING_BUFFER) {
+    // Cast `out_context` to the provider's ring buffer struct, then fill it in.
+    out_ringbuffer->rx_flags = ...;
+    ...
+  }
+
+  return CUDAQ_OK;
+}
+
+static cudaq_status_t
+provider_name_bridge_connect(cudaq_realtime_bridge_handle_t handle) {
+  if (!handle)
+    return CUDAQ_ERR_INVALID_ARG;
+  ProviderNameNetworkContext *ctx =
+      reinterpret_cast<ProviderNameNetworkContext *>(handle);
+
+  // Perform any custom actions to initiate a connection: open data stream/socket, etc.
+
+  return CUDAQ_OK;
+}
+
+static cudaq_status_t
+provider_name_bridge_launch(cudaq_realtime_bridge_handle_t handle) {
+  if (!handle)
+    return CUDAQ_ERR_INVALID_ARG;
+  ProviderNameNetworkContext *ctx =
+      reinterpret_cast<ProviderNameNetworkContext *>(handle);
+  // Launch thread/CUDA kernels/etc. to monitor the networking traffic
+
+  return CUDAQ_OK;
+}
+
+static cudaq_status_t
+provider_name_bridge_disconnect(cudaq_realtime_bridge_handle_t handle) {
+  if (!handle)
+    return CUDAQ_ERR_INVALID_ARG;
+  ProviderNameNetworkContext *ctx =
+      reinterpret_cast<ProviderNameNetworkContext *>(handle);
+
+  // Terminate the connection, e.g., stop any network monitoring actions, closing sockets/streams, etc.
+ return CUDAQ_OK; +} + + +// Add an entry point hook to retrieve the networking interface implementation +cudaq_realtime_bridge_interface_t *cudaq_realtime_get_bridge_interface() { + static cudaq_realtime_bridge_interface_t cudaq_provider_name_bridge_interface = { + CUDAQ_REALTIME_BRIDGE_INTERFACE_VERSION, + provider_name_bridge_create, + provider_name_bridge_destroy, + provider_name_bridge_get_transport_context, + provider_name_bridge_connect, + provider_name_bridge_launch, + provider_name_bridge_disconnect, + }; + return &cudaq_provider_name_bridge_interface; +} +} + +``` + +A sample of a `CMakeLists.txt` configuration is also provided here for reference. + +```cmake +find_package( REQUIRED) + +# Create the networking interface wrapper +add_library(cudaq-realtime-bridge-provider-name SHARED provider_name_bridge_impl.cpp) + +target_include_directories(cudaq-realtime-bridge-provider-name + PRIVATE + ${CUDAQ_REALTIME_INCLUDE_DIR}) + +target_link_libraries(cudaq-realtime-bridge-provider-name + PRIVATE + cudaq-realtime + ) +``` diff --git a/realtime/docs/nvqlink_latency_demo.md b/realtime/docs/nvqlink_latency_demo.md new file mode 100644 index 00000000000..2637241cb61 --- /dev/null +++ b/realtime/docs/nvqlink_latency_demo.md @@ -0,0 +1,263 @@ +# Steps to execute the NVQLink latency demo + +The source Verilog code can be found [here](https://edge.urm.nvidia.com/artifactory/sw-holoscan-thirdparty-generic-local/QEC/HSB-2.6.0-EA/). + +More details about how the `Holoscan Sensor Bridge` (`HSB`) IP can be incorporated +can be found [here](https://docs.nvidia.com/holoscan/sensor-bridge/latest/fpga_index.html) + +Furthermore, for this experiment, we need the Integrated Logic Analyzer (`ILA`) +to keep the captured measurements. See the "Hololink IP: +Connecting an `APB` `ILA` for Debug" section below. + +> **Note:** For this experiment, we recommend using NVIDIA ConnectX-7 NIC +with dual `QSFP` ports. Prior ConnectX generations may not have all +the capabilities required. 
+ +## Steps to do the experiment + +1. Load the bit-file into the FPGA. +2. Setup the host to run the experiment. +Mainly the IP address of the NIC needs to be set to `192.168.0.101`. +More details can be found at the +*Data Channel Enumeration and IP Address Configuration* section of [this document](https://docs.nvidia.com/holoscan/sensor-bridge/latest/architecture.html) +3. Download the accompanying software from [GitHub](https://github.com/nvidia-holoscan/holoscan-sensor-bridge/tree/release-2.6.0-EA) + + Then generate the docker: + + ```sh + sudo sh ./docker/build.sh --dgpu + sudo sh ./docker/demo.sh + ``` + +> **Note:** The above command is applicable to discrete GPU configurations. +If using an integrated GPU configuration, please use the `--igpu` option. + +To run the test, here is an example for 32B messages reported in the paper: + +```sh +python3 ./examples/gpu_roce_loopback.py --frame-size=32 --hololink=192.168.0.2 --rx-ibv-name=mlx5_0 --tx-ibv-name=mlx5_0 --mtu=256 +``` + +> **Note:** The `rx-ibv-name` and `tx-ibv-name` arguments in the above command +may need to be changed according to the system setup. + +Then to capture the data from the experiment and run the latency calculation: + +```sh +python3 ila.py +python3 latency_calc.py +``` + +> **Note:** These two python scripts can be found next to the Verilog source code. + +## Hololink IP: Connecting an `APB` `ILA` for Debug + +This guide describes how to attach an Integrated Logic Analyzer (`ILA`) +to one of the Hololink IP's `APB` register interfaces for real-time signal capture +and debugging over Ethernet. + +### Overview + +The Hololink IP exposes multiple `APB` register interfaces via the `REG_INST` +parameter (defined in `HOLOLINK_def.svh`). +These interfaces can be used to connect custom user logic, including `ILA`'s, +for monitoring internal signals. 
+ +In this example, we connect the `s_apb_ila` module to **`APB[2]`** +and configure it to capture `PTP` timestamps, frame information, +and other debug signals. + +### `APB` Interface Signals from Hololink + +The Hololink IP provides the following `APB` signals for user register interfaces: + +```systemverilog +// From HOLOLINK_top outputs +logic [`REG_INST-1:0] apb_psel; // Per-interface select +logic apb_penable; // Common enable +logic [31:0] apb_paddr; // Common address bus +logic [31:0] apb_pwdata; // Common write data +logic apb_pwrite; // Common write enable + +// To HOLOLINK_top inputs +logic [`REG_INST-1:0] apb_pready; // Per-interface ready +logic [31:0] apb_prdata [`REG_INST-1:0]; // Per-interface read data +logic [`REG_INST-1:0] apb_pserr; // Per-interface error +``` + +### Step 1: Tie Off Unused `APB` Interfaces + +For any `APB` interfaces not in use, tie off the signals appropriately: + +```systemverilog +// Tie off unused APB bus signals +assign apb_pserr[7:3] = '0; +assign apb_pserr[1:0] = '0; +assign apb_pready[7:3] = '1; +assign apb_pready[1:0] = '0; +``` + +> **Note:** `APB[2]` is left unassigned here since it will be connected to the `ILA`. + +--- + +### Step 2: Create `APB` Interface Structs for the `ILA` + +The `s_apb_ila` module uses the `apb_m2s` and `apb_s2m` struct types from `apb_pkg`. +Declare the interface signals: + +```systemverilog +import apb_pkg::*; + +apb_m2s ila_apb_m2s; +apb_s2m ila_apb_s2m; +``` + +--- + +### Step 3: Instantiate the `s_apb_ila` Module + +The `s_apb_ila` module is part of the Hololink IP library (`lib_apb/s_apb_ila.sv`). 
+ +```systemverilog +localparam ILA_DATA_WIDTH = 256; + +s_apb_ila #( + .DEPTH ( 65536 ), + .W_DATA ( ILA_DATA_WIDTH ) +) u_apb_ila ( + // APB Interface (slow clock domain) + .i_aclk ( apb_clk ), + .i_arst ( apb_rst ), + .i_apb_m2s ( ila_apb_m2s ), + .o_apb_s2m ( ila_apb_s2m ), + + // User Capture Interface (fast clock domain) + .i_pclk ( hif_clk ), + .i_prst ( hif_rst ), + .i_trigger ( '1 ), // Always triggered + .i_enable ( '1 ), // Always enabled + .i_wr_data ( ila_wr_data ), // Data to capture + .i_wr_en ( ptp_ts_en ), // Write enable + .o_ctrl_reg ( ) // Optional control output +); +``` + +--- + +### Step 4: Connect `APB[2]` to the `ILA` + +Map the Hololink `APB` signals to the `ILA`'s struct interface: + +```systemverilog +// APB Master-to-Slave signals (from Hololink to ILA) +assign ila_apb_m2s.psel = apb_psel[2]; // Select APB interface 2 +assign ila_apb_m2s.penable = apb_penable; +assign ila_apb_m2s.paddr = apb_paddr; +assign ila_apb_m2s.pwdata = apb_pwdata; +assign ila_apb_m2s.pwrite = apb_pwrite; + +// APB Slave-to-Master signals (from ILA back to Hololink) +assign apb_pready[2] = ila_apb_s2m.pready; +assign apb_prdata[2] = ila_apb_s2m.prdata; +assign apb_pserr[2] = ila_apb_s2m.pserr; +``` + +--- + +### Step 5: Define the Write Data Vector + +Structure the `ila_wr_data` signal to capture the signals of interest. 
+Here's the example configuration used: + +```systemverilog +localparam ILA_DATA_WIDTH = 256; +logic [ILA_DATA_WIDTH-1:0] ila_wr_data; + +// Bit assignments +assign ila_wr_data[63:0] = ptp_ts[63:0]; // PTP timestamp from sensor frame +assign ila_wr_data[127:64] = {ptp_sec_sync_usr[31:0], // Synchronized PTP seconds + ptp_nsec_sync_usr[31:0]}; // Synchronized PTP nanoseconds +assign ila_wr_data[139:128] = frame_cnt; // 12-bit frame counter +assign ila_wr_data[140] = sof; // Start of frame +assign ila_wr_data[141] = eof; // End of frame +assign ila_wr_data[255:142] = 'h123456789ABCDEF; // Debug pattern (filler) +``` + +#### Write Data Bit Map Summary + +| Bits | Width | Signal | Description | +|------|-------|--------|-------------| +| [63:0] | 64 | `ptp_ts` | `PTP` timestamp extracted from sensor TX data | +| [127:64] | 64 | `{ptp_sec, ptp_nsec}` | Synchronized `PTP` time (seconds + nanoseconds) from Hololink | +| [139:128] | 12 | `frame_cnt` | Frame counter extracted from sensor TX data | +| [140] | 1 | `sof` | Start of frame indicator | +| [141] | 1 | `eof` | End of frame indicator | +| [255:142] | 114 | Debug pattern | Fixed pattern for debugging | + +> **Note:** `ptp_sec_sync_usr` and `ptp_nsec_sync_usr` are the `PTP` time outputs +from Hololink (`o_ptp_sec`, `o_ptp_nanosec`) synchronized to +the host interface clock domain. + +--- + +### Step 6: Supporting Logic + +#### Frame Detection + +```systemverilog +logic sof, eof; +assign sof = sif_tx_axis_tvalid[0]; // SOF on first valid +assign eof = sif_tx_axis_tlast[0]; // EOF on last +``` + +#### Timestamp Capture + +```systemverilog +logic [79:0] ptp_ts; +logic ptp_ts_en; +logic [11:0] frame_cnt; + +always_ff @(posedge hif_clk) begin + if (hif_rst) begin + ptp_ts <= '0; + ptp_ts_en <= '0; + frame_cnt <= '0; + end + else begin + ptp_ts <= (sof) ? sif_tx_axis_tdata[0][79:0] : ptp_ts; + frame_cnt <= (sof) ? 
sif_tx_axis_tdata[0][91:80] : frame_cnt; + ptp_ts_en <= sof; + end +end +``` + +--- + +### Sensor RX Interface Tie-Off + +In this configuration, only the **Sensor TX interface** is used +(for receiving data from the host). +The Sensor RX interface is not used and should be tied off as follows: + +```systemverilog +// Sensor Rx Streaming Interface - Tie off (not used) +.i_sif_axis_tvalid ( '0 ), +.i_sif_axis_tlast ( '0 ), +.i_sif_axis_tdata ( '{default:0} ), +.i_sif_axis_tkeep ( '{default:0} ), +.i_sif_axis_tuser ( '{default:0} ), +.o_sif_axis_tready ( ), // Leave unconnected +``` + +The Sensor TX interface (`o_sif_axis_*`) should have `i_sif_axis_tready` +tied high to always accept data: + +```systemverilog +.i_sif_axis_tready ( '1 ), +``` + +--- + +Once integrated, the `ILA` data can be accessed via `APB` register +reads from the host over Ethernet using the Hololink control plane. diff --git a/realtime/docs/user_guide.md b/realtime/docs/user_guide.md new file mode 100644 index 00000000000..ff9d710ea06 --- /dev/null +++ b/realtime/docs/user_guide.md @@ -0,0 +1,149 @@ +# CUDA-Q Realtime Installation Guide + +The following page describes the installation procedure of +CUDA-Q Realtime, including connectivity to a +[Holoscan Sensor Bridge](https://www.nvidia.com/en-us/technologies/holoscan-sensor-bridge/) +(`HSB`) FPGA. + +## Components + +### Hardware Components + +- A host system with NVIDIA GPU and ConnectX-7/BlueField NIC. + +- A FPGA, programmed with `HSB` IP and connected to the NIC. + +> **_NOTE:_** We recommended using NVIDIA ConnectX-7 as prior generations +may not have all the required capabilities. + +### Software Components + +- CUDA-Q Realtime installer. + +- CUDA Runtime (12+) + +- [`DOCA` 3.3.0 installation](https://developer.nvidia.com/doca-downloads) +with `gpunetio` support. + +> **_NOTE:_** `DOCA` is required to run the end-to-end validation with FPGA +using the builtin `HSB` support of CUDA-Q realtime. 
+ + + +> **_NOTE:_** Please make sure `doca-sdk-gpunetio` is installed along with `doca-all`. + +## Setup + +To install CUDA-Q Realtime with Holoscan Sensor Bridge on a host machine +(bare-metal), please follow these steps. + +> **_NOTE:_** Alternatively, we can also build and run these steps in a Docker container. +Please refer to this [section](#using-docker) for instructions. + +1. Install CUDA-Q Realtime (if not already done so) + + For example, + + ```bash + ./install_cuda_quantum_realtime_cu13.arm64 --accept + ``` + + > **_NOTE:_** Please verify that CUDA-Q Realtime has been installed to `/opt/nvidia/cudaq/realtime`. + + + + > **_NOTE:_** After the installation, please follow the instructed + > post-installation step to set the environment variable, e.g., + > + > ```bash + > export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/nvidia/cudaq/realtime/lib + > ``` + +2. Load `HSB` IP bit-file to the FPGA + + The bit-file for supported FPGA vendors + can be found [here](https://edge.urm.nvidia.com/artifactory/sw-holoscan-thirdparty-generic-local/QEC/HSB-2.6.0-EA/). + +3. Run the validation script + + The validation script is located at `/opt/nvidia/cudaq/realtime/validate.sh`. + + ```bash + bash /opt/nvidia/cudaq/realtime/validate.sh --page-size 512 --device mlx5_0 --gpu 0 --bridge-ip 192.168.0.101 --fpga-ip 192.168.0.2 --unified + ``` + + > **_NOTE:_** + > The command line arguments need to be adjusted based on the system setup: + > + > - `--device` is the `IB` device name that is connected to the FPGA. + > - `--gpu` is the GPU device Id that we want to run the RPC callback on. + > - `--fpga-ip` is the IP address of the `HSB` FPGA. + > - `--bridge-ip` is the IP address of the NIC on the host machine. + > - `--page-size` is the ring buffer slot size in bytes. 
+ + Upon successful completion, the above validation script should + print out the following: + + ```text + === Verification Summary === + ILA samples captured: 100 + tvalid=0 (idle): 0 + RPC responses: 100 + Non-RPC frames: 0 + Unique messages verified: 100 of 100 + Responses matched: 100 + Header errors: 0 + Payload errors: 0 + + === PTP Round-Trip Latency === + Samples: 100 + Min: 3589 ns + Max: 6348 ns + Avg: 3872.0 ns + CSV written: ptp_latency.csv + RESULT: PASS + + === Shutting down === + ``` + +Congratulations! You have successfully validated the CUDA-Q Realtime installation. + +> **_NOTE:_** In the above test script, we execute a simple RPC dispatch tests, whereby +the FPGA sends data (array of bytes) to the GPU; the GPU performs +a simple increment by one calculation on each of the byte +in the incoming array and returns the array. +We validate the data and measure the round-trip latency +then output the report as shown above. + +## Using Docker + +In the CUDA-Q Realtime installation, the `demo.sh` script will +build a containerized environment containing necessary dependencies +for CUDA-Q Realtime. + +For example, + +```bash +bash /opt/nvidia/cudaq/realtime/demo.sh +``` + +will transfer the local CUDA-Q installation into that containerized environment. + +Inside the container, we can then run the validation check, i.e., + +```bash +bash /opt/nvidia/cudaq/realtime/validate.sh --page-size 512 --device mlx5_0 --gpu 0 --bridge-ip 192.168.0.101 --fpga-ip 192.168.0.2 --unified +``` + +### Manual Installation in Docker Container + +1. Launch your container with networking and GPU support. + + For example, `--net host --gpus all` should be used to launch the container. + +2. Install CUDA runtime. + +3. Install [`DOCA`](https://developer.nvidia.com/doca-downloads) +with `gpunetio` (`doca-sdk-gpunetio`) support. + +4. Download and install CUDA-Q Installer as described in the [setup](#setup) section. 
diff --git a/realtime/examples/gpu_dispatch/CMakeLists.txt b/realtime/examples/gpu_dispatch/CMakeLists.txt new file mode 100644 index 00000000000..08b67abdf20 --- /dev/null +++ b/realtime/examples/gpu_dispatch/CMakeLists.txt @@ -0,0 +1,69 @@ +# ============================================================================ # +# Copyright (c) 2024 - 2025 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. # +# ============================================================================ # +cmake_minimum_required(VERSION 3.24 FATAL_ERROR) + +# A simple example demonstrating how to use the CUDA Quantum Realtime library. +project(dispatch_kernel) +find_package(CUDAToolkit REQUIRED) + +enable_language(CUDA) + +if(NOT CUDAQ_REALTIME_ROOT) + SET(CUDAQ_REALTIME_ROOT "$ENV{CUDAQ_REALTIME_ROOT}" CACHE PATH "Path to CUDA Quantum Realtime installation") +endif() + +if (NOT CUDAQ_REALTIME_ROOT) + # Default path to look for the library if the environment variable is not set + set(CUDAQ_REALTIME_ROOT "/opt/nvidia/cudaq/realtime") + message(STATUS "CUDAQ_REALTIME_ROOT environment variable is not set. Use the default path: ${CUDAQ_REALTIME_ROOT}") +endif() + + +find_library(CUDAQ_REALTIME_LIB + NAMES cudaq-realtime + HINTS + ${CUDAQ_REALTIME_ROOT}/lib +) + +if (NOT CUDAQ_REALTIME_LIB) + message(FATAL_ERROR "Could not find cudaq-realtime library. Please set CUDAQ_REALTIME_ROOT to the correct path.") +else() + message(STATUS "Found cudaq-realtime library at: ${CUDAQ_REALTIME_LIB}") +endif() + +find_library(CUDAQ_REALTIME_DISPATCH_LIB + NAMES cudaq-realtime-dispatch + HINTS + ${CUDAQ_REALTIME_ROOT}/lib +) + +if (NOT CUDAQ_REALTIME_DISPATCH_LIB) + message(FATAL_ERROR "Could not find cudaq-realtime-dispatch library. 
Please set CUDAQ_REALTIME_ROOT to the correct path.") +else() + message(STATUS "Found cudaq-realtime-dispatch library at: ${CUDAQ_REALTIME_DISPATCH_LIB}") +endif() + + +add_executable(dispatch_kernel dispatch_kernel.cu) + +set_target_properties(dispatch_kernel PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + CUDA_STANDARD 17 +) +set_target_properties(dispatch_kernel PROPERTIES CUDA_ARCHITECTURES "80;90") + +target_include_directories(dispatch_kernel PRIVATE + ${CUDAToolkit_INCLUDE_DIRS} + ${CUDAQ_REALTIME_ROOT}/include +) + +target_link_libraries(dispatch_kernel PRIVATE + CUDA::cudart + ${CUDAQ_REALTIME_LIB} + ${CUDAQ_REALTIME_DISPATCH_LIB} +) diff --git a/realtime/examples/gpu_dispatch/dispatch_kernel.cu b/realtime/examples/gpu_dispatch/dispatch_kernel.cu new file mode 100644 index 00000000000..45d23902cd5 --- /dev/null +++ b/realtime/examples/gpu_dispatch/dispatch_kernel.cu @@ -0,0 +1,385 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +/// @file dispatch_kernel.cu +/// @brief Simple dispatch kernel for testing libcudaq-realtime. +/// +/// This example demonstrates a simple dispatch kernel that processes RPC +/// requests. 
+/// + +#include +#include +#include +#include +#include +#include + +#include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" +#include "cudaq/realtime/daemon/dispatcher/dispatch_kernel.cuh" +#include "cudaq/realtime/daemon/dispatcher/dispatch_modes.h" +#include "cudaq/realtime/daemon/dispatcher/kernel_types.h" + +// Helper macro for CUDA error checking +#define CUDA_CHECK(call) \ + do { \ + cudaError_t err = call; \ + if (err != cudaSuccess) { \ + std::cerr << "CUDA error in " << __FILE__ << ":" << __LINE__ << ": " \ + << cudaGetErrorString(err) << " (" << err << ")" << std::endl; \ + std::exit(EXIT_FAILURE); \ + } \ + } while (0) + +#define CUDAQ_CHECK(call) \ + do { \ + auto err = call; \ + if (err != CUDAQ_OK) { \ + std::cerr << "CUDAQ error in " << __FILE__ << ":" << __LINE__ << ": " \ + << err << std::endl; \ + std::exit(EXIT_FAILURE); \ + } \ + } while (0) + +//============================================================================== +// Test Handler: Simple noop that copies input to output and adds 1 to each byte +//============================================================================== + +/// @brief Test handler that adds 1 to each byte. 
+__device__ int increment_handler(const void *input, void *output, + std::uint32_t arg_len, + std::uint32_t max_result_len, + std::uint32_t *result_len) { + const std::uint8_t *in_data = static_cast(input); + std::uint8_t *out_data = static_cast(output); + for (std::uint32_t i = 0; i < arg_len && i < max_result_len; ++i) { + out_data[i] = in_data[i] + 1; + } + *result_len = arg_len; + return 0; +} + +//============================================================================== +// Host API Dispatch Kernel Test Helpers +//============================================================================== + +constexpr std::uint32_t RPC_INCREMENT_FUNCTION_ID = + cudaq::realtime::fnv1a_hash("rpc_increment"); + +__device__ int rpc_increment_handler(const void *input, void *output, + std::uint32_t arg_len, + std::uint32_t max_result_len, + std::uint32_t *result_len) { + const std::uint8_t *in_data = static_cast(input); + std::uint8_t *out_data = static_cast(output); + for (std::uint32_t i = 0; i < arg_len && i < max_result_len; ++i) { + out_data[i] = static_cast(in_data[i] + 1); + } + *result_len = arg_len; + return 0; +} + +__global__ void init_rpc_function_table(cudaq_function_entry_t *entries) { + if (threadIdx.x == 0 && blockIdx.x == 0) { + entries[0].handler.device_fn_ptr = + reinterpret_cast(&rpc_increment_handler); + entries[0].function_id = RPC_INCREMENT_FUNCTION_ID; + entries[0].dispatch_mode = CUDAQ_DISPATCH_DEVICE_CALL; + entries[0].reserved[0] = 0; + entries[0].reserved[1] = 0; + entries[0].reserved[2] = 0; + + // Schema: 1 array argument (uint8), 1 array result (uint8) + entries[0].schema.num_args = 1; + entries[0].schema.num_results = 1; + entries[0].schema.reserved = 0; + entries[0].schema.args[0].type_id = CUDAQ_TYPE_ARRAY_UINT8; + entries[0].schema.args[0].reserved[0] = 0; + entries[0].schema.args[0].reserved[1] = 0; + entries[0].schema.args[0].reserved[2] = 0; + entries[0].schema.args[0].size_bytes = 0; // Variable size + 
entries[0].schema.args[0].num_elements = 0; // Variable size + entries[0].schema.results[0].type_id = CUDAQ_TYPE_ARRAY_UINT8; + entries[0].schema.results[0].reserved[0] = 0; + entries[0].schema.results[0].reserved[1] = 0; + entries[0].schema.results[0].reserved[2] = 0; + entries[0].schema.results[0].size_bytes = 0; // Variable size + entries[0].schema.results[0].num_elements = 0; // Variable size + } +} + +bool allocate_ring_buffer(std::size_t num_slots, std::size_t slot_size, + volatile uint64_t **host_flags_out, + volatile uint64_t **device_flags_out, + std::uint8_t **host_data_out, + std::uint8_t **device_data_out) { + void *host_flags_ptr = nullptr; + cudaError_t err = cudaHostAlloc(&host_flags_ptr, num_slots * sizeof(uint64_t), + cudaHostAllocMapped); + if (err != cudaSuccess) + return false; + + void *device_flags_ptr = nullptr; + err = cudaHostGetDevicePointer(&device_flags_ptr, host_flags_ptr, 0); + if (err != cudaSuccess) { + cudaFreeHost(host_flags_ptr); + return false; + } + + void *host_data_ptr = nullptr; + err = + cudaHostAlloc(&host_data_ptr, num_slots * slot_size, cudaHostAllocMapped); + if (err != cudaSuccess) { + cudaFreeHost(host_flags_ptr); + return false; + } + + void *device_data_ptr = nullptr; + err = cudaHostGetDevicePointer(&device_data_ptr, host_data_ptr, 0); + if (err != cudaSuccess) { + cudaFreeHost(host_flags_ptr); + cudaFreeHost(host_data_ptr); + return false; + } + + memset(host_flags_ptr, 0, num_slots * sizeof(uint64_t)); + + *host_flags_out = static_cast(host_flags_ptr); + *device_flags_out = static_cast(device_flags_ptr); + *host_data_out = static_cast(host_data_ptr); + *device_data_out = static_cast(device_data_ptr); + return true; +} + +void free_ring_buffer(volatile uint64_t *host_flags, std::uint8_t *host_data) { + if (host_flags) + cudaFreeHost(const_cast(host_flags)); + if (host_data) + cudaFreeHost(host_data); +} + +extern "C" void launch_dispatch_kernel_wrapper( + volatile std::uint64_t *rx_flags, volatile std::uint64_t 
*tx_flags, + std::uint8_t *rx_data, std::uint8_t *tx_data, std::size_t rx_stride_sz, + std::size_t tx_stride_sz, cudaq_function_entry_t *function_table, + std::size_t func_count, volatile int *shutdown_flag, std::uint64_t *stats, + std::size_t num_slots, std::uint32_t num_blocks, + std::uint32_t threads_per_block, cudaStream_t stream) { + cudaq_launch_dispatch_kernel_regular( + rx_flags, tx_flags, rx_data, tx_data, rx_stride_sz, tx_stride_sz, + function_table, func_count, shutdown_flag, stats, num_slots, num_blocks, + threads_per_block, stream); +} + +//============================================================================== +// Test Kernel for DeviceCallMode +//============================================================================== + +using HandlerFunc = int (*)(const void *, void *, std::uint32_t, std::uint32_t, + std::uint32_t *); + +__device__ HandlerFunc d_increment_handler = increment_handler; + +//============================================================================== +// Main +//============================================================================== +int main() { + constexpr std::size_t num_slots_ = 2; + constexpr std::size_t slot_size_ = 256; + volatile uint64_t *rx_flags_host = nullptr; + volatile uint64_t *tx_flags_host = nullptr; + volatile uint64_t *rx_flags_device = nullptr; + volatile uint64_t *tx_flags_device = nullptr; + std::uint8_t *rx_data_host = nullptr; + std::uint8_t *tx_data_host = nullptr; + std::uint8_t *rx_data_device = nullptr; + std::uint8_t *tx_data_device = nullptr; + + volatile int *shutdown_flag_ = nullptr; + volatile int *d_shutdown_flag_ = nullptr; + uint64_t *d_stats_ = nullptr; + + cudaq_function_entry_t *d_function_entries_ = nullptr; + std::size_t func_count_ = 0; + + cudaq_dispatch_manager_t *manager_ = nullptr; + cudaq_dispatcher_t *dispatcher_ = nullptr; + + const bool allocated_rx = + allocate_ring_buffer(num_slots_, slot_size_, &rx_flags_host, + &rx_flags_device, &rx_data_host, &rx_data_device); 
+ + if (!allocated_rx) { + std::cerr << "Failed to allocate RX ring buffer" << std::endl; + return 1; + } + const bool allocated_tx = + allocate_ring_buffer(num_slots_, slot_size_, &tx_flags_host, + &tx_flags_device, &tx_data_host, &tx_data_device); + + if (!allocated_tx) { + std::cerr << "Failed to allocate TX ring buffer" << std::endl; + return 1; + } + + void *tmp_shutdown = nullptr; + CUDA_CHECK(cudaHostAlloc(&tmp_shutdown, sizeof(int), cudaHostAllocMapped)); + shutdown_flag_ = static_cast(tmp_shutdown); + void *tmp_d_shutdown = nullptr; + CUDA_CHECK(cudaHostGetDevicePointer(&tmp_d_shutdown, tmp_shutdown, 0)); + d_shutdown_flag_ = static_cast(tmp_d_shutdown); + *shutdown_flag_ = 0; + int zero = 0; + CUDA_CHECK(cudaMemcpy(const_cast(d_shutdown_flag_), &zero, sizeof(int), + cudaMemcpyHostToDevice)); + + CUDA_CHECK(cudaMalloc(&d_stats_, sizeof(uint64_t))); + CUDA_CHECK(cudaMemset(d_stats_, 0, sizeof(uint64_t))); + + CUDA_CHECK(cudaMalloc(&d_function_entries_, sizeof(cudaq_function_entry_t))); + init_rpc_function_table<<<1, 1>>>(d_function_entries_); + CUDA_CHECK(cudaDeviceSynchronize()); + func_count_ = 1; + + CUDAQ_CHECK(cudaq_dispatch_manager_create(&manager_)); + cudaq_dispatcher_config_t config{}; + config.device_id = 0; + config.num_blocks = 1; + config.threads_per_block = 64; + config.num_slots = static_cast(num_slots_); + config.slot_size = static_cast(slot_size_); + config.vp_id = 0; + config.kernel_type = CUDAQ_KERNEL_REGULAR; + config.dispatch_mode = CUDAQ_DISPATCH_DEVICE_CALL; + CUDAQ_CHECK(cudaq_dispatcher_create(manager_, &config, &dispatcher_)); + + cudaq_ringbuffer_t ringbuffer{}; + ringbuffer.rx_flags = rx_flags_device; + ringbuffer.tx_flags = tx_flags_device; + ringbuffer.rx_data = rx_data_device; + ringbuffer.tx_data = tx_data_device; + ringbuffer.rx_stride_sz = slot_size_; + ringbuffer.tx_stride_sz = slot_size_; + CUDAQ_CHECK(cudaq_dispatcher_set_ringbuffer(dispatcher_, &ringbuffer)); + + cudaq_function_table_t table{}; + table.entries = 
d_function_entries_; + table.count = func_count_; + CUDAQ_CHECK(cudaq_dispatcher_set_function_table(dispatcher_, &table)); + + CUDAQ_CHECK( + cudaq_dispatcher_set_control(dispatcher_, d_shutdown_flag_, d_stats_)); + CUDAQ_CHECK(cudaq_dispatcher_set_launch_fn(dispatcher_, + &launch_dispatch_kernel_wrapper)); + CUDAQ_CHECK(cudaq_dispatcher_start(dispatcher_)); + + const auto write_rpc_request = [&](std::size_t slot, + const std::vector &payload) { + std::uint8_t *slot_data = + const_cast(rx_data_host) + slot * slot_size_; + auto *header = reinterpret_cast(slot_data); + header->magic = cudaq::realtime::RPC_MAGIC_REQUEST; + header->function_id = RPC_INCREMENT_FUNCTION_ID; + header->arg_len = static_cast(payload.size()); + memcpy(slot_data + sizeof(cudaq::realtime::RPCHeader), payload.data(), + payload.size()); + }; + + const auto read_rpc_response = [&](std::size_t slot, + std::vector &payload, + std::int32_t *status_out = nullptr, + std::uint32_t *result_len_out = nullptr) { + __sync_synchronize(); + // Read from TX buffer (dispatch kernel writes response to symmetric TX) + const std::uint8_t *slot_data = + const_cast(tx_data_host) + slot * slot_size_; + auto *response = + reinterpret_cast(slot_data); + + if (response->magic != cudaq::realtime::RPC_MAGIC_RESPONSE) + return false; + if (status_out) + *status_out = response->status; + if (result_len_out) + *result_len_out = response->result_len; + if (response->status != 0) + return false; + + payload.resize(response->result_len); + memcpy(payload.data(), slot_data + sizeof(cudaq::realtime::RPCResponse), + response->result_len); + return true; + }; + + // Sample payload: array of bytes + std::vector payload = {0, 1, 2, 3}; + write_rpc_request(0, payload); + + std::cout << "RPC request sent, waiting for response..." 
<< std::endl; + __sync_synchronize(); + const_cast(rx_flags_host)[0] = + reinterpret_cast(rx_data_device); + + int timeout = 50; + while (tx_flags_host[0] == 0 && timeout-- > 0) { + usleep(1000); + } + + if (timeout <= 0) { + std::cerr << "Timeout waiting for RPC response" << std::endl; + return 1; + } + + std::vector response; + std::int32_t status = -1; + std::uint32_t result_len = 0; + read_rpc_response(0, response, &status, &result_len); + std::cout << "RPC response received with status " << status + << " and result length " << result_len << std::endl; + + // Stop the dispatcher and clean up resources: + if (shutdown_flag_) { + *shutdown_flag_ = 1; + __sync_synchronize(); + } + if (dispatcher_) { + cudaq_dispatcher_stop(dispatcher_); + cudaq_dispatcher_destroy(dispatcher_); + dispatcher_ = nullptr; + } + if (manager_) { + cudaq_dispatch_manager_destroy(manager_); + manager_ = nullptr; + } + free_ring_buffer(rx_flags_host, rx_data_host); + free_ring_buffer(tx_flags_host, tx_data_host); + + if (shutdown_flag_) + cudaFreeHost(const_cast(shutdown_flag_)); + if (d_stats_) + cudaFree(d_stats_); + if (d_function_entries_) + cudaFree(d_function_entries_); + + bool valid = true; + for (std::size_t i = 0; i < result_len; ++i) { + std::cout << "Response byte " << i << ": " << static_cast(response[i]) + << "; expected: " << static_cast(payload[i] + 1) + << std::endl; + if (response[i] != static_cast(payload[i] + 1)) + valid = false; + } + + if (valid) { + std::cout << "Response is valid!" << std::endl; + } else { + std::cerr << "Response is invalid!" 
<< std::endl; + return 1; + } + return 0; +} diff --git a/realtime/include/cudaq/realtime/daemon/bridge/bridge_interface.h b/realtime/include/cudaq/realtime/daemon/bridge/bridge_interface.h new file mode 100644 index 00000000000..4722e6712bc --- /dev/null +++ b/realtime/include/cudaq/realtime/daemon/bridge/bridge_interface.h @@ -0,0 +1,95 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#pragma once + +/// @file bridge_interface.h +/// @brief Interface Bindings for transport layer providers (e.g. Hololink). +/// +/// Different transport providers can be loaded at runtime via `dlopen`, +/// allowing for dynamic selection and initialization of the desired transport +/// layer. Environment variable CUDAQ_REALTIME_BRIDGE_LIB must be set to the +/// path of the shared library implementing the desired transport provider (if +/// not using the built-in Hololink provider). + +#include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" + +#ifdef __cplusplus +extern "C" { +#endif + +///@brief Opaque data structure storing the details of the transport layer +/// connection +typedef void *cudaq_realtime_bridge_handle_t; + +typedef enum { + CUDAQ_PROVIDER_HOLOLINK = + 0, /// Hololink GPU-RoCE transceiver (built-in provider) + CUDAQ_PROVIDER_EXTERNAL = 1, /// Externally managed transport + +} cudaq_realtime_transport_provider_t; + +typedef enum { + RING_BUFFER = 0, // Ring buffer context (for Hololink provider) + UNIFIED = 1, /// Unified transport context for unified dispatch +} cudaq_realtime_transport_context_t; + +/// @brief Create and initialize a transport bridge for the specified provider. 
+/// For the built-in Hololink provider, this loads the Hololink shared library +/// and initializes the transceiver with the provided `args`. For the EXTERNAL +/// provider, this loads the shared library specified by the +/// CUDAQ_REALTIME_BRIDGE_LIB environment variable and calls its create callback +/// to initialize the bridge. +cudaq_status_t +cudaq_bridge_create(cudaq_realtime_bridge_handle_t *out_bridge_handle, + cudaq_realtime_transport_provider_t provider, int argc, + char **argv); + +/// @brief Destroy the transport bridge and release all associated resources. +cudaq_status_t cudaq_bridge_destroy(cudaq_realtime_bridge_handle_t bridge); + +/// @brief Retrieve the transport context for the given bridge. +/// This could be a ring buffer or unified context. +cudaq_status_t cudaq_bridge_get_transport_context( + cudaq_realtime_bridge_handle_t bridge, + cudaq_realtime_transport_context_t context_type, void *out_context); + +/// @brief Connect the transport bridge. +cudaq_status_t cudaq_bridge_connect(cudaq_realtime_bridge_handle_t bridge); + +/// @brief Launch the transport bridge's main processing loop (e.g. start +/// Hololink kernels). +cudaq_status_t cudaq_bridge_launch(cudaq_realtime_bridge_handle_t bridge); + +/// @brief Disconnect the transport bridge (e.g. stop Hololink kernels and +/// disconnect). +cudaq_status_t cudaq_bridge_disconnect(cudaq_realtime_bridge_handle_t bridge); + +#define CUDAQ_REALTIME_BRIDGE_INTERFACE_VERSION 1 + +/// @brief Interface struct for transport layer providers. Each provider must +/// implement this interface and provide a `getter` function +/// (`cudaq_realtime_get_bridge_interface`) that returns a pointer to a +/// statically allocated instance of this struct with the function pointers set +/// to the provider's implementation. 
+typedef struct { + int version; + cudaq_status_t (*create)(cudaq_realtime_bridge_handle_t *, int, char **); + cudaq_status_t (*destroy)(cudaq_realtime_bridge_handle_t); + cudaq_status_t (*get_transport_context)(cudaq_realtime_bridge_handle_t, + cudaq_realtime_transport_context_t, + void *); + cudaq_status_t (*connect)(cudaq_realtime_bridge_handle_t); + cudaq_status_t (*launch)(cudaq_realtime_bridge_handle_t); + cudaq_status_t (*disconnect)(cudaq_realtime_bridge_handle_t); + +} cudaq_realtime_bridge_interface_t; + +#ifdef __cplusplus +} +#endif diff --git a/realtime/include/cudaq/realtime/daemon/bridge/hololink/hololink_doca_transport_ctx.h b/realtime/include/cudaq/realtime/daemon/bridge/hololink/hololink_doca_transport_ctx.h new file mode 100644 index 00000000000..9f5f6b4910e --- /dev/null +++ b/realtime/include/cudaq/realtime/daemon/bridge/hololink/hololink_doca_transport_ctx.h @@ -0,0 +1,32 @@ +/******************************************************************************* + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#pragma once + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/// Hololink/DOCA transport context for the unified dispatch kernel. +/// Packed by the Hololink bridge layer and passed as the opaque +/// transport_ctx pointer through the transport-agnostic dispatcher API. 
+typedef struct { + void *gpu_dev_qp; ///< doca_gpu_dev_verbs_qp* handle + uint8_t *rx_ring_data; ///< Device pointer to RX ring data buffer + size_t rx_ring_stride_sz; ///< Stride (slot size) in the ring buffer + uint32_t rx_ring_mkey; ///< Network-byte-order memory key (`htobe32(rkey)`) + uint32_t rx_ring_stride_num; ///< Number of slots in the ring + size_t frame_size; ///< Actual frame/payload size within a slot +} hololink_doca_transport_ctx; + +#ifdef __cplusplus +} +#endif diff --git a/realtime/include/cudaq/realtime/daemon/bridge/hololink/hololink_wrapper.h b/realtime/include/cudaq/realtime/daemon/bridge/hololink/hololink_wrapper.h new file mode 100644 index 00000000000..4619984f70d --- /dev/null +++ b/realtime/include/cudaq/realtime/daemon/bridge/hololink/hololink_wrapper.h @@ -0,0 +1,118 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +/// @file hololink_wrapper.h +/// @brief C interface to Hololink GpuRoceTransceiver. +/// +/// This wrapper avoids `fmt` library conflicts between Hololink (which uses +/// Holoscan's `fmt`) and CUDA files compiled by nvcc. + +#ifndef HOLOLINK_WRAPPER_H +#define HOLOLINK_WRAPPER_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// Opaque handle for GpuRoceTransceiver +typedef void *hololink_transceiver_t; + +//============================================================================== +// Transceiver lifecycle +//============================================================================== + +/** + * Create a new Hololink transceiver. 
+ * + * @param device_name IB device name (e.g., "rocep1s0f0") + * @param ib_port IB port number + * @param tx_ibv_qp Remote QP number (FPGA default: 2) + * @param gpu_id CUDA GPU device ID for DOCA GPUNetIO + * @param frame_size Size of each frame (cu_frame_size) + * @param page_size Size of each page/slot (cu_page_size) + * @param num_pages Number of pages (ring buffer slots) + * @param peer_ip Peer IP address + * @param forward 1 to run forward (echo) kernel + * @param rx_only 1 to run RX-only kernel + * @param tx_only 1 to run TX-only kernel + * @return Handle to transceiver, or NULL on failure + */ +hololink_transceiver_t hololink_create_transceiver( + const char *device_name, int ib_port, unsigned tx_ibv_qp, int gpu_id, + size_t frame_size, size_t page_size, unsigned num_pages, + const char *peer_ip, int forward, int rx_only, int tx_only); + +/** + * Destroy a transceiver and free resources. + */ +void hololink_destroy_transceiver(hololink_transceiver_t handle); + +/** + * Start the transceiver (initializes DOCA resources, creates QP/CQ). + * @return 1 on success, 0 on failure + */ +int hololink_start(hololink_transceiver_t handle); + +/** + * Close the transceiver (signals shutdown). + */ +void hololink_close(hololink_transceiver_t handle); + +/** + * Run the blocking monitor (launches GPU kernels and waits). + * This function blocks until close() is called. + */ +void hololink_blocking_monitor(hololink_transceiver_t handle); + +//============================================================================== +// QP information (for RDMA setup) +//============================================================================== + +uint32_t hololink_get_qp_number(hololink_transceiver_t handle); +uint32_t hololink_get_rkey(hololink_transceiver_t handle); +uint64_t hololink_get_buffer_addr(hololink_transceiver_t handle); + +/** Get the DOCA GPU device QP handle (doca_gpu_dev_verbs_qp*). + * Needed by the unified dispatch kernel for direct DOCA verbs calls. 
*/ +void *hololink_get_gpu_dev_qp(hololink_transceiver_t handle); + +//============================================================================== +// Ring buffer access +//============================================================================== + +/** Get device pointer to RX ring data buffer. */ +void *hololink_get_rx_ring_data_addr(hololink_transceiver_t handle); + +/** Get device pointer to RX ring flag array. */ +uint64_t *hololink_get_rx_ring_flag_addr(hololink_transceiver_t handle); + +/** Get device pointer to TX ring data buffer. */ +void *hololink_get_tx_ring_data_addr(hololink_transceiver_t handle); + +/** Get device pointer to TX ring flag array. */ +uint64_t *hololink_get_tx_ring_flag_addr(hololink_transceiver_t handle); + +/** Force eager CUDA module loading by querying kernel occupancy. + * Call before launching any persistent kernels. + * Returns true on success (all kernels valid). */ +bool hololink_query_kernel_occupancy(void); + +/** Get the page (slot) size configured for this transceiver. */ +size_t hololink_get_page_size(hololink_transceiver_t handle); + +/** Get the number of pages (slots) configured for this transceiver. */ +unsigned hololink_get_num_pages(hololink_transceiver_t handle); + +#ifdef __cplusplus +} +#endif + +#endif // HOLOLINK_WRAPPER_H diff --git a/realtime/include/cudaq/realtime/daemon/dispatcher/cudaq_realtime.h b/realtime/include/cudaq/realtime/daemon/dispatcher/cudaq_realtime.h new file mode 100644 index 00000000000..a60ff452a12 --- /dev/null +++ b/realtime/include/cudaq/realtime/daemon/dispatcher/cudaq_realtime.h @@ -0,0 +1,265 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
* + ******************************************************************************/ + +#pragma once + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// Opaque handles +typedef struct cudaq_dispatch_manager_t cudaq_dispatch_manager_t; +typedef struct cudaq_dispatcher_t cudaq_dispatcher_t; + +// Error codes +typedef enum { + CUDAQ_OK = 0, + CUDAQ_ERR_INVALID_ARG = 1, + CUDAQ_ERR_INTERNAL = 2, + CUDAQ_ERR_CUDA = 3 +} cudaq_status_t; + +// Kernel synchronization type +typedef enum { + CUDAQ_KERNEL_REGULAR = 0, + CUDAQ_KERNEL_COOPERATIVE = 1, + CUDAQ_KERNEL_UNIFIED = 2 +} cudaq_kernel_type_t; + +// Dispatch invocation mode +typedef enum { + CUDAQ_DISPATCH_DEVICE_CALL = 0, + CUDAQ_DISPATCH_GRAPH_LAUNCH = 1 +} cudaq_dispatch_mode_t; + +// Payload type identifiers (matching PayloadTypeID in dispatch_kernel_launch.h) +typedef enum { + CUDAQ_TYPE_UINT8 = 0x10, + CUDAQ_TYPE_INT32 = 0x11, + CUDAQ_TYPE_INT64 = 0x12, + CUDAQ_TYPE_FLOAT32 = 0x13, + CUDAQ_TYPE_FLOAT64 = 0x14, + CUDAQ_TYPE_ARRAY_UINT8 = 0x20, + CUDAQ_TYPE_ARRAY_INT32 = 0x21, + CUDAQ_TYPE_ARRAY_FLOAT32 = 0x22, + CUDAQ_TYPE_ARRAY_FLOAT64 = 0x23, + CUDAQ_TYPE_BIT_PACKED = 0x30 +} cudaq_payload_type_t; + +// Type descriptor for arguments/results +typedef struct { + uint8_t type_id; // cudaq_payload_type_t value + uint8_t reserved[3]; // padding + uint32_t size_bytes; // total size in bytes + uint32_t num_elements; // number of elements (for arrays) +} cudaq_type_desc_t; + +// Handler schema describing function signature +typedef struct { + uint8_t num_args; // number of arguments + uint8_t num_results; // number of results + uint16_t reserved; // padding + cudaq_type_desc_t args[8]; // argument descriptors (max 8) + cudaq_type_desc_t results[4]; // result descriptors (max 4) +} cudaq_handler_schema_t; + +// Dispatcher configuration +typedef struct { + int device_id; // GPU device ID (>=0) + uint32_t num_blocks; // grid size + uint32_t threads_per_block; // block size + uint32_t num_slots; 
// ring buffer slots + uint32_t slot_size; // bytes per slot + uint32_t vp_id; // virtual port ID + cudaq_kernel_type_t kernel_type; // regular/cooperative kernel + cudaq_dispatch_mode_t dispatch_mode; // device call/graph launch +} cudaq_dispatcher_config_t; + +// GPU ring buffer pointers (device-visible mapped pointers) +typedef struct { + volatile uint64_t *rx_flags; // device pointer + volatile uint64_t *tx_flags; // device pointer + uint8_t *rx_data; // device pointer to RX data buffer + uint8_t *tx_data; // device pointer to TX data buffer + size_t rx_stride_sz; // size of each RX slot in bytes + size_t tx_stride_sz; // size of each TX slot in bytes +} cudaq_ringbuffer_t; + +// Unified function table entry with schema +typedef struct { + union { + void *device_fn_ptr; // for CUDAQ_DISPATCH_DEVICE_CALL + cudaGraphExec_t graph_exec; // for CUDAQ_DISPATCH_GRAPH_LAUNCH + } handler; + uint32_t function_id; // hash of function name (FNV-1a) + uint8_t dispatch_mode; // cudaq_dispatch_mode_t value + uint8_t reserved[3]; // padding + cudaq_handler_schema_t schema; // function signature schema +} cudaq_function_entry_t; + +// Function table for device-side dispatch +typedef struct { + cudaq_function_entry_t *entries; // device pointer to array of entries + uint32_t count; // number of entries +} cudaq_function_table_t; + +// Host launch function pointer type +typedef void (*cudaq_dispatch_launch_fn_t)( + volatile uint64_t *rx_flags, volatile uint64_t *tx_flags, uint8_t *rx_data, + uint8_t *tx_data, size_t rx_stride_sz, size_t tx_stride_sz, + cudaq_function_entry_t *function_table, size_t func_count, + volatile int *shutdown_flag, uint64_t *stats, size_t num_slots, + uint32_t num_blocks, uint32_t threads_per_block, cudaStream_t stream); + +// Default dispatch kernel launch helpers (from libcudaq-realtime-dispatch.a) +void cudaq_launch_dispatch_kernel_regular( + volatile uint64_t *rx_flags, volatile uint64_t *tx_flags, uint8_t *rx_data, + uint8_t *tx_data, size_t 
rx_stride_sz, size_t tx_stride_sz, + cudaq_function_entry_t *function_table, size_t func_count, + volatile int *shutdown_flag, uint64_t *stats, size_t num_slots, + uint32_t num_blocks, uint32_t threads_per_block, cudaStream_t stream); + +void cudaq_launch_dispatch_kernel_cooperative( + volatile uint64_t *rx_flags, volatile uint64_t *tx_flags, uint8_t *rx_data, + uint8_t *tx_data, size_t rx_stride_sz, size_t tx_stride_sz, + cudaq_function_entry_t *function_table, size_t func_count, + volatile int *shutdown_flag, uint64_t *stats, size_t num_slots, + uint32_t num_blocks, uint32_t threads_per_block, cudaStream_t stream); + +// Unified dispatch launch function pointer type. +// The unified kernel combines RDMA RX, RPC dispatch, and RDMA TX into a single +// kernel, eliminating inter-kernel flag handoff overhead. Transport-specific +// details are passed via an opaque context pointer so the dispatcher API +// remains transport-agnostic. +typedef void (*cudaq_unified_launch_fn_t)( + void *transport_ctx, cudaq_function_entry_t *function_table, + size_t func_count, volatile int *shutdown_flag, uint64_t *stats, + cudaStream_t stream); + +// Graph-enabled dispatch kernels (requires compute capability 9.0+, sm_90+) +// These functions are only available when compiled for sm_90 or higher +#if defined(__CUDACC__) || defined(CUDA_VERSION) + +//============================================================================== +// Graph-Based Dispatch API (Proper Device-Side Graph Launch Support) +//============================================================================== +// +// These functions properly support device-side cudaGraphLaunch() by wrapping +// the dispatch kernel in a graph that is instantiated with +// cudaGraphInstantiateFlagDeviceLaunch. +// +// Usage: +// 1. Allocate a GraphIOContext on the device (cudaMalloc) +// 2. Call cudaq_create_dispatch_graph_regular() to create the graph context +// 3. 
Call cudaq_launch_dispatch_graph() to launch the dispatch kernel +// 4. When done, call cudaq_destroy_dispatch_graph() to cleanup +// +// The dispatch kernel fills the GraphIOContext before each fire-and-forget +// graph launch. The graph kernel reads input from io_ctx->rx_slot, writes +// the RPCResponse to io_ctx->tx_slot, and signals completion by writing +// io_ctx->tx_flag_value to *io_ctx->tx_flag after a __threadfence_system(). + +// Forward declaration for GraphIOContext (defined in dispatch_kernel_launch.h) +struct cudaq_graph_io_context; + +// Opaque handle for graph-based dispatch context +typedef struct cudaq_dispatch_graph_context cudaq_dispatch_graph_context; + +// Create a graph-based dispatch context for the regular kernel type. +// This creates a graph containing the dispatch kernel, instantiates it with +// cudaGraphInstantiateFlagDeviceLaunch, and uploads it to the device. +// +// graph_io_ctx: Device pointer to a GraphIOContext struct. The dispatch +// kernel fills this before each fire-and-forget child graph launch so +// the graph kernel knows where to read input and write output. +// +// Returns cudaSuccess on success, or an error code on failure. +cudaError_t cudaq_create_dispatch_graph_regular( + volatile uint64_t *rx_flags, volatile uint64_t *tx_flags, uint8_t *rx_data, + uint8_t *tx_data, size_t rx_stride_sz, size_t tx_stride_sz, + cudaq_function_entry_t *function_table, size_t func_count, + void *graph_io_ctx, volatile int *shutdown_flag, uint64_t *stats, + size_t num_slots, uint32_t num_blocks, uint32_t threads_per_block, + cudaStream_t stream, cudaq_dispatch_graph_context **out_context); + +// Launch the dispatch graph. The dispatch kernel inside this graph can call +// cudaGraphLaunch() to launch child graphs from device code. +cudaError_t cudaq_launch_dispatch_graph(cudaq_dispatch_graph_context *context, + cudaStream_t stream); + +// Destroy the dispatch graph context and release all resources. 
+cudaError_t cudaq_destroy_dispatch_graph(cudaq_dispatch_graph_context *context); + +#endif + +// Manager lifecycle +cudaq_status_t +cudaq_dispatch_manager_create(cudaq_dispatch_manager_t **out_mgr); +cudaq_status_t cudaq_dispatch_manager_destroy(cudaq_dispatch_manager_t *mgr); + +// Dispatcher lifecycle +cudaq_status_t cudaq_dispatcher_create(cudaq_dispatch_manager_t *mgr, + const cudaq_dispatcher_config_t *config, + cudaq_dispatcher_t **out_dispatcher); +cudaq_status_t cudaq_dispatcher_destroy(cudaq_dispatcher_t *dispatcher); + +// Wiring inputs +cudaq_status_t +cudaq_dispatcher_set_ringbuffer(cudaq_dispatcher_t *dispatcher, + const cudaq_ringbuffer_t *ringbuffer); +cudaq_status_t +cudaq_dispatcher_set_function_table(cudaq_dispatcher_t *dispatcher, + const cudaq_function_table_t *table); +cudaq_status_t cudaq_dispatcher_set_control(cudaq_dispatcher_t *dispatcher, + volatile int *shutdown_flag, + uint64_t *stats); +cudaq_status_t +cudaq_dispatcher_set_launch_fn(cudaq_dispatcher_t *dispatcher, + cudaq_dispatch_launch_fn_t launch_fn); + +// Bundle struct returned by bridge implementations for unified dispatch. +// Contains the bridge-provided launch function and its opaque transport state, +// keeping the dispatcher API transport-agnostic. +typedef struct { + cudaq_unified_launch_fn_t + launch_fn; ///< Bridge-provided unified launch function + void *transport_ctx; ///< Bridge-owned opaque transport state +} cudaq_unified_dispatch_ctx_t; + +// Unified dispatch wiring -- pass a transport-specific launch function and +// an opaque context holding transport handles (e.g. DOCA QP, rkey). +// When set, cudaq_dispatcher_start() will invoke unified_launch_fn instead of +// the 3-kernel launch_fn. Ringbuffer setup is not required for unified mode. 
+cudaq_status_t +cudaq_dispatcher_set_unified_launch(cudaq_dispatcher_t *dispatcher, + cudaq_unified_launch_fn_t unified_launch_fn, + void *transport_ctx); + +// Start/stop +cudaq_status_t cudaq_dispatcher_start(cudaq_dispatcher_t *dispatcher); +cudaq_status_t cudaq_dispatcher_stop(cudaq_dispatcher_t *dispatcher); + +// Stats +cudaq_status_t cudaq_dispatcher_get_processed(cudaq_dispatcher_t *dispatcher, + uint64_t *out_packets); + +// Force eager CUDA module loading for dispatch kernels (occupancy query). +// Call before cudaq_dispatcher_start() to avoid lazy-loading deadlocks. +cudaError_t cudaq_dispatch_kernel_query_occupancy(int *out_blocks, + uint32_t threads_per_block); +cudaError_t +cudaq_dispatch_kernel_cooperative_query_occupancy(int *out_blocks, + uint32_t threads_per_block); + +#ifdef __cplusplus +} +#endif diff --git a/realtime/include/cudaq/realtime/daemon/dispatcher/dispatch_kernel.cuh b/realtime/include/cudaq/realtime/daemon/dispatcher/dispatch_kernel.cuh new file mode 100644 index 00000000000..3b3be6dcdf4 --- /dev/null +++ b/realtime/include/cudaq/realtime/daemon/dispatcher/dispatch_kernel.cuh @@ -0,0 +1,82 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2025 - 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#pragma once + +/// @file dispatch_kernel.cuh +/// @brief Dispatch kernel declarations for external projects. +/// +/// The dispatch kernel implementation now lives in a separate CUDA TU +/// (dispatch_kernel.cu) and is linked into libcudaq-realtime.so. This header +/// provides declarations and inline wrappers for the launch functions. 
+ +#include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" +#include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h" +#include "cudaq/realtime/daemon/dispatcher/kernel_types.h" +#include "cudaq/realtime/daemon/dispatcher/dispatch_modes.h" + +#include +#include + +namespace cudaq::realtime { + +//============================================================================== +// Kernel Launch Function Declarations (with schema-driven function table) +//============================================================================== +// These declarations match the extern "C" functions defined in dispatch_kernel.cu +// and cudaq_realtime.h + +/// @brief Inline wrapper for regular kernel (schema-aware). +inline void launch_dispatch_kernel_regular_inline( + volatile std::uint64_t* rx_flags, + volatile std::uint64_t* tx_flags, + std::uint8_t* rx_data, + std::uint8_t* tx_data, + std::size_t rx_stride_sz, + std::size_t tx_stride_sz, + cudaq_function_entry_t* function_table, + std::size_t func_count, + volatile int* shutdown_flag, + std::uint64_t* stats, + std::size_t num_slots, + std::uint32_t num_blocks, + std::uint32_t threads_per_block, + cudaStream_t stream) { + cudaq_launch_dispatch_kernel_regular( + rx_flags, tx_flags, rx_data, tx_data, + rx_stride_sz, tx_stride_sz, + function_table, func_count, + shutdown_flag, stats, num_slots, + num_blocks, threads_per_block, stream); +} + +/// @brief Inline wrapper for cooperative kernel (schema-aware). 
+inline void launch_dispatch_kernel_cooperative_inline( + volatile std::uint64_t* rx_flags, + volatile std::uint64_t* tx_flags, + std::uint8_t* rx_data, + std::uint8_t* tx_data, + std::size_t rx_stride_sz, + std::size_t tx_stride_sz, + cudaq_function_entry_t* function_table, + std::size_t func_count, + volatile int* shutdown_flag, + std::uint64_t* stats, + std::size_t num_slots, + std::uint32_t num_blocks, + std::uint32_t threads_per_block, + cudaStream_t stream) { + cudaq_launch_dispatch_kernel_cooperative( + rx_flags, tx_flags, rx_data, tx_data, + rx_stride_sz, tx_stride_sz, + function_table, func_count, + shutdown_flag, stats, num_slots, + num_blocks, threads_per_block, stream); +} + +} // namespace cudaq::realtime diff --git a/realtime/include/cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h b/realtime/include/cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h new file mode 100644 index 00000000000..48ad88f9b24 --- /dev/null +++ b/realtime/include/cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h @@ -0,0 +1,137 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2025 - 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#pragma once + +#include +#include + +namespace cudaq::realtime { + +//============================================================================== +// RPC Protocol Structures (Wire Format) +//============================================================================== + +/// @brief RPC request header - wire format for function dispatch (24 bytes). +/// Must be wire-compatible with cuda-quantum RPC protocol. 
struct __attribute__((packed)) RPCHeader {
  std::uint32_t magic;       ///< Magic value to validate message framing
  std::uint32_t function_id; ///< Hash of function name (FNV-1a)
  std::uint32_t arg_len;     ///< Length of argument data in bytes
  std::uint32_t request_id;  ///< Caller-assigned ID echoed in the response
  std::uint64_t
      ptp_timestamp; ///< PTP send timestamp (set by sender; 0 if unused)
};

/// @brief RPC response header - returned to caller (24 bytes).
struct __attribute__((packed)) RPCResponse {
  std::uint32_t magic;         ///< Magic value to validate message framing
  std::int32_t status;         ///< Return status (0 = success)
  std::uint32_t result_len;    ///< Length of result data in bytes
  std::uint32_t request_id;    ///< Echoed from RPCHeader::request_id
  std::uint64_t ptp_timestamp; ///< Echoed from RPCHeader::ptp_timestamp
};

// Compile-time guards: these are wire formats shared with remote peers; any
// size drift silently breaks framing, so fail the build instead.
static_assert(sizeof(RPCHeader) == 24, "RPCHeader must stay 24 bytes");
static_assert(sizeof(RPCResponse) == 24, "RPCResponse must stay 24 bytes");

//==============================================================================
// Device Function Type
//==============================================================================

/// @brief Device RPC function signature.
///
/// The handler reads arguments from the input buffer and writes results
/// directly to the output buffer. The two buffers never overlap, which
/// enables the dispatch kernel to point `output` straight into the TX
/// ring-buffer slot, eliminating a post-handler copy.
///
/// @param input Pointer to argument data (RX buffer, read-only)
/// @param output Pointer to result buffer (TX buffer, write-only)
/// @param arg_len Length of argument data in bytes
/// @param max_result_len Maximum result buffer size in bytes
/// @param result_len Output: actual result length written
/// @return Status code (0 = success)
using DeviceRPCFunction = int (*)(const void *input, void *output,
                                  std::uint32_t arg_len,
                                  std::uint32_t max_result_len,
                                  std::uint32_t *result_len);

//==============================================================================
// Function ID Hashing
//==============================================================================

/// @brief Compute FNV-1a hash of a string (for function_id).
/// @param str Null-terminated string to hash
/// @return 32-bit hash value
constexpr std::uint32_t fnv1a_hash(const char *str) {
  std::uint32_t hash = 2166136261u; // FNV-1a 32-bit offset basis
  while (*str) {
    // Cast to std::uint8_t (the template argument was stripped in the
    // original) so bytes >= 0x80 hash identically on platforms where plain
    // `char` is signed.
    hash ^= static_cast<std::uint8_t>(*str++);
    hash *= 16777619u; // FNV-1a 32-bit prime
  }
  return hash;
}

// RPC framing magic values (ASCII: CUQ?).
constexpr std::uint32_t RPC_MAGIC_REQUEST = 0x43555152;  // 'CUQR'
constexpr std::uint32_t RPC_MAGIC_RESPONSE = 0x43555153; // 'CUQS'

//==============================================================================
// Graph IO Context (for CUDAQ_DISPATCH_GRAPH_LAUNCH)
//==============================================================================

/// @brief IO context passed to graph-launched RPC handlers via pointer
/// indirection.
///
/// The dispatch kernel fills this context before each fire-and-forget graph
/// launch so the graph kernel knows where to read input, where to write the
/// response, and how to signal completion. The graph kernel is responsible
/// for writing the RPCResponse header to `tx_slot` and then setting
/// `*tx_flag = tx_flag_value` after a `__threadfence_system()`.
+struct GraphIOContext { + void *rx_slot; ///< Input: RX slot (RPCHeader + `args`) + std::uint8_t *tx_slot; ///< Output: TX slot for RPCResponse + volatile std::uint64_t *tx_flag; ///< Pointer to TX flag for this slot + std::uint64_t tx_flag_value; ///< Value to write to tx_flag when done + std::size_t tx_stride_sz; ///< TX slot size (for max_result_len) +}; + +//============================================================================== +// Schema-Driven Type System +//============================================================================== + +/// @brief Standardized payload type identifiers for RPC arguments/results. +enum PayloadTypeID : std::uint8_t { + TYPE_UINT8 = 0x10, + TYPE_INT32 = 0x11, + TYPE_INT64 = 0x12, + TYPE_FLOAT32 = 0x13, + TYPE_FLOAT64 = 0x14, + TYPE_ARRAY_UINT8 = 0x20, + TYPE_ARRAY_INT32 = 0x21, + TYPE_ARRAY_FLOAT32 = 0x22, + TYPE_ARRAY_FLOAT64 = 0x23, + TYPE_BIT_PACKED = 0x30 +}; + +/// @brief Type descriptor for a single argument or result. +struct __attribute__((packed)) cudaq_type_desc_t { + std::uint8_t type_id; ///< PayloadTypeID value + std::uint8_t reserved[3]; ///< Padding for alignment + std::uint32_t size_bytes; ///< Total size in bytes + std::uint32_t num_elements; ///< Number of elements (for arrays) +}; + +/// @brief Handler schema describing argument and result types. 
+struct __attribute__((packed)) cudaq_handler_schema_t { + std::uint8_t num_args; ///< Number of arguments + std::uint8_t num_results; ///< Number of results + std::uint16_t reserved; ///< Padding for alignment + cudaq_type_desc_t args[8]; ///< Argument type descriptors (max 8) + cudaq_type_desc_t results[4]; ///< Result type descriptors (max 4) +}; + +} // namespace cudaq::realtime diff --git a/realtime/include/cudaq/realtime/daemon/dispatcher/dispatch_modes.h b/realtime/include/cudaq/realtime/daemon/dispatcher/dispatch_modes.h new file mode 100644 index 00000000000..d34c0b83093 --- /dev/null +++ b/realtime/include/cudaq/realtime/daemon/dispatcher/dispatch_modes.h @@ -0,0 +1,64 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#pragma once + +#include + +namespace cudaq::realtime { + +/// @brief Device call dispatch mode - direct __device__ function call. +/// +/// The handler function is called directly from within the dispatch kernel. +/// This is the simplest and lowest-latency dispatch mode, suitable for +/// lightweight handlers like simple decoders or data transformations. +struct DeviceCallMode { + /// @brief Dispatch to handler via direct device function call. + /// + /// @tparam HandlerFunc Function pointer type + /// @tparam ContextType Context structure type + /// @tparam Args Additional argument types + /// @param handler The __device__ function to call + /// @param ctx Handler context (matrices, dimensions, etc.) + /// @param args Additional arguments + template + __device__ static void dispatch(HandlerFunc handler, ContextType &ctx, + Args... 
args) { + handler(ctx, args...); + } +}; + +/// @brief Graph launch dispatch mode - launches a CUDA graph from device. +/// +/// The handler is a pre-captured CUDA graph that gets launched from the +/// persistent kernel. This is suitable for complex multi-kernel workflows +/// that benefit from graph optimization. +/// +/// NOTE: Requires the graph to be captured and stored in the context at +/// initialization time. The context must contain graph_exec handle. +struct GraphLaunchMode { + /// @brief Dispatch via CUDA graph launch from device. + /// + /// @tparam ContextType Context structure type (must have graph_exec member) + /// @param ctx Handler context containing the graph executable + template + __device__ static void dispatch(ContextType &ctx) { +// Device graph launch requires CUDA 12.0+ and appropriate context setup +// The graph_exec must be a cudaGraphExec_t captured at initialization +#if __CUDA_ARCH__ >= 900 + // cudaGraphLaunch is available from device code on Hopper+ + // Note: This is a placeholder - actual implementation requires + // the graph_exec to be properly set up in the context + if (ctx.graph_exec != nullptr) { + cudaGraphLaunch(ctx.graph_exec, ctx.stream); + } +#endif + } +}; + +} // namespace cudaq::realtime diff --git a/realtime/include/cudaq/realtime/daemon/dispatcher/kernel_types.h b/realtime/include/cudaq/realtime/daemon/dispatcher/kernel_types.h new file mode 100644 index 00000000000..b7efcac1bcc --- /dev/null +++ b/realtime/include/cudaq/realtime/daemon/dispatcher/kernel_types.h @@ -0,0 +1,39 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
* + ******************************************************************************/ + +#pragma once + +#include +#include + +namespace cudaq::realtime { + +/// @brief Regular kernel synchronization using __syncthreads(). +/// +/// Use this for single-block kernels or when only block-level synchronization +/// is needed. Suitable for simple decode handlers that don't require +/// grid-wide coordination. +struct RegularKernel { + /// @brief Not a cooperative kernel -- handler is called by thread 0 only. + static constexpr bool is_cooperative = false; + /// @brief Synchronize threads within a block. + __device__ static void sync() { __syncthreads(); } +}; + +/// @brief Cooperative kernel synchronization using grid.sync(). +/// +/// Use this for multi-block kernels that need grid-wide synchronization, +/// such as complex decoders with data dependencies across blocks. +/// Requires kernel to be launched with cudaLaunchCooperativeKernel. +struct CooperativeKernel { + /// @brief Cooperative kernel -- handler is called by ALL threads. + static constexpr bool is_cooperative = true; + __device__ static void sync() { cooperative_groups::this_grid().sync(); } +}; + +} // namespace cudaq::realtime diff --git a/realtime/include/cudaq/realtime/hololink_bridge_common.h b/realtime/include/cudaq/realtime/hololink_bridge_common.h new file mode 100644 index 00000000000..cc035422fdb --- /dev/null +++ b/realtime/include/cudaq/realtime/hololink_bridge_common.h @@ -0,0 +1,575 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
* + ******************************************************************************/ + +#pragma once + +/// @file hololink_bridge_common.h +/// @brief Header-only bridge skeleton for Hololink-based RPC dispatch. +/// +/// Provides common infrastructure used by all Hololink bridge tools: +/// - Command-line argument parsing for IB device, peer IP, QP, etc. +/// - Hololink transceiver creation and QP connection +/// - Dispatch kernel wiring via the cudaq host API +/// - Main run loop with diagnostics +/// - Graceful shutdown +/// +/// Each concrete bridge tool (generic increment, mock decoder, real decoder) +/// implements a small main() that: +/// 1. Parses any tool-specific arguments +/// 2. Sets up its RPC function table on the GPU +/// 3. Calls bridge_run() with a BridgeConfig struct +/// +/// This header is compiled by a standard C++ compiler; all CUDA and Hololink +/// calls go through C interfaces (cudaq_realtime.h, hololink_wrapper.h). + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "cudaq/realtime/daemon/bridge/hololink/hololink_doca_transport_ctx.h" +#include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" +#include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h" + +// Hololink C wrapper (link against hololink_wrapper_bridge static library) +#include "cudaq/realtime/daemon/bridge/hololink/hololink_wrapper.h" + +// Weak declaration of the Hololink unified dispatch launch function +// (defined in libcudaq-realtime-bridge-hololink.so). Weak so that +// bridge tools that only use the 3-kernel architecture don't need +// to link the bridge-hololink library. 
+extern "C" __attribute__((weak)) void +hololink_launch_unified_dispatch(void *transport_ctx, + cudaq_function_entry_t *function_table, + size_t func_count, volatile int *shutdown_flag, + uint64_t *stats, cudaStream_t stream); + +namespace cudaq::realtime { + +//============================================================================== +// CUDA Error Checking +//============================================================================== + +#ifndef BRIDGE_CUDA_CHECK +#define BRIDGE_CUDA_CHECK(call) \ + do { \ + cudaError_t err = call; \ + if (err != cudaSuccess) { \ + std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ << ": " \ + << cudaGetErrorString(err) << std::endl; \ + return 1; \ + } \ + } while (0) +#endif + +//============================================================================== +// Global Signal Handler +//============================================================================== + +namespace detail { +inline std::atomic &bridge_shutdown_flag() { + static std::atomic flag{false}; + return flag; +} +inline void bridge_signal_handler(int) { bridge_shutdown_flag() = true; } +} // namespace detail + +//============================================================================== +// Bridge Configuration +//============================================================================== + +/// @brief Configuration for the bridge's Hololink and dispatch kernel setup. 
+struct BridgeConfig {
+  // IB / network
+  std::string device = "rocep1s0f0"; ///< IB device name
+  std::string peer_ip = "10.0.0.2";  ///< FPGA/emulator IP
+  uint32_t remote_qp = 0x2;          ///< Remote QP number (FPGA default: 2)
+  int gpu_id = 0;                    ///< GPU device ID
+  int timeout_sec = 60;              ///< Runtime timeout in seconds
+
+  // Ring buffer sizing
+  uint32_t payload_size = 8; ///< RPC payload size in bytes
+  size_t frame_size = 0;     ///< Computed: `sizeof(RPCHeader) + payload_size`
+  size_t page_size =
+      384; ///< Ring buffer slot size (>= frame_size, 128-aligned)
+  unsigned num_pages = 64; ///< Number of ring buffer slots
+
+  // QP exchange (emulator mode)
+  bool exchange_qp = false;  ///< Use QP exchange protocol
+  int exchange_port = 12345; ///< TCP port for QP exchange
+
+  // Forward mode: use Hololink's built-in forward kernel (echo) instead of
+  // separate RX + dispatch + TX kernels. Useful for baseline latency testing.
+  bool forward = false;
+
+  // Unified dispatch mode: single kernel combines RDMA RX, RPC dispatch, and
+  // RDMA TX via direct DOCA verbs calls. Eliminates the inter-kernel flag
+  // handoff overhead of the 3-kernel path. Regular handlers only.
+  bool unified = false;
+
+  // Dispatch kernel config
+  cudaq_function_entry_t *d_function_entries = nullptr; ///< GPU function table
+  size_t func_count = 0;                                ///< Number of entries
+
+  /// @brief Dispatch kernel grid configuration.
+  /// Defaults match the regular (non-cooperative) kernel.
+  cudaq_kernel_type_t kernel_type = CUDAQ_KERNEL_REGULAR;
+  uint32_t num_blocks = 1;
+  uint32_t threads_per_block = 32;
+
+  /// @brief Pointer to the dispatch kernel launch function.
+  /// Default: cudaq_launch_dispatch_kernel_regular
+  cudaq_dispatch_launch_fn_t launch_fn = nullptr;
+
+  /// @brief Optional cleanup callback invoked during shutdown.
+  std::function<void()> cleanup_fn;
+};
+
+//==============================================================================
+// Common Argument Parsing
+//==============================================================================
+
+/// @brief Parse common bridge arguments from the command line.
+///
+/// Recognized flags: `--device=`, `--peer-ip=`, `--remote-qp=`, `--gpu=`,
+/// `--timeout=`, `--page-size=`, `--num-pages=`, `--payload-size=`,
+/// `--exchange-qp`, `--exchange-port=`, `--forward`, `--unified`.
+/// Unknown flags are silently ignored (so tool-specific flags can co-exist).
+///
+/// Also derives `config.frame_size` from the final payload size.
+///
+/// @param argc Argument count
+/// @param argv Argument vector
+/// @param [out] config Bridge configuration to populate
+inline void parse_bridge_args(int argc, char *argv[], BridgeConfig &config) {
+  // Returns true and extracts the text after `prefix` when `arg` starts with
+  // it; avoids the hand-counted substr offsets of the original.
+  auto value_of = [](const std::string &arg, const std::string &prefix,
+                     std::string &out) {
+    if (arg.compare(0, prefix.size(), prefix) != 0)
+      return false;
+    out = arg.substr(prefix.size());
+    return true;
+  };
+
+  for (int i = 1; i < argc; i++) {
+    const std::string arg = argv[i];
+    std::string v;
+    if (value_of(arg, "--device=", v))
+      config.device = v;
+    else if (value_of(arg, "--peer-ip=", v))
+      config.peer_ip = v;
+    else if (value_of(arg, "--remote-qp=", v))
+      config.remote_qp = std::stoul(v, nullptr, 0); // base 0: accepts 0x..
+    else if (value_of(arg, "--gpu=", v))
+      config.gpu_id = std::stoi(v);
+    else if (value_of(arg, "--timeout=", v))
+      config.timeout_sec = std::stoi(v);
+    else if (value_of(arg, "--page-size=", v))
+      config.page_size = std::stoull(v);
+    else if (value_of(arg, "--num-pages=", v))
+      config.num_pages = std::stoul(v);
+    else if (arg == "--exchange-qp")
+      config.exchange_qp = true;
+    else if (value_of(arg, "--exchange-port=", v))
+      config.exchange_port = std::stoi(v);
+    else if (value_of(arg, "--payload-size=", v))
+      config.payload_size = std::stoul(v);
+    else if (arg == "--forward")
+      config.forward = true;
+    else if (arg == "--unified")
+      config.unified = true;
+  }
+
+  config.frame_size = sizeof(cudaq::realtime::RPCHeader) + config.payload_size;
+}
+
+//==============================================================================
+// Bridge Run Function
+//============================================================================== + +/// @brief Run the Hololink bridge with the given configuration. +/// +/// This function: +/// 1. Initialises CUDA on the configured GPU +/// 2. Creates the Hololink transceiver and connects the QP +/// 3. Forces eager CUDA module loading +/// 4. Wires the cudaq dispatch kernel to the Hololink ring buffers +/// 5. Launches Hololink RX+TX kernels +/// 6. Runs the main diagnostic loop until timeout or signal +/// 7. Performs orderly shutdown +/// +/// The caller must set config.d_function_entries and config.func_count +/// before calling this function. +/// +/// @param config Fully-populated bridge configuration +/// @return 0 on success, non-zero on error +inline int bridge_run(BridgeConfig &config) { + signal(SIGINT, detail::bridge_signal_handler); + signal(SIGTERM, detail::bridge_signal_handler); + + auto &g_shutdown = detail::bridge_shutdown_flag(); + + //============================================================================ + // [1] Initialize CUDA + //============================================================================ + std::cout << "\n[1/5] Initializing CUDA..." << std::endl; + BRIDGE_CUDA_CHECK(cudaSetDevice(config.gpu_id)); + + cudaDeviceProp prop; + BRIDGE_CUDA_CHECK(cudaGetDeviceProperties(&prop, config.gpu_id)); + std::cout << " GPU: " << prop.name << std::endl; + + //============================================================================ + // [2] Create Hololink transceiver + //============================================================================ + std::cout << "\n[2/5] Creating Hololink transceiver..." 
<< std::endl; + + // Ensure page_size >= frame_size + if (config.page_size < config.frame_size) { + std::cout << " Adjusting page_size from " << config.page_size << " to " + << config.frame_size << " to fit frame" << std::endl; + config.page_size = config.frame_size; + } + + std::cout << " Frame size: " << config.frame_size << " bytes" << std::endl; + std::cout << " Page size: " << config.page_size << " bytes" << std::endl; + std::cout << " Num pages: " << config.num_pages << std::endl; + + // Unified mode uses Hololink's forward/symmetric ring layout but doesn't + // run any Hololink kernels -- the unified dispatch kernel handles RX+TX. + bool use_forward_ring = config.forward || config.unified; + + hololink_transceiver_t transceiver = hololink_create_transceiver( + config.device.c_str(), 1, // ib_port + config.remote_qp, // remote QP number (FPGA default: 2) + config.gpu_id, // DOCA GPU device ID + config.frame_size, config.page_size, config.num_pages, + config.peer_ip.c_str(), // immediate connection + use_forward_ring ? 1 : 0, // forward (symmetric ring layout) + use_forward_ring ? 0 : 1, // rx_only + use_forward_ring ? 0 : 1 // tx_only + ); + + if (!transceiver) { + std::cerr << "ERROR: Failed to create Hololink transceiver" << std::endl; + return 1; + } + + std::cout << " Connecting to remote QP 0x" << std::hex << config.remote_qp + << std::dec << " at " << config.peer_ip << "..." << std::endl; + + if (!hololink_start(transceiver)) { + std::cerr << "ERROR: Failed to start Hololink transceiver" << std::endl; + hololink_destroy_transceiver(transceiver); + return 1; + } + + // Hololink start() pops the CUDA context via cuCtxPopCurrent; restore it. 
+ BRIDGE_CUDA_CHECK(cudaSetDevice(config.gpu_id)); + + std::cout << " QP connected to remote peer" << std::endl; + + uint32_t our_qp = hololink_get_qp_number(transceiver); + uint32_t our_rkey = hololink_get_rkey(transceiver); + uint64_t our_buffer = hololink_get_buffer_addr(transceiver); + + std::cout << " QP Number: 0x" << std::hex << our_qp << std::dec << std::endl; + std::cout << " RKey: " << our_rkey << std::endl; + std::cout << " Buffer Addr: 0x" << std::hex << our_buffer << std::dec + << std::endl; + + // Ring buffer pointers + uint8_t *rx_ring_data = + reinterpret_cast(hololink_get_rx_ring_data_addr(transceiver)); + uint64_t *rx_ring_flag = hololink_get_rx_ring_flag_addr(transceiver); + uint8_t *tx_ring_data = + reinterpret_cast(hololink_get_tx_ring_data_addr(transceiver)); + uint64_t *tx_ring_flag = hololink_get_tx_ring_flag_addr(transceiver); + + if (!rx_ring_data || !rx_ring_flag || !tx_ring_data || !tx_ring_flag) { + std::cerr << "ERROR: Failed to get ring buffer pointers" << std::endl; + hololink_destroy_transceiver(transceiver); + return 1; + } + + //============================================================================ + // [3] Force eager CUDA module loading + //============================================================================ + std::cout << "\n[3/5] Forcing CUDA module loading..." << std::endl; + + // Hololink kernels are already warmed up by start() (which does warmup + // launches for prepare_receive_send, forward, rx_only, tx_only). + // The dispatch kernel occupancy query below handles our own kernels. 
+ + // Dispatch kernel resources (unused in forward mode) + volatile int *shutdown_flag = nullptr; + volatile int *d_shutdown_flag = nullptr; + uint64_t *d_stats = nullptr; + cudaq_dispatch_manager_t *manager = nullptr; + cudaq_dispatcher_t *dispatcher = nullptr; + + // Transport context for unified mode (must outlive the dispatcher) + hololink_doca_transport_ctx unified_ctx{}; + + if (!config.forward) { + if (!config.unified) { + int dispatch_blocks = 0; + cudaError_t occ_err; + if (config.kernel_type == CUDAQ_KERNEL_COOPERATIVE) { + occ_err = cudaq_dispatch_kernel_cooperative_query_occupancy( + &dispatch_blocks, config.threads_per_block); + } else { + occ_err = cudaq_dispatch_kernel_query_occupancy(&dispatch_blocks, 1); + } + if (occ_err != cudaSuccess) { + std::cerr << "ERROR: Dispatch kernel occupancy query failed: " + << cudaGetErrorString(occ_err) << std::endl; + return 1; + } + std::cout << " Dispatch kernel occupancy: " << dispatch_blocks + << " blocks/SM" << std::endl; + } + + //========================================================================== + // [4] Wire dispatch kernel + //========================================================================== + std::cout << "\n[4/5] Wiring dispatch kernel (" + << (config.unified ? "unified" : "3-kernel") << ")..." 
+ << std::endl; + + void *tmp_shutdown = nullptr; + BRIDGE_CUDA_CHECK( + cudaHostAlloc(&tmp_shutdown, sizeof(int), cudaHostAllocMapped)); + shutdown_flag = static_cast(tmp_shutdown); + void *tmp_d_shutdown = nullptr; + BRIDGE_CUDA_CHECK( + cudaHostGetDevicePointer(&tmp_d_shutdown, tmp_shutdown, 0)); + d_shutdown_flag = static_cast(tmp_d_shutdown); + *shutdown_flag = 0; + int zero = 0; + BRIDGE_CUDA_CHECK(cudaMemcpy(const_cast(d_shutdown_flag), &zero, + sizeof(int), cudaMemcpyHostToDevice)); + + BRIDGE_CUDA_CHECK(cudaMalloc(&d_stats, sizeof(uint64_t))); + BRIDGE_CUDA_CHECK(cudaMemset(d_stats, 0, sizeof(uint64_t))); + + if (cudaq_dispatch_manager_create(&manager) != CUDAQ_OK) { + std::cerr << "ERROR: Failed to create dispatch manager" << std::endl; + return 1; + } + + cudaq_dispatcher_config_t dconfig{}; + dconfig.device_id = config.gpu_id; + dconfig.vp_id = 0; + dconfig.dispatch_mode = CUDAQ_DISPATCH_DEVICE_CALL; + + if (config.unified) { + dconfig.kernel_type = CUDAQ_KERNEL_UNIFIED; + dconfig.num_blocks = 1; + dconfig.threads_per_block = 1; + dconfig.num_slots = 0; + dconfig.slot_size = 0; + } else { + dconfig.kernel_type = config.kernel_type; + dconfig.num_blocks = config.num_blocks; + dconfig.threads_per_block = config.threads_per_block; + dconfig.num_slots = static_cast(config.num_pages); + dconfig.slot_size = static_cast(config.page_size); + } + + if (cudaq_dispatcher_create(manager, &dconfig, &dispatcher) != CUDAQ_OK) { + std::cerr << "ERROR: Failed to create dispatcher" << std::endl; + return 1; + } + + if (config.unified) { + // Pack DOCA transport handles into the opaque context + unified_ctx.gpu_dev_qp = hololink_get_gpu_dev_qp(transceiver); + unified_ctx.rx_ring_data = rx_ring_data; + unified_ctx.rx_ring_stride_sz = hololink_get_page_size(transceiver); + unified_ctx.rx_ring_mkey = htonl(hololink_get_rkey(transceiver)); + unified_ctx.rx_ring_stride_num = hololink_get_num_pages(transceiver); + unified_ctx.frame_size = config.frame_size; + + if 
(cudaq_dispatcher_set_unified_launch(dispatcher, + &hololink_launch_unified_dispatch, + &unified_ctx) != CUDAQ_OK) { + std::cerr << "ERROR: Failed to set unified launch function" + << std::endl; + return 1; + } + } else { + cudaq_ringbuffer_t ringbuffer{}; + ringbuffer.rx_flags = reinterpret_cast(rx_ring_flag); + ringbuffer.tx_flags = reinterpret_cast(tx_ring_flag); + ringbuffer.rx_data = rx_ring_data; + ringbuffer.tx_data = tx_ring_data; + ringbuffer.rx_stride_sz = config.page_size; + ringbuffer.tx_stride_sz = config.page_size; + + if (cudaq_dispatcher_set_ringbuffer(dispatcher, &ringbuffer) != + CUDAQ_OK) { + std::cerr << "ERROR: Failed to set ringbuffer" << std::endl; + return 1; + } + + cudaq_dispatch_launch_fn_t launch_fn = config.launch_fn; + if (!launch_fn) { + launch_fn = &cudaq_launch_dispatch_kernel_regular; + } + if (cudaq_dispatcher_set_launch_fn(dispatcher, launch_fn) != CUDAQ_OK) { + std::cerr << "ERROR: Failed to set launch function" << std::endl; + return 1; + } + } + + cudaq_function_table_t table{}; + table.entries = config.d_function_entries; + table.count = config.func_count; + if (cudaq_dispatcher_set_function_table(dispatcher, &table) != CUDAQ_OK) { + std::cerr << "ERROR: Failed to set function table" << std::endl; + return 1; + } + + if (cudaq_dispatcher_set_control(dispatcher, d_shutdown_flag, d_stats) != + CUDAQ_OK) { + std::cerr << "ERROR: Failed to set control" << std::endl; + return 1; + } + + if (cudaq_dispatcher_start(dispatcher) != CUDAQ_OK) { + std::cerr << "ERROR: Failed to start dispatcher" << std::endl; + return 1; + } + std::cout << " Dispatch kernel launched" << std::endl; + } else { + std::cout << "\n[4/5] Forward mode -- skipping dispatch kernel" + << std::endl; + } + + //============================================================================ + // [5] Launch Hololink kernels (if needed) and run + //============================================================================ + std::thread hololink_thread; + + if 
(config.unified) { + std::cout << "\n[5/5] Unified mode -- Hololink kernels not needed" + << std::endl; + } else { + std::cout << "\n[5/5] Launching Hololink kernels..." << std::endl; + hololink_thread = std::thread( + [transceiver]() { hololink_blocking_monitor(transceiver); }); + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + std::cout << " Hololink RX+TX kernels started" << std::endl; + } + + // Print QP info for FPGA stimulus tool + std::cout << "\n=== Bridge Ready ===" << std::endl; + std::cout << " QP Number: 0x" << std::hex << our_qp << std::dec << std::endl; + std::cout << " RKey: " << our_rkey << std::endl; + std::cout << " Buffer Addr: 0x" << std::hex << our_buffer << std::dec + << std::endl; + std::cout << "\nWaiting for data (Ctrl+C to stop, timeout=" + << config.timeout_sec << "s)..." << std::endl; + + //============================================================================ + // Main run loop + //============================================================================ + cudaStream_t diag_stream = nullptr; + BRIDGE_CUDA_CHECK( + cudaStreamCreateWithFlags(&diag_stream, cudaStreamNonBlocking)); + + auto start_time = std::chrono::steady_clock::now(); + uint64_t last_processed = 0; + + while (!g_shutdown) { + auto elapsed = std::chrono::duration_cast( + std::chrono::steady_clock::now() - start_time) + .count(); + if (elapsed > config.timeout_sec) { + std::cout << "\nTimeout reached (" << config.timeout_sec << "s)" + << std::endl; + break; + } + + // Progress report every 5 seconds + if (!config.forward && d_stats && elapsed > 0 && elapsed % 5 == 0) { + uint64_t processed = 0; + cudaMemcpyAsync(&processed, d_stats, sizeof(uint64_t), + cudaMemcpyDeviceToHost, diag_stream); + cudaStreamSynchronize(diag_stream); + if (processed != last_processed) { + std::cout << " [" << elapsed << "s] Processed " << processed + << " packets" << std::endl; + last_processed = processed; + } + } + + 
std::this_thread::sleep_for(std::chrono::milliseconds(500)); + } + + //============================================================================ + // Shutdown + //============================================================================ + std::cout << "\n=== Shutting down ===" << std::endl; + + if (diag_stream) { + cudaStreamDestroy(diag_stream); + diag_stream = nullptr; + } + + if (!config.forward) { + *shutdown_flag = 1; + __sync_synchronize(); + cudaq_dispatcher_stop(dispatcher); + + uint64_t total_processed = 0; + cudaq_dispatcher_get_processed(dispatcher, &total_processed); + std::cout << " Total packets processed (dispatch RX): " << total_processed + << std::endl; + } + + hololink_close(transceiver); + if (hololink_thread.joinable()) + hololink_thread.join(); + + if (dispatcher) + cudaq_dispatcher_destroy(dispatcher); + if (manager) + cudaq_dispatch_manager_destroy(manager); + hololink_destroy_transceiver(transceiver); + + if (shutdown_flag) + cudaFreeHost(const_cast(shutdown_flag)); + if (d_stats) + cudaFree(d_stats); + + // Call tool-specific cleanup + if (config.cleanup_fn) + config.cleanup_fn(); + + std::cout << "\n*** Bridge shutdown complete ***" << std::endl; + return 0; +} + +/// @brief Default dispatch kernel launch wrapper. +/// +/// Matches cudaq_dispatch_launch_fn_t signature; delegates to +/// cudaq_launch_dispatch_kernel_regular from libcudaq-realtime. 
+inline void bridge_launch_dispatch_kernel( + volatile std::uint64_t *rx_flags, volatile std::uint64_t *tx_flags, + std::uint8_t *rx_data, std::uint8_t *tx_data, std::size_t rx_stride_sz, + std::size_t tx_stride_sz, cudaq_function_entry_t *function_table, + std::size_t func_count, volatile int *shutdown_flag, std::uint64_t *stats, + std::size_t num_slots, std::uint32_t num_blocks, + std::uint32_t threads_per_block, cudaStream_t stream) { + cudaq_launch_dispatch_kernel_regular( + rx_flags, tx_flags, rx_data, tx_data, rx_stride_sz, tx_stride_sz, + function_table, func_count, shutdown_flag, stats, num_slots, num_blocks, + threads_per_block, stream); +} + +} // namespace cudaq::realtime diff --git a/realtime/lib/CMakeLists.txt b/realtime/lib/CMakeLists.txt new file mode 100644 index 00000000000..7cd05df051a --- /dev/null +++ b/realtime/lib/CMakeLists.txt @@ -0,0 +1,17 @@ +# ============================================================================ # +# Copyright (c) 2024 - 2025 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. # +# ============================================================================ # + +include(GNUInstallDirs) + +install(DIRECTORY ${CUDAQ_REALTIME_INCLUDE_DIR}/cudaq + COMPONENT cudaq-realtime-headers + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} + FILES_MATCHING PATTERN "*.h" PATTERN "*.cuh" +) + +add_subdirectory(daemon) diff --git a/realtime/lib/daemon/CMakeLists.txt b/realtime/lib/daemon/CMakeLists.txt new file mode 100644 index 00000000000..855afddce26 --- /dev/null +++ b/realtime/lib/daemon/CMakeLists.txt @@ -0,0 +1,91 @@ +# ============================================================================ # +# Copyright (c) 2025 NVIDIA Corporation & Affiliates. # +# All rights reserved. 
# +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. # +# ============================================================================ # + +# ============================================================================== +# Shared library for external consumers (libcudaq-realtime.so) +# ============================================================================== +# This shared library exports a C-compatible host API for wiring dispatchers +# and includes the GPU dispatch kernel device code. + +if(CUDA_FOUND) + set(CMAKE_INSTALL_LIBDIR lib) + set(CUDAQ_REALTIME_SOURCES + dispatcher/cudaq_realtime_api.cpp + bridge/bridge_interface_api.cpp + ) + + add_library(cudaq-realtime SHARED ${CUDAQ_REALTIME_SOURCES}) + + target_include_directories(cudaq-realtime + PUBLIC + $ + $ + ) + + target_link_libraries(cudaq-realtime + PUBLIC + CUDA::cudart_static + ) + + target_compile_definitions(cudaq-realtime PUBLIC CUDAQ_REALTIME_HAVE_CUDA) + target_link_options(cudaq-realtime PRIVATE + "-Wl,--version-script,${CMAKE_CURRENT_SOURCE_DIR}/cudaq-realtime.map") + set_target_properties(cudaq-realtime PROPERTIES LINK_FLAGS_RELEASE "-s") + + set_target_properties(cudaq-realtime PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + POSITION_INDEPENDENT_CODE ON + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib + ) + + install(TARGETS cudaq-realtime + COMPONENT realtime-lib + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ) + + add_library(cudaq-realtime-dispatch STATIC + dispatcher/dispatch_kernel.cu + ) + set_target_properties(cudaq-realtime-dispatch + PROPERTIES + LINK_FLAGS_RELEASE "-Wl,--exclude-libs=ALL") + set_target_properties(cudaq-realtime-dispatch PROPERTIES CXX_VISIBILITY_PRESET "hidden" CUDA_VISIBILITY_PRESET "hidden") + + target_include_directories(cudaq-realtime-dispatch + PUBLIC + $ + $ + ) + + # Link CUDA device runtime library (required for device-side API calls like 
cudaGraphLaunch) + find_library(CUDADEVRT_LIBRARY cudadevrt + HINTS ${CUDAToolkit_LIBRARY_DIR} + REQUIRED + ) + + target_link_libraries(cudaq-realtime-dispatch + PUBLIC + CUDA::cudart_static + ${CUDADEVRT_LIBRARY} + ) + + set_target_properties(cudaq-realtime-dispatch PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + POSITION_INDEPENDENT_CODE ON + ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib + ) + + install(TARGETS cudaq-realtime-dispatch + COMPONENT realtime-lib + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + ) + + if (CUDAQ_REALTIME_ENABLE_HOLOLINK_TOOLS) + add_subdirectory(bridge/hololink) + endif() +endif() diff --git a/realtime/lib/daemon/bridge/bridge_interface_api.cpp b/realtime/lib/daemon/bridge/bridge_interface_api.cpp new file mode 100644 index 00000000000..083d63aa434 --- /dev/null +++ b/realtime/lib/daemon/bridge/bridge_interface_api.cpp @@ -0,0 +1,190 @@ +/******************************************************************************* + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +/// @file bridge_interface_api.cpp +/// @brief API implementation for transport layer bridge interface. +/// +/// This file provides the implementation of the API functions declared in +/// bridge_interface.h. It manages the loading of transport provider shared +/// libraries, retrieval of their interface structs, and dispatch of API calls +/// to the appropriate provider implementation based on the bridge handle. 
+ +#include "cudaq/realtime/daemon/bridge/bridge_interface.h" +#include +#include +#include +#include +#include +namespace { +std::unordered_map + provider_interface_map; + +std::unordered_map + bridge_handle_interface_map; + +// Mutex to protect access to global maps (provider_interface_map and +// bridge_handle_interface_map) for thread safety. +std::shared_mutex bridge_interface_mutex; + +/// @brief Path to the built-in Hololink bridge library. This is used when the +/// provider is CUDAQ_PROVIDER_HOLOLINK to load the Hololink implementation of +/// the bridge interface. The library must be present at the load path (e.g., +/// LD_LIBRARY_PATH) for the built-in provider to work. +const char *Hololink_Bridge_Lib = "libcudaq-realtime-bridge-hololink.so"; +} // namespace + +cudaq_status_t +cudaq_bridge_create(cudaq_realtime_bridge_handle_t *out_bridge_handle, + cudaq_realtime_transport_provider_t provider, int argc, + char **argv) { + // For create, hold an unique lock. + std::unique_lock lock(bridge_interface_mutex); + + const auto it = provider_interface_map.find(provider); + if (it != provider_interface_map.end()) { + auto *bridge_interface = it->second; + return bridge_interface->create(out_bridge_handle, argc, argv); + } + + const std::string lib_name = [&]() { + if (provider == CUDAQ_PROVIDER_HOLOLINK) { + return Hololink_Bridge_Lib; + } else { + const char *bridgeLibPath = std::getenv("CUDAQ_REALTIME_BRIDGE_LIB"); + if (!bridgeLibPath) { + std::cerr << "ERROR: CUDAQ_REALTIME_BRIDGE_LIB environment variable " + "not set for EXTERNAL provider" + << std::endl; + return ""; + } + return bridgeLibPath; + } + }(); + + if (lib_name.empty()) + return CUDAQ_ERR_INVALID_ARG; + dlerror(); // reset errors + + if (!out_bridge_handle) + return CUDAQ_ERR_INVALID_ARG; + + void *lib_handle = dlopen(lib_name.c_str(), RTLD_NOW); + + if (!lib_handle) { + std::cerr << "ERROR: Failed to load bridge library '" << lib_name + << "': " << dlerror() << std::endl; + return 
CUDAQ_ERR_INTERNAL; + } + using GetInterfaceFunction = cudaq_realtime_bridge_interface_t *(*)(); + GetInterfaceFunction fcn = (GetInterfaceFunction)(intptr_t)dlsym( + lib_handle, "cudaq_realtime_get_bridge_interface"); + if (!fcn) { + std::cerr << "ERROR: Failed to interface getter from '" << lib_name + << "': " << dlerror() << std::endl; + return CUDAQ_ERR_INTERNAL; + } + + cudaq_realtime_bridge_interface_t *bridge_interface = fcn(); + + if (!bridge_interface) { + std::cerr << "ERROR: Bridge interface getter returned null from '" + << lib_name << "'" << std::endl; + return CUDAQ_ERR_INTERNAL; + } + provider_interface_map[provider] = bridge_interface; + + // Check interface version compatibility + if (bridge_interface->version != CUDAQ_REALTIME_BRIDGE_INTERFACE_VERSION) { + std::cerr << "ERROR: Bridge interface version mismatch for '" << lib_name + << "': expected " << CUDAQ_REALTIME_BRIDGE_INTERFACE_VERSION + << ", got " << bridge_interface->version << std::endl; + return CUDAQ_ERR_INTERNAL; + } + // Run the create callback to allow the bridge to perform any initial setup + const auto status = bridge_interface->create(out_bridge_handle, argc, argv); + if (status == CUDAQ_OK) { + bridge_handle_interface_map[*out_bridge_handle] = bridge_interface; + } + return status; +} + +cudaq_status_t cudaq_bridge_destroy(cudaq_realtime_bridge_handle_t bridge) { + // For destroy, hold an unique lock. + std::unique_lock lock(bridge_interface_mutex); + + const auto it = bridge_handle_interface_map.find(bridge); + if (it == bridge_handle_interface_map.end()) { + std::cerr << "ERROR: Invalid bridge handle in destroy" << std::endl; + return CUDAQ_ERR_INVALID_ARG; + } + auto *bridge_interface = it->second; + const auto status = bridge_interface->destroy(bridge); + if (status == CUDAQ_OK) { + bridge_handle_interface_map.erase(it); + } + return status; +} + +// Retrieve the transport context information for the given bridge. 
+cudaq_status_t cudaq_bridge_get_transport_context( + cudaq_realtime_bridge_handle_t bridge, + cudaq_realtime_transport_context_t context_type, void *out_context) { + // Hold a shared lock since this is a read-only operation on the global maps. + std::shared_lock lock(bridge_interface_mutex); + + const auto it = bridge_handle_interface_map.find(bridge); + if (it == bridge_handle_interface_map.end()) { + std::cerr << "ERROR: Invalid bridge handle in get_transport_context" + << std::endl; + return CUDAQ_ERR_INVALID_ARG; + } + auto *bridge_interface = it->second; + return bridge_interface->get_transport_context(bridge, context_type, + out_context); +} + +cudaq_status_t cudaq_bridge_connect(cudaq_realtime_bridge_handle_t bridge) { + // Hold a shared lock since this is a read-only operation on the global maps. + std::shared_lock lock(bridge_interface_mutex); + + const auto it = bridge_handle_interface_map.find(bridge); + if (it == bridge_handle_interface_map.end()) { + std::cerr << "ERROR: Invalid bridge handle in connect" << std::endl; + return CUDAQ_ERR_INVALID_ARG; + } + auto *bridge_interface = it->second; + return bridge_interface->connect(bridge); +} + +cudaq_status_t cudaq_bridge_launch(cudaq_realtime_bridge_handle_t bridge) { + // Hold a shared lock since this is a read-only operation on the global maps. + std::shared_lock lock(bridge_interface_mutex); + + const auto it = bridge_handle_interface_map.find(bridge); + if (it == bridge_handle_interface_map.end()) { + std::cerr << "ERROR: Invalid bridge handle in launch" << std::endl; + return CUDAQ_ERR_INVALID_ARG; + } + auto *bridge_interface = it->second; + return bridge_interface->launch(bridge); +} + +cudaq_status_t cudaq_bridge_disconnect(cudaq_realtime_bridge_handle_t bridge) { + // Hold a shared lock since this is a read-only operation on the global maps. 
+ std::shared_lock lock(bridge_interface_mutex); + + const auto it = bridge_handle_interface_map.find(bridge); + if (it == bridge_handle_interface_map.end()) { + std::cerr << "ERROR: Invalid bridge handle in disconnect" << std::endl; + return CUDAQ_ERR_INVALID_ARG; + } + auto *bridge_interface = it->second; + return bridge_interface->disconnect(bridge); +} diff --git a/realtime/lib/daemon/bridge/hololink/CMakeLists.txt b/realtime/lib/daemon/bridge/hololink/CMakeLists.txt new file mode 100644 index 00000000000..456cfe82062 --- /dev/null +++ b/realtime/lib/daemon/bridge/hololink/CMakeLists.txt @@ -0,0 +1,223 @@ +# ============================================================================ # +# Copyright (c) 2026 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. # +# ============================================================================ # + +# Hololink bridge implementation +# ============================================================================== +# These targets are gated by CUDAQ_REALTIME_ENABLE_HOLOLINK_TOOLS and require +# a pre-built hololink (holoscan-sensor-bridge) with DOCA support. 
+ + +if (NOT HOLOSCAN_SENSOR_BRIDGE_SOURCE_DIR) + message(FATAL_ERROR + "HOLOSCAN_SENSOR_BRIDGE_SOURCE_DIR must be set when building hololink tools.") +endif() +if (NOT HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR) + message(FATAL_ERROR + "HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR must be set when building hololink tools.") +endif() + +find_package(Threads REQUIRED) +find_package(CUDAToolkit REQUIRED) + +# --------------------------------------------------------------------------- # +# Find Hololink core library +# --------------------------------------------------------------------------- # + +find_library(HOLOLINK_CORE_LIB + NAMES hololink_core + PATHS + "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}" + "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/src/hololink/core" + "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/lib" + NO_DEFAULT_PATH) + +if (NOT HOLOLINK_CORE_LIB) + message(FATAL_ERROR + "Could not find hololink_core library under ${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}.") +endif() + +# --------------------------------------------------------------------------- # +# Find GPU RoCE Transceiver library +# --------------------------------------------------------------------------- # + +find_library(GPU_ROCE_TRANSCEIVER_LIB + NAMES gpu_roce_transceiver + PATHS + "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}" + "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/src/hololink/operators/gpu_roce_transceiver" + "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/lib" + NO_DEFAULT_PATH) + +if (NOT GPU_ROCE_TRANSCEIVER_LIB) + message(WARNING + "Could not find gpu_roce_transceiver library. 
" + "hololink_bridge will not be built.") +endif() + +# --------------------------------------------------------------------------- # +# Find transitive Hololink libraries +# --------------------------------------------------------------------------- # + +find_library(HOLOLINK_COMMON_LIB + NAMES hololink + PATHS + "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}" + "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/src/hololink/common" + "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/lib" + NO_DEFAULT_PATH) + +find_library(ROCE_RECEIVER_LIB + NAMES roce_receiver + PATHS + "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}" + "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/src/hololink/operators/roce_receiver" + "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/lib" + NO_DEFAULT_PATH) + +find_library(BASE_RECEIVER_OP_LIB + NAMES base_receiver_op + PATHS + "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}" + "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/src/hololink/operators" + "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/lib" + NO_DEFAULT_PATH) + +find_library(IBVERBS_LIB NAMES ibverbs) + +# --------------------------------------------------------------------------- # +# Find DOCA libraries +# --------------------------------------------------------------------------- # + +set(DOCA_PATH "/opt/mellanox/doca") + +if (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)") + set(DOCA_LIB_DIR "${DOCA_PATH}/lib/x86_64-linux-gnu") +elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "(aarch64)|(arm64)") + set(DOCA_LIB_DIR "${DOCA_PATH}/lib/aarch64-linux-gnu") +else() + set(DOCA_LIB_DIR "${DOCA_PATH}/lib") +endif() + +find_path(DOCA_INCLUDE_DIR doca_verbs.h + PATHS ${DOCA_PATH}/include + NO_DEFAULT_PATH) + +# RHEL may have DOCA libraries under ${DOCA_PATH}/lib64 +find_library(DOCA_VERBS_LIB doca_verbs + PATHS ${DOCA_LIB_DIR} ${DOCA_PATH}/lib/ ${DOCA_PATH}/lib64/ + NO_DEFAULT_PATH) + +find_library(DOCA_GPUNETIO_LIB doca_gpunetio + PATHS ${DOCA_LIB_DIR} ${DOCA_PATH}/lib/ ${DOCA_PATH}/lib64/ + NO_DEFAULT_PATH) + +find_library(DOCA_COMMON_LIB doca_common + PATHS ${DOCA_LIB_DIR} 
${DOCA_PATH}/lib/ ${DOCA_PATH}/lib64/ + NO_DEFAULT_PATH) + +# --------------------------------------------------------------------------- # +# Find Holoscan (required by gpu_roce_transceiver -> holoscan::core) +# --------------------------------------------------------------------------- # + +find_package(holoscan QUIET) + +# --------------------------------------------------------------------------- # +# Find fmt (transitive dependency of hololink logging) +# --------------------------------------------------------------------------- # + +find_path(FMT_INCLUDE_DIR + NAMES fmt/format.h + PATHS /opt/nvidia/holoscan /usr/local/cudaq /usr /usr/local + PATH_SUFFIXES include + NO_DEFAULT_PATH) + +# =========================================================================== # +# hololink_bridge library +# =========================================================================== # + +if (GPU_ROCE_TRANSCEIVER_LIB AND + DOCA_INCLUDE_DIR AND DOCA_VERBS_LIB AND DOCA_COMMON_LIB AND + DOCA_GPUNETIO_LIB) + + message(STATUS "Building hololink_bridge interface library with the following dependencies:") + message(STATUS " GPU RoCE Transceiver: ${GPU_ROCE_TRANSCEIVER_LIB}") + + find_path(DOCA_GPU_INCLUDE_DIR doca_gpunetio_dev_verbs_common.cuh + HINTS /opt/mellanox/doca/include + ) + + find_library(CUDADEVRT_LIBRARY cudadevrt + HINTS ${CUDAToolkit_LIBRARY_DIR} + REQUIRED + ) + + # Library implementing the hololink bridge interface (includes unified kernel) + add_library(cudaq-realtime-bridge-hololink SHARED + bridge_impl.cpp + hololink_wrapper.cpp + unified_dispatch_kernel.cu) + + set_target_properties(cudaq-realtime-bridge-hololink PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + POSITION_INDEPENDENT_CODE ON) + + target_include_directories(cudaq-realtime-bridge-hololink + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + ${CUDAQ_REALTIME_INCLUDE_DIR} + "${HOLOSCAN_SENSOR_BRIDGE_SOURCE_DIR}/src" + "${HOLOSCAN_SENSOR_BRIDGE_SOURCE_DIR}/src/hololink/operators/gpu_roce_transceiver" + 
${DOCA_INCLUDE_DIR} + ${DOCA_GPU_INCLUDE_DIR} + ${FMT_INCLUDE_DIR}) + + target_link_libraries(cudaq-realtime-bridge-hololink + PRIVATE + hololink_wrapper_generic + ${GPU_ROCE_TRANSCEIVER_LIB} + ${ROCE_RECEIVER_LIB} + ${BASE_RECEIVER_OP_LIB} + ${HOLOLINK_CORE_LIB} + ${HOLOLINK_COMMON_LIB} + cudaq-realtime + CUDA::cudart + CUDA::cuda_driver + ${CUDADEVRT_LIBRARY} + ${DOCA_VERBS_LIB} + ${DOCA_GPUNETIO_LIB} + ${DOCA_COMMON_LIB} + ${IBVERBS_LIB} + Threads::Threads + ${CMAKE_DL_LIBS}) + + target_link_options(cudaq-realtime-bridge-hololink PRIVATE + "LINKER:--allow-multiple-definition") + + if (holoscan_FOUND) + target_link_libraries(cudaq-realtime-bridge-hololink PRIVATE holoscan::core) + target_link_libraries(cudaq-realtime-bridge-hololink PRIVATE holoscan::core) + endif() + + # Set RPATH for shared libraries + set_target_properties(cudaq-realtime-bridge-hololink PROPERTIES + BUILD_RPATH "${DOCA_LIB_DIR}" + INSTALL_RPATH "${DOCA_LIB_DIR}") + # Build output directory (/lib) + set_target_properties(cudaq-realtime-bridge-hololink PROPERTIES + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) + install(TARGETS cudaq-realtime-bridge-hololink DESTINATION ${CMAKE_INSTALL_LIBDIR}) +else() + if (NOT GPU_ROCE_TRANSCEIVER_LIB) + message(WARNING "gpu_roce_transceiver library not found. " + "hololink_bridge will not be built.") + endif() + if (NOT DOCA_INCLUDE_DIR OR NOT DOCA_VERBS_LIB) + message(WARNING "DOCA libraries not found. " + "hololink_bridge requires DOCA.") + endif() +endif() diff --git a/realtime/lib/daemon/bridge/hololink/bridge_impl.cpp b/realtime/lib/daemon/bridge/hololink/bridge_impl.cpp new file mode 100644 index 00000000000..ed65452723c --- /dev/null +++ b/realtime/lib/daemon/bridge/hololink/bridge_impl.cpp @@ -0,0 +1,263 @@ +/******************************************************************************* + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. 
* + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +/// @file bridge_impl.cpp +/// @brief Hololink bridge interface implementation for libcudaq-realtime +/// dispatch. + +#include + +#include "cudaq/realtime/daemon/bridge/bridge_interface.h" +#include "cudaq/realtime/daemon/bridge/hololink/hololink_wrapper.h" +#include "cudaq/realtime/hololink_bridge_common.h" + +namespace { +#define HANDLE_CUDA_ERROR(x) \ + { \ + const auto err = x; \ + if (err != cudaSuccess) { \ + std::stringstream ss; \ + ss << "CUDA error at " << __FILE__ << ":" << __LINE__ << ": " \ + << cudaGetErrorString(err) << std::endl; \ + throw std::runtime_error(ss.str()); \ + } \ + } + +struct HololinkBridgeContext { + cudaq::realtime::BridgeConfig config; + hololink_transceiver_t transceiver = nullptr; + std::unique_ptr hololink_thread; + HololinkBridgeContext(const cudaq::realtime::BridgeConfig &cfg) + : config(cfg) { + //============================================================================ + // [1] Initialize CUDA + //============================================================================ + HANDLE_CUDA_ERROR(cudaSetDevice(config.gpu_id)); + cudaDeviceProp prop; + HANDLE_CUDA_ERROR(cudaGetDeviceProperties(&prop, config.gpu_id)); + + //============================================================================ + // [2] Create Hololink transceiver + //============================================================================ + // Ensure page_size >= frame_size + if (config.page_size < config.frame_size) { + config.page_size = config.frame_size; + } + + // Unified mode uses Hololink's forward/symmetric ring layout but doesn't + // run any Hololink kernels -- the unified dispatch kernel handles RX+TX. 
+ bool use_forward_ring = config.forward || config.unified; + + transceiver = hololink_create_transceiver( + config.device.c_str(), 1, // ib_port (FIXME: make configurable?) + config.remote_qp, // remote QP number + config.gpu_id, // GPU device ID + config.frame_size, config.page_size, config.num_pages, + config.peer_ip.c_str(), // immediate connection + use_forward_ring ? 1 : 0, // forward (symmetric ring layout) + use_forward_ring ? 0 : 1, // rx_only + use_forward_ring ? 0 : 1 // tx_only + ); + } +}; +} // namespace + +extern "C" { +static cudaq_status_t +hololink_bridge_create(cudaq_realtime_bridge_handle_t *handle, int argc, + char **argv) { + if (!handle) + return CUDAQ_ERR_INVALID_ARG; + cudaq::realtime::BridgeConfig config; + + // Parse common bridge args + cudaq::realtime::parse_bridge_args(argc, argv, config); + + // Frame size: RPCHeader + payload size + config.frame_size = sizeof(cudaq::realtime::RPCHeader) + config.payload_size; + + // Create and initialize the bridge context (including the Hololink + // transceiver) + HololinkBridgeContext *ctx = new HololinkBridgeContext(config); + if (!ctx) { + std::cerr << "ERROR: Failed to create HololinkBridgeContext" << std::endl; + return CUDAQ_ERR_INTERNAL; + } + // Set the output handle to the created context (opaque to the caller) + *handle = ctx; + + if (!ctx->transceiver) { + std::cerr << "ERROR: Failed to create Hololink transceiver" << std::endl; + delete ctx; + return CUDAQ_ERR_INTERNAL; + } + + if (!hololink_start(ctx->transceiver)) { + std::cerr << "ERROR: Failed to start Hololink transceiver" << std::endl; + hololink_destroy_transceiver(ctx->transceiver); + delete ctx; + return CUDAQ_ERR_INTERNAL; + } + + // Hololink start() pops the CUDA context via cuCtxPopCurrent; restore it. 
+ HANDLE_CUDA_ERROR(cudaSetDevice(config.gpu_id)); + + return CUDAQ_OK; +} + +static cudaq_status_t +hololink_bridge_destroy(cudaq_realtime_bridge_handle_t handle) { + if (!handle) + return CUDAQ_ERR_INVALID_ARG; + HololinkBridgeContext *ctx = + reinterpret_cast(handle); + if (ctx->transceiver) { + hololink_destroy_transceiver(ctx->transceiver); + } + delete ctx; + return CUDAQ_OK; +} + +static cudaq_status_t hololink_bridge_get_transport_context( + cudaq_realtime_bridge_handle_t handle, + cudaq_realtime_transport_context_t context_type, void *out_context) { + + if (!handle || !out_context) + return CUDAQ_ERR_INVALID_ARG; + HololinkBridgeContext *ctx = + reinterpret_cast(handle); + if (!ctx->transceiver) + return CUDAQ_ERR_INTERNAL; + + auto &transceiver = ctx->transceiver; + if (context_type == RING_BUFFER) { + cudaq_ringbuffer_t *ringbuffer = + reinterpret_cast(out_context); + + // Ring buffer pointers + uint8_t *rx_ring_data = reinterpret_cast( + hololink_get_rx_ring_data_addr(transceiver)); + uint64_t *rx_ring_flag = hololink_get_rx_ring_flag_addr(transceiver); + uint8_t *tx_ring_data = reinterpret_cast( + hololink_get_tx_ring_data_addr(transceiver)); + uint64_t *tx_ring_flag = hololink_get_tx_ring_flag_addr(transceiver); + + if (!rx_ring_data || !rx_ring_flag || !tx_ring_data || !tx_ring_flag) { + std::cerr << "ERROR: Failed to get ring buffer pointers" << std::endl; + return CUDAQ_ERR_INTERNAL; + } + + ringbuffer->rx_flags = reinterpret_cast(rx_ring_flag); + ringbuffer->tx_flags = reinterpret_cast(tx_ring_flag); + ringbuffer->rx_data = rx_ring_data; + ringbuffer->tx_data = tx_ring_data; + ringbuffer->rx_stride_sz = ctx->config.page_size; + ringbuffer->tx_stride_sz = ctx->config.page_size; + } else if (context_type == UNIFIED) { + cudaq_unified_dispatch_ctx_t *dispatch_ctx = + reinterpret_cast(out_context); + + static hololink_doca_transport_ctx doca_ctx{}; + doca_ctx.gpu_dev_qp = hololink_get_gpu_dev_qp(transceiver); + doca_ctx.rx_ring_data = 
reinterpret_cast( + hololink_get_rx_ring_data_addr(transceiver)); + doca_ctx.rx_ring_stride_sz = hololink_get_page_size(transceiver); + doca_ctx.rx_ring_mkey = htonl(hololink_get_rkey(transceiver)); + doca_ctx.rx_ring_stride_num = hololink_get_num_pages(transceiver); + doca_ctx.frame_size = ctx->config.frame_size; + + dispatch_ctx->launch_fn = &hololink_launch_unified_dispatch; + dispatch_ctx->transport_ctx = &doca_ctx; + } else { + std::cerr << "ERROR: Invalid transport context type" << std::endl; + return CUDAQ_ERR_INVALID_ARG; + } + + return CUDAQ_OK; +} + +static cudaq_status_t +hololink_bridge_connect(cudaq_realtime_bridge_handle_t handle) { + if (!handle) + return CUDAQ_ERR_INVALID_ARG; + HololinkBridgeContext *ctx = + reinterpret_cast(handle); + if (!ctx->transceiver) + return CUDAQ_ERR_INTERNAL; + if (ctx->hololink_thread && ctx->hololink_thread->joinable()) { + std::cerr << "ERROR: Hololink bridge already connected" << std::endl; + return CUDAQ_ERR_INTERNAL; + } + + auto &transceiver = ctx->transceiver; + + uint32_t our_qp = hololink_get_qp_number(transceiver); + uint32_t our_rkey = hololink_get_rkey(transceiver); + uint64_t our_buffer = hololink_get_buffer_addr(transceiver); + + // FIXME: Figure out a better way to share this info with the caller (e.g. via + // output params or context struct) rather than printing to stdout. Print QP + // info for FPGA stimulus tool + std::cout << "\n=== Bridge Ready ===" << std::endl; + std::cout << " QP Number: 0x" << std::hex << our_qp << std::dec << std::endl; + std::cout << " RKey: " << our_rkey << std::endl; + std::cout << " Buffer Addr: 0x" << std::hex << our_buffer << std::dec + << std::endl; + std::cout << "\nWaiting for data (Ctrl+C to stop, timeout=" + << ctx->config.timeout_sec << "s)..." 
<< std::endl; + + return CUDAQ_OK; +} + +static cudaq_status_t +hololink_bridge_launch(cudaq_realtime_bridge_handle_t handle) { + auto *ctx = reinterpret_cast(handle); + if (!ctx || !ctx->transceiver) + return CUDAQ_ERR_INVALID_ARG; + auto &transceiver = ctx->transceiver; + + if (ctx->config.unified) { + std::cout << "\n Unified mode -- no hololink monitor thread needed" + << std::endl; + } else { + //============================================================================ + // Launch Hololink kernels and run + //============================================================================ + ctx->hololink_thread = std::make_unique( + [transceiver]() { hololink_blocking_monitor(transceiver); }); + + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + } + return CUDAQ_OK; +} + +static cudaq_status_t +hololink_bridge_disconnect(cudaq_realtime_bridge_handle_t handle) { + auto *ctx = reinterpret_cast(handle); + if (!ctx || !ctx->transceiver) + return CUDAQ_ERR_INVALID_ARG; + auto &transceiver = ctx->transceiver; + hololink_close(transceiver); + if (ctx->hololink_thread && ctx->hololink_thread->joinable()) + ctx->hololink_thread->join(); + return CUDAQ_OK; +} + +cudaq_realtime_bridge_interface_t *cudaq_realtime_get_bridge_interface() { + static cudaq_realtime_bridge_interface_t cudaq_hololink_bridge_interface = { + CUDAQ_REALTIME_BRIDGE_INTERFACE_VERSION, + hololink_bridge_create, + hololink_bridge_destroy, + hololink_bridge_get_transport_context, + hololink_bridge_connect, + hololink_bridge_launch, + hololink_bridge_disconnect, + }; + return &cudaq_hololink_bridge_interface; +} +} diff --git a/realtime/lib/daemon/bridge/hololink/hololink_wrapper.cpp b/realtime/lib/daemon/bridge/hololink/hololink_wrapper.cpp new file mode 100644 index 00000000000..f618d2d5d70 --- /dev/null +++ b/realtime/lib/daemon/bridge/hololink/hololink_wrapper.cpp @@ -0,0 +1,196 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 
NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +/// @file hololink_wrapper.cpp +/// @brief C wrapper implementation for Hololink GpuRoceTransceiver. +/// +/// This file is compiled by g++ (not nvcc) to isolate Hololink's fmt +/// dependency from CUDA translation units. + +#include "cudaq/realtime/daemon/bridge/hololink/hololink_wrapper.h" + +// Include Hololink headers here (with Holoscan's fmt) +// Disable deprecation warnings for Hololink headers, which may use deprecated +// APIs +#if (defined(__GNUC__) && !defined(__clang__)) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif +#include +#if (defined(__GNUC__) && !defined(__clang__)) +#pragma GCC diagnostic pop +#endif + +#include + +extern "C" cudaError_t GpuRoceTransceiverQueryOccupancy(int *, int *, int *); + +using namespace hololink::operators; + +//============================================================================== +// Internal implementation struct +//============================================================================== + +struct HololinkTransceiverImpl { + std::unique_ptr transceiver; +}; + +//============================================================================== +// Lifecycle +//============================================================================== + +hololink_transceiver_t hololink_create_transceiver( + const char *device_name, int ib_port, unsigned tx_ibv_qp, int gpu_id, + size_t frame_size, size_t page_size, unsigned num_pages, + const char *peer_ip, int forward, int rx_only, int tx_only) { + try { + auto *impl = new HololinkTransceiverImpl(); + impl->transceiver = std::make_unique( + device_name, static_cast(ib_port), tx_ibv_qp, gpu_id, + frame_size, page_size, 
num_pages, peer_ip, forward != 0, rx_only != 0, + tx_only != 0); + return reinterpret_cast(impl); + } catch (const std::exception &e) { + std::cerr << "ERROR: Failed to create GpuRoceTransceiver: " << e.what() + << std::endl; + return nullptr; + } catch (...) { + std::cerr << "ERROR: Failed to create GpuRoceTransceiver: unknown exception" + << std::endl; + return nullptr; + } +} + +void hololink_destroy_transceiver(hololink_transceiver_t handle) { + if (handle) { + auto *impl = reinterpret_cast(handle); + delete impl; + } +} + +int hololink_start(hololink_transceiver_t handle) { + if (handle) { + auto *impl = reinterpret_cast(handle); + return impl->transceiver->start() ? 1 : 0; + } + return 0; +} + +void hololink_close(hololink_transceiver_t handle) { + if (handle) { + auto *impl = reinterpret_cast(handle); + impl->transceiver->close(); + } +} + +void hololink_blocking_monitor(hololink_transceiver_t handle) { + if (handle) { + auto *impl = reinterpret_cast(handle); + impl->transceiver->blocking_monitor(); + } +} + +//============================================================================== +// QP information +//============================================================================== + +uint32_t hololink_get_qp_number(hololink_transceiver_t handle) { + if (handle) { + auto *impl = reinterpret_cast(handle); + return impl->transceiver->get_qp_number(); + } + return 0; +} + +uint32_t hololink_get_rkey(hololink_transceiver_t handle) { + if (handle) { + auto *impl = reinterpret_cast(handle); + return impl->transceiver->get_rkey(); + } + return 0; +} + +uint64_t hololink_get_buffer_addr(hololink_transceiver_t handle) { + if (handle) { + auto *impl = reinterpret_cast(handle); + return impl->transceiver->external_frame_memory(); + } + return 0; +} + +void *hololink_get_gpu_dev_qp(hololink_transceiver_t handle) { + if (handle) { + auto *impl = reinterpret_cast(handle); + return impl->transceiver->get_doca_gpu_dev_qp(0); + } + return nullptr; +} + 
+//============================================================================== +// Ring buffer access +//============================================================================== + +void *hololink_get_rx_ring_data_addr(hololink_transceiver_t handle) { + if (handle) { + auto *impl = reinterpret_cast(handle); + return impl->transceiver->get_rx_ring_data_addr(); + } + return nullptr; +} + +uint64_t *hololink_get_rx_ring_flag_addr(hololink_transceiver_t handle) { + if (handle) { + auto *impl = reinterpret_cast(handle); + return impl->transceiver->get_rx_ring_flag_addr(); + } + return nullptr; +} + +void *hololink_get_tx_ring_data_addr(hololink_transceiver_t handle) { + if (handle) { + auto *impl = reinterpret_cast(handle); + return impl->transceiver->get_tx_ring_data_addr(); + } + return nullptr; +} + +uint64_t *hololink_get_tx_ring_flag_addr(hololink_transceiver_t handle) { + if (handle) { + auto *impl = reinterpret_cast(handle); + return impl->transceiver->get_tx_ring_flag_addr(); + } + return nullptr; +} + +bool hololink_query_kernel_occupancy(void) { + int prep = 0, rx = 0, tx = 0; + cudaError_t err = GpuRoceTransceiverQueryOccupancy(&prep, &rx, &tx); + if (err != cudaSuccess) { + fprintf(stderr, "ERROR: Hololink kernel occupancy query failed: %s\n", + cudaGetErrorString(err)); + return false; + } + printf(" Hololink kernel occupancy: prepare=%d rx=%d tx=%d\n", prep, rx, tx); + return true; +} + +size_t hololink_get_page_size(hololink_transceiver_t handle) { + if (handle) { + auto *impl = reinterpret_cast(handle); + return impl->transceiver->get_rx_ring_stride_sz(); + } + return 0; +} + +unsigned hololink_get_num_pages(hololink_transceiver_t handle) { + if (handle) { + auto *impl = reinterpret_cast(handle); + return impl->transceiver->get_rx_ring_stride_num(); + } + return 0; +} diff --git a/realtime/lib/daemon/bridge/hololink/unified_dispatch_kernel.cu b/realtime/lib/daemon/bridge/hololink/unified_dispatch_kernel.cu new file mode 100644 index 
/*******************************************************************************
 * Copyright (c) 2026 NVIDIA Corporation & Affiliates.                         *
 * All rights reserved.                                                        *
 *                                                                             *
 * This source code and the accompanying materials are made available under   *
 * the terms of the Apache License 2.0 which accompanies this distribution.   *
 ******************************************************************************/

/// @file unified_dispatch_kernel.cu
/// @brief Hololink/DOCA unified dispatch: RDMA RX + RPC dispatch + RDMA TX
///        in one GPU kernel, using Hololink's gpu_roce_transceiver.cuh device
///        functions for WQE preparation and BlueFlame TX.
///
/// Compiled into libcudaq-realtime-bridge-hololink.so (transport-specific).
/// The core libcudaq-realtime-dispatch.a no longer contains this file.

#include "cudaq/realtime/daemon/bridge/hololink/hololink_doca_transport_ctx.h"
#include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h"
#include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h"

// NOTE(review): the two angle-bracket include targets below were lost in
// extraction (probable candidates: <cstdint> and a DOCA GPUNetIO device-verbs
// header) -- restore them from the original file.
#include
#include

#include "gpu_roce_transceiver.cuh"

using namespace cudaq::realtime;

//==============================================================================
// Device helpers
//==============================================================================

// NOTE(review): several template-argument lists / cast target types in this
// file were stripped by extraction (`reinterpret_cast(`,
// `doca_gpu_dev_verbs_atomic_max(` etc.) -- restore them from the original
// file before building.

/// Spin-poll the CQE owner bit with periodic shutdown_flag checks. Inlines
/// the DOCA CQ state update (fence + consumer-index advance) to avoid the
/// double CQE read that calling poll_cq_at would cause. Returns UINT32_MAX
/// on shutdown; otherwise returns the stride from the CQE immediate field.
__device__ static inline std::uint32_t
unified_poll_receive(struct doca_gpu_dev_verbs_cq *cq_rq, std::uint8_t *cqe,
                     std::uint32_t cqe_mask,
                     doca_gpu_dev_verbs_ticket_t ticket,
                     volatile int *shutdown_flag) {
  // Address of the CQE this ticket maps to within the (power-of-two) CQ ring.
  auto *cqe64 = reinterpret_cast(
      cqe + ((ticket & cqe_mask) * DOCA_GPUNETIO_VERBS_CQE_SIZE));
  std::uint32_t cqe_num = cqe_mask + 1;
  int spin = 0;
  std::uint8_t opown;
  do {
    // Relaxed system-scope load of the op_own byte; ownership toggles each
    // time the ring wraps, so compare against the ticket's wrap parity.
    opown = doca_gpu_dev_verbs_load_relaxed_sys_global(
        reinterpret_cast(&cqe64->op_own));
    if (!((opown & MLX5_CQE_OWNER_MASK) ^ !!(ticket & cqe_num)))
      break;
    // Only test the (device-memory) shutdown flag every 1024 spins to keep
    // the hot polling loop cheap.
    if (++spin >= 1024) {
      spin = 0;
      if (*shutdown_flag)
        return UINT32_MAX;
    }
  } while (true);
  // Acquire fence before reading the CQE payload, then advance the CQ
  // consumer index (atomic max tolerates out-of-order completions).
  doca_gpu_dev_verbs_fence_acquire();
  doca_gpu_dev_verbs_atomic_max(
      &cq_rq->cqe_ci, ticket + 1);
  // Immediate field is big-endian; low 12 bits carry the ring stride index.
  return doca_gpu_dev_verbs_bswap32(cqe64->imm_inval_pkey) & 0xFFF;
}

/// Linear scan of the function table for a matching function_id; returns
/// nullptr when no entry matches. (Same contract as dispatch_lookup_entry in
/// dispatch_kernel.cu.)
__device__ static inline const cudaq_function_entry_t *
unified_lookup_entry(std::uint32_t function_id,
                     cudaq_function_entry_t *entries, std::size_t count) {
  for (std::size_t i = 0; i < count; ++i) {
    if (entries[i].function_id == function_id)
      return &entries[i];
  }
  return nullptr;
}

//==============================================================================
// Unified dispatch kernel -- single thread, single block.
//
// Uses Hololink's prepare_send_shared / send_bf / repost_receive from
// gpu_roce_transceiver.cuh instead of duplicating WQE manipulation code.
//==============================================================================

// NOTE(review): template-argument lists on send_bf / static_cast /
// reinterpret_cast / atomicAdd below were stripped by extraction -- restore
// them from the original file before building.

/// RX + dispatch + TX loop in a single <<<1,1>>> kernel. Polls the receive CQ,
/// runs the registered DEVICE_CALL handler in-place on the ring slot, then
/// sends the response from the same slot via BlueFlame doorbell.
__global__ void hololink_unified_dispatch_kernel(
    struct doca_gpu_dev_verbs_qp *qp, volatile int *shutdown_flag,
    std::uint8_t *ring_buf, std::size_t ring_buf_stride_sz,
    std::uint32_t ring_buf_mkey, std::uint32_t ring_buf_stride_num,
    std::size_t frame_size, cudaq_function_entry_t *function_table,
    std::size_t func_count, std::uint64_t *stats) {
  if (qp == nullptr)
    return;

  // Receive-side CQ ring: base address and (power-of-two) size mask.
  auto *cq_rq = doca_gpu_dev_verbs_qp_get_cq_rq(qp);
  auto *cqe = reinterpret_cast(
      __ldg(reinterpret_cast(&cq_rq->cqe_daddr)));
  const std::uint32_t cqe_mask = __ldg(&cq_rq->cqe_num) - 1;

  // Small frames can be sent inline in the WQE instead of by DMA address.
  const bool use_inline = (frame_size <= MAX_SEND_INLINE_WQE);

  // Prepare WQE template in shared memory using Hololink's helper
  __shared__ struct doca_gpu_dev_verbs_wqe wqe_sh;
  prepare_send_shared(qp, &wqe_sh, frame_size, ring_buf_mkey);

  doca_gpu_dev_verbs_ticket_t cq_ticket = 0;
  std::uint64_t sq_wqe_idx = 0;
  std::uint64_t packet_count = 0;

  while (true) {
    // Block until a frame arrives (or UINT32_MAX on shutdown).
    std::uint32_t stride =
        unified_poll_receive(cq_rq, cqe, cqe_mask, cq_ticket, shutdown_flag);
    if (stride == UINT32_MAX)
      break;
    // Out-of-range stride: drop the frame, repost the receive, keep going.
    if (stride >= ring_buf_stride_num) {
      sq_wqe_idx++;
      repost_receive(qp, sq_wqe_idx);
      cq_ticket = sq_wqe_idx;
      continue;
    }

    auto *slot =
        ring_buf + static_cast(stride) * ring_buf_stride_sz;
    auto *header = reinterpret_cast(slot);

    if (header->magic == RPC_MAGIC_REQUEST) {
      // Copy the request fields before the response overwrites the header
      // (request and response share the same slot memory).
      std::uint32_t function_id = header->function_id;
      std::uint32_t arg_len = header->arg_len;
      std::uint32_t request_id = header->request_id;
      std::uint64_t ptp_timestamp = header->ptp_timestamp;

      const cudaq_function_entry_t *entry =
          unified_lookup_entry(function_id, function_table, func_count);

      // status stays -1 (and result_len 0) for unknown or non-DEVICE_CALL
      // entries; the response is still sent below.
      int status = -1;
      std::uint32_t result_len = 0;

      if (entry != nullptr &&
          entry->dispatch_mode == CUDAQ_DISPATCH_DEVICE_CALL) {
        auto func = reinterpret_cast(
            entry->handler.device_fn_ptr);
        // Args follow the header in the slot; output goes after the response
        // header in the same slot.
        void *arg_buffer = static_cast(header + 1);
        auto *output_buffer = slot + sizeof(RPCResponse);
        auto max_result_len = static_cast(
            frame_size - sizeof(RPCResponse));

        status = func(arg_buffer, output_buffer, arg_len, max_result_len,
                      &result_len);
      }

      // Overwrite the request header with the response in place.
      auto *response = reinterpret_cast(slot);
      response->magic = RPC_MAGIC_RESPONSE;
      response->status = status;
      response->result_len = result_len;
      response->request_id = request_id;
      response->ptp_timestamp = ptp_timestamp;
    }

    // Send response via Hololink's BlueFlame helper.
    // NOTE(review): a frame whose magic is NOT RPC_MAGIC_REQUEST is still
    // echoed back unmodified here -- confirm this is intended.
    auto buffer_addr =
        static_cast(ring_buf_stride_sz) * stride;
    if (!use_inline) {
      send_bf(
          qp, &wqe_sh, sq_wqe_idx, buffer_addr);
    } else {
      send_bf(
          qp, &wqe_sh, sq_wqe_idx,
          reinterpret_cast(slot));
    }

    // Repost the receive for this slot and advance both rings in lockstep.
    sq_wqe_idx++;
    repost_receive(qp, sq_wqe_idx);
    cq_ticket = sq_wqe_idx;
    packet_count++;
  }

  // Publish the processed-packet count for the host-side stats query.
  atomicAdd(reinterpret_cast(stats), packet_count);
}

//==============================================================================
// Host launch wrapper -- matches cudaq_unified_launch_fn_t signature.
//==============================================================================

/// Launch the single-thread unified dispatch kernel on the given stream using
/// the DOCA transport context filled in by the bridge.
extern "C" void hololink_launch_unified_dispatch(
    void *transport_ctx, cudaq_function_entry_t *function_table,
    size_t func_count, volatile int *shutdown_flag, uint64_t *stats,
    cudaStream_t stream) {
  auto *ctx = static_cast(transport_ctx);

  hololink_unified_dispatch_kernel<<<1, 1, 0, stream>>>(
      static_cast(ctx->gpu_dev_qp),
      shutdown_flag, ctx->rx_ring_data, ctx->rx_ring_stride_sz,
      ctx->rx_ring_mkey, ctx->rx_ring_stride_num, ctx->frame_size,
      function_table, func_count, stats);
}
+# +# This source code and the accompanying materials are made available under +# the terms of the Apache License 2.0 which accompanies this distribution. +# + +{ + global: cudaq*; + local: *; +}; diff --git a/realtime/lib/daemon/dispatcher/cudaq_realtime_api.cpp b/realtime/lib/daemon/dispatcher/cudaq_realtime_api.cpp new file mode 100644 index 00000000000..5306088c388 --- /dev/null +++ b/realtime/lib/daemon/dispatcher/cudaq_realtime_api.cpp @@ -0,0 +1,234 @@ +/******************************************************************************* + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" + +#include +#include + +struct cudaq_dispatch_manager_t { + int reserved = 0; +}; + +struct cudaq_dispatcher_t { + cudaq_dispatcher_config_t config{}; + cudaq_ringbuffer_t ringbuffer{}; + cudaq_function_table_t table{}; + cudaq_dispatch_launch_fn_t launch_fn = nullptr; + cudaq_unified_launch_fn_t unified_launch_fn = nullptr; + void *transport_ctx = nullptr; + volatile int *shutdown_flag = nullptr; + uint64_t *stats = nullptr; + cudaStream_t stream = nullptr; + bool running = false; +}; + +static bool is_valid_kernel_type(cudaq_kernel_type_t kernel_type) { + switch (kernel_type) { + case CUDAQ_KERNEL_REGULAR: + case CUDAQ_KERNEL_COOPERATIVE: + case CUDAQ_KERNEL_UNIFIED: + return true; + default: + return false; + } +} + +static bool is_valid_dispatch_mode(cudaq_dispatch_mode_t dispatch_mode) { + switch (dispatch_mode) { + case CUDAQ_DISPATCH_DEVICE_CALL: + case CUDAQ_DISPATCH_GRAPH_LAUNCH: + return true; + default: + return false; + } +} + +static cudaq_status_t validate_dispatcher(cudaq_dispatcher_t *dispatcher) { + if (!dispatcher) + return 
CUDAQ_ERR_INVALID_ARG; + if (!is_valid_kernel_type(dispatcher->config.kernel_type)) + return CUDAQ_ERR_INVALID_ARG; + if (!dispatcher->shutdown_flag || !dispatcher->stats) + return CUDAQ_ERR_INVALID_ARG; + if (!dispatcher->table.entries || dispatcher->table.count == 0) + return CUDAQ_ERR_INVALID_ARG; + + if (dispatcher->config.kernel_type == CUDAQ_KERNEL_UNIFIED) { + if (!dispatcher->unified_launch_fn || !dispatcher->transport_ctx) + return CUDAQ_ERR_INVALID_ARG; + } else { + if (!dispatcher->launch_fn) + return CUDAQ_ERR_INVALID_ARG; + if (!dispatcher->ringbuffer.rx_flags || !dispatcher->ringbuffer.tx_flags) + return CUDAQ_ERR_INVALID_ARG; + if (dispatcher->config.num_blocks == 0 || + dispatcher->config.threads_per_block == 0 || + dispatcher->config.num_slots == 0 || dispatcher->config.slot_size == 0) + return CUDAQ_ERR_INVALID_ARG; + if (!is_valid_dispatch_mode(dispatcher->config.dispatch_mode)) + return CUDAQ_ERR_INVALID_ARG; + } + return CUDAQ_OK; +} + +cudaq_status_t +cudaq_dispatch_manager_create(cudaq_dispatch_manager_t **out_mgr) { + if (!out_mgr) + return CUDAQ_ERR_INVALID_ARG; + auto *mgr = new (std::nothrow) cudaq_dispatch_manager_t(); + if (!mgr) + return CUDAQ_ERR_INTERNAL; + *out_mgr = mgr; + return CUDAQ_OK; +} + +cudaq_status_t cudaq_dispatch_manager_destroy(cudaq_dispatch_manager_t *mgr) { + if (mgr) + delete mgr; + return CUDAQ_OK; +} + +cudaq_status_t cudaq_dispatcher_create(cudaq_dispatch_manager_t *, + const cudaq_dispatcher_config_t *config, + cudaq_dispatcher_t **out_dispatcher) { + if (!config || !out_dispatcher) + return CUDAQ_ERR_INVALID_ARG; + auto *dispatcher = new (std::nothrow) cudaq_dispatcher_t(); + if (!dispatcher) + return CUDAQ_ERR_INTERNAL; + dispatcher->config = *config; + *out_dispatcher = dispatcher; + return CUDAQ_OK; +} + +cudaq_status_t cudaq_dispatcher_destroy(cudaq_dispatcher_t *dispatcher) { + if (!dispatcher) + return CUDAQ_ERR_INVALID_ARG; + delete dispatcher; + return CUDAQ_OK; +} + +cudaq_status_t 
+cudaq_dispatcher_set_ringbuffer(cudaq_dispatcher_t *dispatcher, + const cudaq_ringbuffer_t *ringbuffer) { + if (!dispatcher || !ringbuffer) + return CUDAQ_ERR_INVALID_ARG; + dispatcher->ringbuffer = *ringbuffer; + return CUDAQ_OK; +} + +cudaq_status_t +cudaq_dispatcher_set_function_table(cudaq_dispatcher_t *dispatcher, + const cudaq_function_table_t *table) { + if (!dispatcher || !table) + return CUDAQ_ERR_INVALID_ARG; + dispatcher->table = *table; + return CUDAQ_OK; +} + +cudaq_status_t cudaq_dispatcher_set_control(cudaq_dispatcher_t *dispatcher, + volatile int *shutdown_flag, + uint64_t *stats) { + if (!dispatcher || !shutdown_flag || !stats) + return CUDAQ_ERR_INVALID_ARG; + dispatcher->shutdown_flag = shutdown_flag; + dispatcher->stats = stats; + return CUDAQ_OK; +} + +cudaq_status_t +cudaq_dispatcher_set_launch_fn(cudaq_dispatcher_t *dispatcher, + cudaq_dispatch_launch_fn_t launch_fn) { + if (!dispatcher || !launch_fn) + return CUDAQ_ERR_INVALID_ARG; + dispatcher->launch_fn = launch_fn; + return CUDAQ_OK; +} + +cudaq_status_t +cudaq_dispatcher_set_unified_launch(cudaq_dispatcher_t *dispatcher, + cudaq_unified_launch_fn_t unified_launch_fn, + void *transport_ctx) { + if (!dispatcher || !unified_launch_fn || !transport_ctx) + return CUDAQ_ERR_INVALID_ARG; + dispatcher->unified_launch_fn = unified_launch_fn; + dispatcher->transport_ctx = transport_ctx; + return CUDAQ_OK; +} + +cudaq_status_t cudaq_dispatcher_start(cudaq_dispatcher_t *dispatcher) { + auto status = validate_dispatcher(dispatcher); + if (status != CUDAQ_OK) + return status; + if (dispatcher->running) + return CUDAQ_OK; + + int device_id = dispatcher->config.device_id; + if (device_id < 0) + device_id = 0; + if (cudaSetDevice(device_id) != cudaSuccess) + return CUDAQ_ERR_CUDA; + if (cudaStreamCreate(&dispatcher->stream) != cudaSuccess) + return CUDAQ_ERR_CUDA; + + if (dispatcher->config.kernel_type == CUDAQ_KERNEL_UNIFIED) { + dispatcher->unified_launch_fn( + dispatcher->transport_ctx, 
dispatcher->table.entries, + dispatcher->table.count, dispatcher->shutdown_flag, dispatcher->stats, + dispatcher->stream); + } else { + dispatcher->launch_fn( + dispatcher->ringbuffer.rx_flags, dispatcher->ringbuffer.tx_flags, + dispatcher->ringbuffer.rx_data, dispatcher->ringbuffer.tx_data, + dispatcher->ringbuffer.rx_stride_sz, + dispatcher->ringbuffer.tx_stride_sz, dispatcher->table.entries, + dispatcher->table.count, dispatcher->shutdown_flag, dispatcher->stats, + dispatcher->config.num_slots, dispatcher->config.num_blocks, + dispatcher->config.threads_per_block, dispatcher->stream); + } + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA error in dispatcher launch: %s (%d)\n", + cudaGetErrorString(err), err); + return CUDAQ_ERR_CUDA; + } + + dispatcher->running = true; + return CUDAQ_OK; +} + +cudaq_status_t cudaq_dispatcher_stop(cudaq_dispatcher_t *dispatcher) { + if (!dispatcher) + return CUDAQ_ERR_INVALID_ARG; + if (!dispatcher->running) + return CUDAQ_OK; + + int shutdown = 1; + if (cudaMemcpy(const_cast(dispatcher->shutdown_flag), &shutdown, + sizeof(int), cudaMemcpyHostToDevice) != cudaSuccess) + return CUDAQ_ERR_CUDA; + cudaStreamSynchronize(dispatcher->stream); + cudaStreamDestroy(dispatcher->stream); + dispatcher->stream = nullptr; + dispatcher->running = false; + return CUDAQ_OK; +} + +cudaq_status_t cudaq_dispatcher_get_processed(cudaq_dispatcher_t *dispatcher, + uint64_t *out_packets) { + if (!dispatcher || !out_packets || !dispatcher->stats) + return CUDAQ_ERR_INVALID_ARG; + + if (cudaMemcpy(out_packets, dispatcher->stats, sizeof(uint64_t), + cudaMemcpyDeviceToHost) != cudaSuccess) + return CUDAQ_ERR_CUDA; + + return CUDAQ_OK; +} diff --git a/realtime/lib/daemon/dispatcher/dispatch_kernel.cu b/realtime/lib/daemon/dispatcher/dispatch_kernel.cu new file mode 100644 index 00000000000..82c3e030172 --- /dev/null +++ b/realtime/lib/daemon/dispatcher/dispatch_kernel.cu @@ -0,0 +1,640 @@ 
+/*******************************************************************************
+ * Copyright (c) 2025 - 2026 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under   *
+ * the terms of the Apache License 2.0 which accompanies this distribution.   *
+ ******************************************************************************/
+
+#include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h"
+#include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h"
+#include "cudaq/realtime/daemon/dispatcher/dispatch_kernel.cuh"
+#include "cudaq/realtime/daemon/dispatcher/dispatch_modes.h"
+#include "cudaq/realtime/daemon/dispatcher/kernel_types.h"
+
+// NOTE(review): the original header names were lost to markup stripping;
+// these cover the std::uint*_t/std::size_t and CUDA runtime usage below.
+#include <cstddef>
+#include <cstdint>
+#include <cuda_runtime.h>
+
+namespace cudaq::realtime {
+
+//==============================================================================
+// Dispatch Kernel Implementation (compiled into libcudaq-realtime.so)
+//==============================================================================
+
+/// @brief Lookup function entry in table by function_id.
+__device__ inline const cudaq_function_entry_t* dispatch_lookup_entry(
+    std::uint32_t function_id,
+    cudaq_function_entry_t* entries,
+    std::size_t entry_count) {
+  for (std::size_t i = 0; i < entry_count; ++i) {
+    if (entries[i].function_id == function_id) {
+      return &entries[i];
+    }
+  }
+  return nullptr;
+}
+
+/// @brief Dispatch kernel for DEVICE_CALL mode only (no graph launch support).
+/// This kernel does not contain any device-side graph launch code, avoiding
+/// compatibility issues on systems where cudaGraphLaunch is not supported.
+///
+/// Supports symmetric RX/TX data buffers for Hololink compatibility:
+/// - RX data address comes from rx_flags[slot] (set by Hololink RX kernel)
+/// - TX response is written to tx_data + slot * tx_stride_sz
+/// - tx_flags[slot] is set to the TX slot address
+///
+/// When KernelType::is_cooperative is true, the kernel is launched via
+/// cudaLaunchCooperativeKernel and ALL threads participate in calling the
+/// RPC handler (needed for multi-block cooperative decode kernels like BP).
+/// Thread 0 polls/parses the header, broadcasts work via shared memory,
+/// then all threads call the handler after a grid.sync().
+template <typename KernelType>
+__global__ void dispatch_kernel_device_call_only(
+    volatile std::uint64_t* rx_flags,
+    volatile std::uint64_t* tx_flags,
+    std::uint8_t* tx_data,
+    std::size_t tx_stride_sz,
+    cudaq_function_entry_t* function_table,
+    std::size_t func_count,
+    volatile int* shutdown_flag,
+    std::uint64_t* stats,
+    std::size_t num_slots) {
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  std::uint64_t local_packet_count = 0;
+  std::size_t current_slot = 0;
+
+  if constexpr (KernelType::is_cooperative) {
+    //==========================================================================
+    // Cooperative path: ALL threads call the handler.
+    //
+    // Work descriptor in shared memory (block 0 broadcasts via grid.sync).
+    // Only block 0 needs shared memory for the descriptor; other blocks
+    // read the device-memory copies after the grid barrier.
+    //==========================================================================
+    __shared__ DeviceRPCFunction s_func;
+    __shared__ void* s_arg_buffer;
+    __shared__ std::uint8_t* s_output_buffer;
+    __shared__ std::uint32_t s_arg_len;
+    __shared__ std::uint32_t s_max_result_len;
+    __shared__ std::uint32_t s_request_id;
+    __shared__ std::uint64_t s_ptp_timestamp;
+    __shared__ bool s_have_work;
+
+    // Device-memory work descriptor visible to all blocks after grid.sync.
+    // We use a single set since the cooperative kernel processes one RPC at
+    // a time (all threads participate, so no pipelining).
+    __device__ static DeviceRPCFunction d_func;
+    __device__ static void* d_arg_buffer;
+    __device__ static std::uint8_t* d_output_buffer;
+    __device__ static std::uint32_t d_arg_len;
+    __device__ static std::uint32_t d_max_result_len;
+    __device__ static std::uint32_t d_request_id;
+    __device__ static std::uint64_t d_ptp_timestamp;
+    __device__ static bool d_have_work;
+
+    while (!(*shutdown_flag)) {
+      // --- Phase 1: Thread 0 polls and parses ---
+      if (tid == 0) {
+        s_have_work = false;
+        std::uint64_t rx_value = rx_flags[current_slot];
+        if (rx_value != 0) {
+          void* rx_slot = reinterpret_cast<void*>(rx_value);
+          RPCHeader* header = static_cast<RPCHeader*>(rx_slot);
+          if (header->magic == RPC_MAGIC_REQUEST) {
+            const cudaq_function_entry_t* entry = dispatch_lookup_entry(
+                header->function_id, function_table, func_count);
+            if (entry != nullptr &&
+                entry->dispatch_mode == CUDAQ_DISPATCH_DEVICE_CALL) {
+              std::uint8_t* tx_slot = tx_data + current_slot * tx_stride_sz;
+
+              s_func = reinterpret_cast<DeviceRPCFunction>(
+                  entry->handler.device_fn_ptr);
+              s_arg_buffer = static_cast<void*>(header + 1);
+              s_output_buffer = tx_slot + sizeof(RPCResponse);
+              s_arg_len = header->arg_len;
+              s_max_result_len = tx_stride_sz - sizeof(RPCResponse);
+              s_request_id = header->request_id;
+              s_ptp_timestamp = header->ptp_timestamp;
+              s_have_work = true;
+
+              // Publish to device memory for other blocks
+              d_func = s_func;
+              d_arg_buffer = s_arg_buffer;
+              d_output_buffer = s_output_buffer;
+              d_arg_len = s_arg_len;
+              d_max_result_len = s_max_result_len;
+              d_request_id = s_request_id;
+              d_ptp_timestamp = s_ptp_timestamp;
+              d_have_work = true;
+            }
+          }
+          if (!s_have_work) {
+            rx_flags[current_slot] = 0;
+          }
+        }
+      }
+
+      // --- Phase 2: Broadcast to all threads ---
+      KernelType::sync();
+
+      // Non-block-0 threads read from device memory
+      bool have_work;
+      DeviceRPCFunction func;
+      void* arg_buffer;
+      std::uint8_t* output_buffer;
+      std::uint32_t arg_len;
+      std::uint32_t max_result_len;
+      std::uint32_t request_id;
+      std::uint64_t ptp_timestamp;
+      if (blockIdx.x == 0) {
+        have_work = s_have_work;
+        func = s_func;
+        arg_buffer = s_arg_buffer;
+        output_buffer = s_output_buffer;
+        arg_len = s_arg_len;
+        max_result_len = s_max_result_len;
+        request_id = s_request_id;
+        ptp_timestamp = s_ptp_timestamp;
+      } else {
+        have_work = d_have_work;
+        func = d_func;
+        arg_buffer = d_arg_buffer;
+        output_buffer = d_output_buffer;
+        arg_len = d_arg_len;
+        max_result_len = d_max_result_len;
+        request_id = d_request_id;
+        ptp_timestamp = d_ptp_timestamp;
+      }
+
+      // --- Phase 3: ALL threads call the handler ---
+      std::uint32_t result_len = 0;
+      int status = 0;
+      if (have_work) {
+        status = func(arg_buffer, output_buffer, arg_len,
+                      max_result_len, &result_len);
+      }
+
+      // --- Phase 4: Sync, then thread 0 writes response ---
+      KernelType::sync();
+
+      if (tid == 0 && have_work) {
+        std::uint8_t* tx_slot = tx_data + current_slot * tx_stride_sz;
+        RPCResponse* response = reinterpret_cast<RPCResponse*>(tx_slot);
+        response->magic = RPC_MAGIC_RESPONSE;
+        response->status = status;
+        response->result_len = result_len;
+        response->request_id = request_id;
+        response->ptp_timestamp = ptp_timestamp;
+
+        while (tx_flags[current_slot] != 0 && !(*shutdown_flag))
+          ;
+
+        __threadfence();
+        tx_flags[current_slot] = reinterpret_cast<std::uint64_t>(tx_slot);
+
+        rx_flags[current_slot] = 0;
+        local_packet_count++;
+        current_slot = (current_slot + 1) % num_slots;
+      }
+
+      // Reset device-memory work flag for next iteration
+      if (tid == 0) {
+        d_have_work = false;
+      }
+
+      KernelType::sync();
+    }
+  } else {
+    //==========================================================================
+    // Regular path: only thread 0 calls the handler (unchanged).
+    //==========================================================================
+    while (!(*shutdown_flag)) {
+      if (tid == 0) {
+        std::uint64_t rx_value = rx_flags[current_slot];
+        if (rx_value != 0) {
+          // RX data address comes from rx_flags (set by Hololink RX kernel
+          // or host test harness to the address of the RX data slot)
+          void* rx_slot = reinterpret_cast<void*>(rx_value);
+          RPCHeader* header = static_cast<RPCHeader*>(rx_slot);
+          if (header->magic != RPC_MAGIC_REQUEST) {
+            rx_flags[current_slot] = 0;
+            continue;
+          }
+
+          std::uint32_t function_id = header->function_id;
+          std::uint32_t arg_len = header->arg_len;
+          void* arg_buffer = static_cast<void*>(header + 1);
+
+          const cudaq_function_entry_t* entry = dispatch_lookup_entry(
+              function_id, function_table, func_count);
+
+          if (entry != nullptr && entry->dispatch_mode == CUDAQ_DISPATCH_DEVICE_CALL) {
+            DeviceRPCFunction func =
+                reinterpret_cast<DeviceRPCFunction>(entry->handler.device_fn_ptr);
+
+            // Compute TX slot address from symmetric TX data buffer
+            std::uint8_t* tx_slot = tx_data + current_slot * tx_stride_sz;
+
+            // Handler writes results directly to TX slot (after response header)
+            std::uint8_t* output_buffer = tx_slot + sizeof(RPCResponse);
+            std::uint32_t result_len = 0;
+            std::uint32_t max_result_len = tx_stride_sz - sizeof(RPCResponse);
+            int status = func(arg_buffer, output_buffer, arg_len,
+                              max_result_len, &result_len);
+
+            // Write RPC response header to TX slot
+            RPCResponse* response = reinterpret_cast<RPCResponse*>(tx_slot);
+            response->magic = RPC_MAGIC_RESPONSE;
+            response->status = status;
+            response->result_len = result_len;
+            response->request_id = header->request_id;
+            response->ptp_timestamp = header->ptp_timestamp;
+
+            while (tx_flags[current_slot] != 0 && !(*shutdown_flag))
+              ;
+
+            __threadfence();
+            tx_flags[current_slot] = reinterpret_cast<std::uint64_t>(tx_slot);
+          }
+
+          rx_flags[current_slot] = 0;
+          local_packet_count++;
+          current_slot = (current_slot + 1) % num_slots;
+        }
+      }
+
+      KernelType::sync();
+    }
+  }
+
+  if (tid == 0) {
+    atomicAdd(reinterpret_cast<unsigned long long*>(stats), local_packet_count);
+  }
+}
+
+/// @brief Dispatch kernel supporting both DEVICE_CALL and GRAPH_LAUNCH modes.
+/// This kernel includes device-side graph launch code and requires compute capability >= 9.0.
+/// NOTE: Graph launch code is conditionally compiled based on __CUDA_ARCH__.
+///
+/// Supports symmetric RX/TX data buffers for Hololink compatibility.
+template <typename KernelType>
+__global__ void dispatch_kernel_with_graph(
+    volatile std::uint64_t* rx_flags,
+    volatile std::uint64_t* tx_flags,
+    std::uint8_t* tx_data,
+    std::size_t tx_stride_sz,
+    cudaq_function_entry_t* function_table,
+    std::size_t func_count,
+    GraphIOContext* graph_io_ctx,
+    volatile int* shutdown_flag,
+    std::uint64_t* stats,
+    std::size_t num_slots) {
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  std::uint64_t local_packet_count = 0;
+  std::size_t current_slot = 0;
+
+  while (!(*shutdown_flag)) {
+    if (tid == 0) {
+      std::uint64_t rx_value = rx_flags[current_slot];
+      if (rx_value != 0) {
+        void* rx_slot = reinterpret_cast<void*>(rx_value);
+        RPCHeader* header = static_cast<RPCHeader*>(rx_slot);
+        if (header->magic != RPC_MAGIC_REQUEST) {
+          rx_flags[current_slot] = 0;
+          continue;
+        }
+
+        std::uint32_t function_id = header->function_id;
+        std::uint32_t arg_len = header->arg_len;
+        void* arg_buffer = static_cast<void*>(header + 1);
+
+        const cudaq_function_entry_t* entry = dispatch_lookup_entry(
+            function_id, function_table, func_count);
+
+        // Compute TX slot address from symmetric TX data buffer
+        std::uint8_t* tx_slot = tx_data + current_slot * tx_stride_sz;
+
+        if (entry != nullptr) {
+          if (entry->dispatch_mode == CUDAQ_DISPATCH_DEVICE_CALL) {
+            DeviceRPCFunction func =
+                reinterpret_cast<DeviceRPCFunction>(entry->handler.device_fn_ptr);
+
+            // Handler writes results directly to TX slot (after response header)
+            std::uint8_t* output_buffer = tx_slot + sizeof(RPCResponse);
+            std::uint32_t result_len = 0;
+            std::uint32_t max_result_len = tx_stride_sz - sizeof(RPCResponse);
+            int status = func(arg_buffer,
+                              output_buffer, arg_len,
+                              max_result_len, &result_len);
+
+            // Write RPC response to TX slot
+            RPCResponse* response = reinterpret_cast<RPCResponse*>(tx_slot);
+            response->magic = RPC_MAGIC_RESPONSE;
+            response->status = status;
+            response->result_len = result_len;
+            response->request_id = header->request_id;
+            response->ptp_timestamp = header->ptp_timestamp;
+
+            while (tx_flags[current_slot] != 0 && !(*shutdown_flag))
+              ;
+
+            __threadfence();
+            tx_flags[current_slot] = reinterpret_cast<std::uint64_t>(tx_slot);
+          }
+#if __CUDA_ARCH__ >= 900
+          else if (entry->dispatch_mode == CUDAQ_DISPATCH_GRAPH_LAUNCH) {
+            if (graph_io_ctx != nullptr) {
+              graph_io_ctx->rx_slot = rx_slot;
+              graph_io_ctx->tx_slot = tx_slot;
+              graph_io_ctx->tx_flag = &tx_flags[current_slot];
+              graph_io_ctx->tx_flag_value =
+                  reinterpret_cast<std::uint64_t>(tx_slot);
+              graph_io_ctx->tx_stride_sz = tx_stride_sz;
+              __threadfence();
+            }
+
+            cudaGraphLaunch(entry->handler.graph_exec,
+                            cudaStreamGraphFireAndForget);
+          }
+#endif // __CUDA_ARCH__ >= 900
+        }
+
+        rx_flags[current_slot] = 0;
+        local_packet_count++;
+        current_slot = (current_slot + 1) % num_slots;
+      }
+    }
+
+    KernelType::sync();
+  }
+
+  if (tid == 0) {
+    atomicAdd(reinterpret_cast<unsigned long long*>(stats), local_packet_count);
+  }
+}
+
+} // namespace cudaq::realtime
+
+//==============================================================================
+// Host Launch Functions
+//==============================================================================
+
+// Force eager CUDA module loading for the dispatch kernel.
+// Call before launching persistent kernels to avoid lazy-loading deadlocks.
+extern "C" cudaError_t cudaq_dispatch_kernel_query_occupancy( + int* out_blocks, uint32_t threads_per_block) { + int num_blocks = 0; + cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &num_blocks, + cudaq::realtime::dispatch_kernel_device_call_only, + threads_per_block, 0); + if (err != cudaSuccess) return err; + if (out_blocks) *out_blocks = num_blocks; + return cudaSuccess; +} + +extern "C" cudaError_t cudaq_dispatch_kernel_cooperative_query_occupancy( + int* out_blocks, uint32_t threads_per_block) { + int num_blocks = 0; + cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &num_blocks, + cudaq::realtime::dispatch_kernel_device_call_only< + cudaq::realtime::CooperativeKernel>, + threads_per_block, 0); + if (err != cudaSuccess) return err; + if (out_blocks) *out_blocks = num_blocks; + return cudaSuccess; +} + +extern "C" void cudaq_launch_dispatch_kernel_regular( + volatile std::uint64_t* rx_flags, + volatile std::uint64_t* tx_flags, + std::uint8_t* rx_data, + std::uint8_t* tx_data, + std::size_t rx_stride_sz, + std::size_t tx_stride_sz, + cudaq_function_entry_t* function_table, + std::size_t func_count, + volatile int* shutdown_flag, + std::uint64_t* stats, + std::size_t num_slots, + std::uint32_t num_blocks, + std::uint32_t threads_per_block, + cudaStream_t stream) { + // Use device-call-only kernel (no graph launch support) + // Note: rx_data/rx_stride_sz are available in the ringbuffer struct but + // not passed to the kernel since it reads RX addresses from rx_flags. 
+  (void)rx_data;
+  (void)rx_stride_sz;
+  cudaq::realtime::dispatch_kernel_device_call_only<
+      cudaq::realtime::RegularKernel>
+      <<<num_blocks, threads_per_block, 0, stream>>>(
+          rx_flags, tx_flags, tx_data, tx_stride_sz,
+          function_table, func_count,
+          shutdown_flag, stats, num_slots);
+}
+
+extern "C" void cudaq_launch_dispatch_kernel_cooperative(
+    volatile std::uint64_t* rx_flags,
+    volatile std::uint64_t* tx_flags,
+    std::uint8_t* rx_data,
+    std::uint8_t* tx_data,
+    std::size_t rx_stride_sz,
+    std::size_t tx_stride_sz,
+    cudaq_function_entry_t* function_table,
+    std::size_t func_count,
+    volatile int* shutdown_flag,
+    std::uint64_t* stats,
+    std::size_t num_slots,
+    std::uint32_t num_blocks,
+    std::uint32_t threads_per_block,
+    cudaStream_t stream) {
+  (void)rx_data;
+  (void)rx_stride_sz;
+  // const_cast strips the volatile qualifier so the parameter addresses fit
+  // the void* argument array expected by cudaLaunchCooperativeKernel.
+  void* kernel_args[] = {
+      const_cast<std::uint64_t**>(&rx_flags),
+      const_cast<std::uint64_t**>(&tx_flags),
+      &tx_data,
+      &tx_stride_sz,
+      &function_table,
+      &func_count,
+      const_cast<int**>(&shutdown_flag),
+      &stats,
+      &num_slots
+  };
+
+  cudaLaunchCooperativeKernel(
+      reinterpret_cast<void*>(
+          cudaq::realtime::dispatch_kernel_device_call_only<
+              cudaq::realtime::CooperativeKernel>),
+      dim3(num_blocks), dim3(threads_per_block), kernel_args, 0, stream);
+}
+
+//==============================================================================
+// Graph-Based Dispatch (Proper Device-Side Graph Launch Support)
+//==============================================================================
+//
+// To use device-side cudaGraphLaunch(), the dispatch kernel itself must be
+// running inside a graph execution context. These functions create a graph
+// containing the dispatch kernel, instantiate it with cudaGraphInstantiateFlagDeviceLaunch,
+// and provide proper launch/cleanup functions.
+
+// Internal storage for graph-based dispatch context
+// Parameters must be stored persistently since the graph may execute after
+// the create function returns.
+struct cudaq_dispatch_graph_context {
+  cudaGraph_t graph;
+  cudaGraphExec_t graph_exec;
+  cudaGraphNode_t kernel_node;
+  bool is_valid;
+
+  // Persistent storage for kernel parameters (must outlive graph execution)
+  volatile std::uint64_t* rx_flags;
+  volatile std::uint64_t* tx_flags;
+  std::uint8_t* tx_data;
+  std::size_t tx_stride_sz;
+  cudaq_function_entry_t* function_table;
+  std::size_t func_count;
+  cudaq::realtime::GraphIOContext* graph_io_ctx;
+  volatile int* shutdown_flag;
+  std::uint64_t* stats;
+  std::size_t num_slots;
+};
+
+extern "C" cudaError_t cudaq_create_dispatch_graph_regular(
+    volatile std::uint64_t* rx_flags,
+    volatile std::uint64_t* tx_flags,
+    std::uint8_t* rx_data,
+    std::uint8_t* tx_data,
+    std::size_t rx_stride_sz,
+    std::size_t tx_stride_sz,
+    cudaq_function_entry_t* function_table,
+    std::size_t func_count,
+    void* graph_io_ctx_raw,
+    volatile int* shutdown_flag,
+    std::uint64_t* stats,
+    std::size_t num_slots,
+    std::uint32_t num_blocks,
+    std::uint32_t threads_per_block,
+    cudaStream_t stream,
+    cudaq_dispatch_graph_context** out_context) {
+
+  (void)rx_data;
+  (void)rx_stride_sz;
+  cudaError_t err;
+
+  // Allocate context with persistent parameter storage
+  cudaq_dispatch_graph_context* ctx = new cudaq_dispatch_graph_context();
+  ctx->is_valid = false;
+
+  // Store parameters persistently in the context
+  ctx->rx_flags = rx_flags;
+  ctx->tx_flags = tx_flags;
+  ctx->tx_data = tx_data;
+  ctx->tx_stride_sz = tx_stride_sz;
+  ctx->function_table = function_table;
+  ctx->func_count = func_count;
+  ctx->graph_io_ctx =
+      static_cast<cudaq::realtime::GraphIOContext*>(graph_io_ctx_raw);
+  ctx->shutdown_flag = shutdown_flag;
+  ctx->stats = stats;
+  ctx->num_slots = num_slots;
+
+  // Create graph
+  err = cudaGraphCreate(&ctx->graph, 0);
+  if (err != cudaSuccess) {
+    delete ctx;
+    return err;
+  }
+
+  // Set up kernel parameters - point to persistent storage in context
+  cudaKernelNodeParams kernel_params = {};
+  void* kernel_args[] = {
+      &ctx->rx_flags,
+      &ctx->tx_flags,
+      &ctx->tx_data,
+      &ctx->tx_stride_sz,
+      &ctx->function_table,
+      &ctx->func_count,
+      &ctx->graph_io_ctx,
+      &ctx->shutdown_flag,
+      &ctx->stats,
+      &ctx->num_slots
+  };
+
+  // NOTE(review): the template argument was lost to markup stripping; the
+  // non-cooperative kernel type from kernel_types.h is assumed here — confirm.
+  kernel_params.func = reinterpret_cast<void*>(
+      cudaq::realtime::dispatch_kernel_with_graph<
+          cudaq::realtime::RegularKernel>);
+  kernel_params.gridDim = dim3(num_blocks, 1, 1);
+  kernel_params.blockDim = dim3(threads_per_block, 1, 1);
+  kernel_params.sharedMemBytes = 0;
+  kernel_params.kernelParams = kernel_args;
+  kernel_params.extra = nullptr;
+
+  // Add kernel node to graph
+  err = cudaGraphAddKernelNode(&ctx->kernel_node, ctx->graph, nullptr, 0, &kernel_params);
+  if (err != cudaSuccess) {
+    cudaGraphDestroy(ctx->graph);
+    delete ctx;
+    return err;
+  }
+
+  // Instantiate with device launch flag - THIS IS THE KEY!
+  err = cudaGraphInstantiate(&ctx->graph_exec, ctx->graph,
+                             cudaGraphInstantiateFlagDeviceLaunch);
+  if (err != cudaSuccess) {
+    cudaGraphDestroy(ctx->graph);
+    delete ctx;
+    return err;
+  }
+
+  // Upload graph to device (required before device-side launch)
+  err = cudaGraphUpload(ctx->graph_exec, stream);
+  if (err != cudaSuccess) {
+    cudaGraphExecDestroy(ctx->graph_exec);
+    cudaGraphDestroy(ctx->graph);
+    delete ctx;
+    return err;
+  }
+
+  // Synchronize to ensure upload completes
+  err = cudaStreamSynchronize(stream);
+  if (err != cudaSuccess) {
+    cudaGraphExecDestroy(ctx->graph_exec);
+    cudaGraphDestroy(ctx->graph);
+    delete ctx;
+    return err;
+  }
+
+  ctx->is_valid = true;
+  *out_context = ctx;
+  return cudaSuccess;
+}
+
+extern "C" cudaError_t cudaq_launch_dispatch_graph(
+    cudaq_dispatch_graph_context* context,
+    cudaStream_t stream) {
+  if (context == nullptr || !context->is_valid) {
+    return cudaErrorInvalidValue;
+  }
+
+  // Launch the graph - now device-side cudaGraphLaunch will work!
+ return cudaGraphLaunch(context->graph_exec, stream); +} + +extern "C" cudaError_t cudaq_destroy_dispatch_graph( + cudaq_dispatch_graph_context* context) { + if (context == nullptr) { + return cudaErrorInvalidValue; + } + + cudaError_t err = cudaSuccess; + + if (context->is_valid) { + cudaError_t err1 = cudaGraphExecDestroy(context->graph_exec); + cudaError_t err2 = cudaGraphDestroy(context->graph); + if (err1 != cudaSuccess) err = err1; + else if (err2 != cudaSuccess) err = err2; + } + + delete context; + return err; +} diff --git a/realtime/scripts/build_installer.sh b/realtime/scripts/build_installer.sh new file mode 100644 index 00000000000..74277a3b5da --- /dev/null +++ b/realtime/scripts/build_installer.sh @@ -0,0 +1,168 @@ +#!/bin/bash + +# ============================================================================ # +# Copyright (c) 2022 - 2026 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. # +# ============================================================================ # + +# Installer build script for CUDA-Q realtime +# +# This script packages a CUDA-Q realtime installation into a self-extracting archive +# using makeself (https://makeself.io/). 
+# +# +# Prerequisites: +# - CUDA-Q realtime must be built (run build_cudaq.sh first) +# - makeself must be installed +# +# Environment variables: +# CUDAQ_REALTIME_INSTALL_PREFIX: Path to CUDA-Q realtime installation (default: $HOME/.cudaq_realtime) + +set -euo pipefail + +# ============================================================================ # +# Setup +# ============================================================================ # + +this_file_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cuda_variant="" +arch=$(uname -m) +output_dir="out" +install_dir="" + +usage() { + cat <<'EOF' +Usage: + bash realtime/scripts/build_installer.sh -c 12 + bash realtime/scripts/build_installer.sh -c 13 + +Options: + -c CUDA variant, 12 or 13 (required) + -o Output directory for installer (default: out) + -i Directory with built CUDA-Q realtime installation + (default: $CUDAQ_REALTIME_INSTALL_PREFIX or $HOME/.cudaq_realtime) +EOF +} + +# Parse command line arguments +__optind__=$OPTIND +OPTIND=1 +while getopts ":c:o:i:h" opt; do + case $opt in + c) cuda_variant="$OPTARG" ;; + o) output_dir="$OPTARG" ;; + i) install_dir="$OPTARG" ;; + h) + usage + exit 0 + ;; + \?) + echo "Invalid command line option -$OPTARG" >&2 + usage >&2 + exit 1 + ;; + esac +done +OPTIND=$__optind__ + +# require explicit -c option +if [ -z "$cuda_variant" ]; then + echo "Error: CUDA variant required. Use -c 12 or -c 13" >&2 + exit 1 +fi +if [ "$cuda_variant" != "12" ] && [ "$cuda_variant" != "13" ]; then + echo "Error: CUDA variant must be 12 or 13, got: $cuda_variant" >&2 + exit 1 +fi + + +installer_name=install_cuda_quantum_realtime_cu${cuda_variant}.${arch} + +echo "Building installer $installer_name for CUDA $cuda_variant on $arch..." + +if [ -z "$install_dir" ]; then + install_dir="${CUDAQ_REALTIME_INSTALL_PREFIX:-$HOME/.cudaq_realtime}" +fi + +# Verify CUDA-Q Realtime is built +if [ ! -d "$install_dir" ] || [ ! 
-f "$install_dir/lib/libcudaq-realtime.so" ]; then + echo "Error: CUDA-Q Realtime installation not found at $install_dir" >&2 + echo "Please build CUDA-Q Realtime first" >&2 + exit 1 +fi + +# Verify makeself is installed +if ! command -v makeself &>/dev/null; then + echo "Error: makeself not found" >&2 + echo "Install with: apt install makeself # or: yum install makeself" >&2 + exit 1 +fi + +echo "Using install directory: $install_dir" + +# ============================================================================ # +# Create self-extracting archive +# ============================================================================ # +mkdir -p "$output_dir" + +echo "Creating self-extracting archive..." + +declare -a makeself_args +makeself_args=(--gzip --sha256) +# Add license if available +if [ -f "$this_file_dir/../LICENSE" ]; then + makeself_args+=(--license "$this_file_dir/../LICENSE") +fi + +# Stage a clean payload directory so we don't mutate the install prefix. +staging_dir="$(mktemp -d "${TMPDIR:-/tmp}/cudaq-realtime-installer.XXXXXX")" +cleanup() { + rm -rf "$staging_dir" +} +trap cleanup EXIT + +if [ ! -f "$this_file_dir/migrate_assets.sh" ]; then + echo "Error: missing $this_file_dir/migrate_assets.sh" >&2 + exit 1 +fi + +echo "Staging payload in $staging_dir..." +mkdir -p "$staging_dir/payload" +cp -a "$install_dir/." "$staging_dir/payload/" +cp "$this_file_dir/migrate_assets.sh" "$staging_dir/payload/install.sh" +chmod a+x "$staging_dir/payload/install.sh" +cp "$this_file_dir/validate_installer.sh" "$staging_dir/payload/validate.sh" +chmod a+x "$staging_dir/payload/validate.sh" +cp "$this_file_dir/demo_docker.sh" "$staging_dir/payload/demo.sh" +chmod a+x "$staging_dir/payload/demo.sh" +cp "$this_file_dir/demo.Dockerfile" "$staging_dir/payload/demo.Dockerfile" + +# Copy the `unittests/utils`, which contains a reference implementation of the hololink wrapper, +# which is used by the install script to verify hololink functionality. 
+# Also, resolve any symlinks in the utils directory to avoid issues with makeself. +if [ -d "$this_file_dir/../unittests/utils" ]; then + mkdir -p "$staging_dir/payload/utils" + cp -aL "$this_file_dir/../unittests/utils/." "$staging_dir/payload/utils/" +fi + +# Copy all the docs, which contains the README and validation instructions. +if [ -d "$this_file_dir/../docs" ]; then + mkdir -p "$staging_dir/payload/docs" + cp -a "$this_file_dir/../docs/." "$staging_dir/payload/docs/" +fi + +# Default installation target +default_target='/opt/nvidia/cudaq/realtime' + +makeself "${makeself_args[@]}" \ + "$staging_dir/payload" \ + "$output_dir/$installer_name" \ + "CUDA-Q Realtime" \ + bash install.sh -t "$default_target" + +echo "" +echo "Done! Installer created: $output_dir/$installer_name" +echo "To install: bash $output_dir/$installer_name --accept" diff --git a/realtime/scripts/demo.Dockerfile b/realtime/scripts/demo.Dockerfile new file mode 100644 index 00000000000..c85aef3f157 --- /dev/null +++ b/realtime/scripts/demo.Dockerfile @@ -0,0 +1,21 @@ +# ============================================================================ # +# Copyright (c) 2022 - 2026 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. # +# ============================================================================ # + +# This is a Dockerfile for building a container image that includes the CUDA-Q Realtime installation. +# It uses a base image from NVIDIA's DOCA repository, which is required for running CUDA-Q Realtime applications. + +ARG DOCA_VERSION=invalid +ARG CUDA_VERSION=invalid +FROM nvcr.io/nvidia/doca/doca:${DOCA_VERSION}-full-rt-cuda${CUDA_VERSION}.0.0 + +ARG CUDAQ_REALTIME_DIR=/opt/nvidia/cudaq/realtime + +ADD . 
${CUDAQ_REALTIME_DIR} + +# Set LD_LIBRARY_PATH to include the CUDA-Q Realtime library path +ENV LD_LIBRARY_PATH="${CUDAQ_REALTIME_DIR}/lib:$LD_LIBRARY_PATH" diff --git a/realtime/scripts/demo_docker.sh b/realtime/scripts/demo_docker.sh new file mode 100644 index 00000000000..8dd44ee75d0 --- /dev/null +++ b/realtime/scripts/demo_docker.sh @@ -0,0 +1,98 @@ +#!/bin/bash + +# ============================================================================ # +# Copyright (c) 2022 - 2026 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. # +# ============================================================================ # + +# This script runs CUDA-Q realtime demo environment in a Docker container. +# It mounts the current directory, expected to be the installed CUDA-Q realtime directory, +# into the container to allow running demos and utilities from the container. + + +set -o errexit +set -o xtrace + +SCRIPT=`realpath "$0"` +HERE=`dirname "$SCRIPT"` + +# Default container name, can be overridden with --name= option. +NAME=cudaq_realtime_demo + +# See if we need to run our container differently. +while [ $# -ge 1 ] +do +case "$1" in + --name=*) + NAME="${1#--name=}" + ;; + *) + break + ;; +esac +shift +done + +# Determine the CUDA version from the host system's NVIDIA driver, to select the appropriate container image. +DRIVER_CUDA_VERSION_MAJOR=$(nvidia-smi | grep -oE "CUDA Version: [0-9]+" | awk '{print $3}' ) + +# DOCA version +DOCA_VERSION=3.3.0 + +# Only support CUDA 13 for now, as the DOCA version we use only supports CUDA 13. +if [ "$DRIVER_CUDA_VERSION_MAJOR" != "13" ]; then + echo "Warning: Detected NVIDIA driver CUDA version $DRIVER_CUDA_VERSION_MAJOR, but this demo script is designed for CUDA 13. The container image used may not be compatible with your system. 
Please ensure you have the appropriate NVIDIA driver installed for CUDA 13 to run this demo." >&2 + exit 1 +fi + +IMAGE_NAME=cudaq-realtime-demo:doca${DOCA_VERSION}-cuda${DRIVER_CUDA_VERSION_MAJOR} +# Build the Docker image for the demo environment, to set the LD_LIBRARY_PATH and ensure the correct DOCA version is used. +docker build \ + --build-arg DOCA_VERSION=${DOCA_VERSION} \ + --build-arg CUDA_VERSION=${DRIVER_CUDA_VERSION_MAJOR} \ + --build-arg CUDAQ_REALTIME_DIR=$HERE \ + -t $IMAGE_NAME \ + -f "$HERE/demo.Dockerfile" \ + "$HERE" + +# Check if $HERE is indeed the installed CUDA-Q realtime directory by looking for the `bin/`, `include/`, and `lib/` directories, and `validate.sh`. +if [ ! -d "$HERE/bin" ] || [ ! -d "$HERE/include" ] || [ ! -d "$HERE/lib" ] || [ ! -f "$HERE/validate.sh" ]; then + echo "Error: The current directory does not appear to be a valid CUDA-Q realtime installation. Please ensure you have built and installed CUDA-Q realtime, and that you are running this script from the installed directory." >&2 + exit 1 +fi + + +# Run the container with the appropriate mounts and environment variables. +# Add $ROOT/lib to LD_LIBRARY_PATH in the container to ensure it can find the CUDA-Q realtime libraries. +# The run command is adapted from NVIDIA's holoscan-sensor-bridge +# (https://github.com/nvidia-holoscan/holoscan-sensor-bridge) demo script. 
+docker run \
+    -it \
+    --rm \
+    --net host \
+    --gpus all \
+    --runtime=nvidia \
+    --shm-size=1gb \
+    --privileged \
+    --name "$NAME" \
+    -v $PWD:$PWD \
+    -v "$HERE":"$HERE" \
+    -v $HOME:$HOME \
+    -v /sys/bus/pci/devices:/sys/bus/pci/devices \
+    -v /sys/kernel/mm/hugepages:/sys/kernel/mm/hugepages \
+    -v /dev:/dev \
+    -v /tmp/.X11-unix:/tmp/.X11-unix \
+    -v /tmp/argus_socket:/tmp/argus_socket \
+    -v /sys/devices:/sys/devices \
+    -v /var/nvidia/nvcam/settings:/var/nvidia/nvcam/settings \
+    -v /opt/mellanox/doca:/opt/mellanox/doca \
+    -w $PWD \
+    -e NVIDIA_DRIVER_CAPABILITIES=graphics,video,compute,utility,display \
+    -e NVIDIA_VISIBLE_DEVICES=all \
+    -e DISPLAY=$DISPLAY \
+    -e enableRawReprocess=2 \
+    $IMAGE_NAME \
+    "$@"
diff --git a/realtime/scripts/install_dev_prerequisites.sh b/realtime/scripts/install_dev_prerequisites.sh
new file mode 100755
index 00000000000..2b5d6c11d4d
--- /dev/null
+++ b/realtime/scripts/install_dev_prerequisites.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+
+# ============================================================================ #
+# Copyright (c) 2026 NVIDIA Corporation & Affiliates.                          #
+# All rights reserved.                                                         #
+#                                                                              #
+# This source code and the accompanying materials are made available under    #
+# the terms of the Apache License 2.0 which accompanies this distribution.    #
+# ============================================================================ #
+
+# Usage:
+# This script builds and installs a minimal set of dependencies needed to build
+# CUDA-Q realtime from source with DOCA/HSB support.
+#
+# Usage:
+#   bash install_dev_prerequisites.sh
+
+
+if [ -x "$(command -v apt-get)" ]; then
+  CUDA_MAJOR_VERSION=$(nvcc --version | sed -n 's/^.*release \([0-9]\+\).*$/\1/p')
+  if [ -z "$CUDA_MAJOR_VERSION" ]; then
+    echo "Could not determine CUDA version from nvcc. Is the CUDA toolkit installed?" >&2
+    echo "CUDA-Q Realtime requires CUDA toolkit to be installed." >&2
+    exit 1
+  fi
+
+  # [libibverbs]
+  echo "Installing libibverbs..."
+  apt-get update && apt-get install -y --no-install-recommends libibverbs-dev
+
+  # [DOCA Host]
+  if [ ! -x "$(command -v curl)" ]; then
+    apt-get update && apt-get install -y --no-install-recommends curl
+  fi
+
+  DOCA_VERSION=3.3.0
+  echo "Installing DOCA version $DOCA_VERSION..."
+  arch=$(uname -m)
+  if [ "$arch" == "aarch64" ] || [ "$arch" == "arm64" ]; then
+    arch="arm64-sbsa"
+  fi
+  distro=$(. /etc/os-release && echo ${ID}${VERSION_ID}) # e.g., ubuntu24.04
+  export DOCA_URL="https://linux.mellanox.com/public/repo/doca/$DOCA_VERSION/$distro/$arch/"
+  echo "Using DOCA_REPO_LINK=${DOCA_URL}"
+  curl -fsSL https://linux.mellanox.com/public/repo/doca/GPG-KEY-Mellanox.pub | gpg --dearmor > /etc/apt/trusted.gpg.d/GPG-KEY-Mellanox.pub
+  echo "deb [signed-by=/etc/apt/trusted.gpg.d/GPG-KEY-Mellanox.pub] $DOCA_URL ./" > /etc/apt/sources.list.d/doca.list
+  apt-get update
+  DEBIAN_FRONTEND=noninteractive apt-get -y install doca-all libdoca-sdk-gpunetio-dev
+
+  # [Holoscan SDK]
+  apt-get update && apt-get install -y --no-install-recommends holoscan-cuda-$CUDA_MAJOR_VERSION
+
+elif [ -x "$(command -v dnf)" ]; then
+  echo "RHEL is not supported. Please install DOCA and Holoscan SDK manually." >&2
+else
+  echo "No supported package manager detected." >&2
+fi
diff --git a/realtime/scripts/migrate_assets.sh b/realtime/scripts/migrate_assets.sh
new file mode 100755
index 00000000000..5cf51c64e60
--- /dev/null
+++ b/realtime/scripts/migrate_assets.sh
@@ -0,0 +1,99 @@
+#!/bin/bash
+
+# ============================================================================ #
+# Copyright (c) 2022 - 2026 NVIDIA Corporation & Affiliates.                   #
+# All rights reserved.                                                         #
+#                                                                              #
+# This source code and the accompanying materials are made available under    #
+# the terms of the Apache License 2.0 which accompanies this distribution.     #
+# ============================================================================ #
+
+# This script migrates assets from an extracted installer directory to the expected locations.
+ +# Default target location: /opt/nvidia/cudaq/realtime +target=/opt/nvidia/cudaq/realtime + + +# Process command line arguments +__optind__=$OPTIND +OPTIND=1 +while getopts ":t:" opt; do + case $opt in + t) target="$OPTARG" + ;; + \?) echo "Invalid command line option -$OPTARG" >&2 + (return 0 2>/dev/null) && return 1 || exit 1 + ;; + esac +done +OPTIND=$__optind__ + +mkdir -p "$target" + +# Generate uninstall script from the full payload file list (not only what we +# move this run), so it is correct on first install and when re-running. +uninstall_script="$target/uninstall.sh" +target_quoted=$(printf '%q' "$target") + +{ + printf '#!/bin/bash\nset -euo pipefail\n\ntarget=%s\n\n' "$target_quoted" + printf 'echo "This will remove CUDA-Q Realtime files installed under $target."\n' + printf 'echo "The following files will be removed:"\n' + # List files for the user before asking for confirmation + find . -type f -print0 | while IFS= read -r -d '' file; do + [ "$file" = "./install.sh" ] && continue + # Strip leading ./ + rel="${file#./}" + printf 'echo " $target/%s"\n' "$rel" + done + printf '\nread -r -p "Continue? [y/N] " answer\n' + printf 'case "${answer,,}" in\n y|yes) ;;\n *) echo "Aborted."; exit 1 ;;\nesac\n\n' + # Removal commands for all payload files (excluding install.sh) + find . -type f -print0 | while IFS= read -r -d '' file; do + [ "$file" = "./install.sh" ] && continue + rel="${file#./}" + printf 'rm -f "$target/%s"\n' "$rel" + done + printf 'find "$target" -type d -empty -delete\n' + printf 'rm -f "$target/uninstall.sh"\n' + printf 'rmdir "$target" 2>/dev/null || true\n' +} > "$uninstall_script" +chmod a+x "$uninstall_script" + +echo "Migrating assets to $target..." + +echo "Uninstall script: $uninstall_script" + +find . -type f -print0 | while IFS= read -r -d '' file; +do + [ "$file" = "./install.sh" ] && continue + + echo "Processing $file..." + if [ ! 
-f "$target/$file" ]; then + target_path="$target/$(dirname "$file")" + mkdir -p "$target_path" + mv "$file" "$target_path/" + echo "Moved $file to $target_path/" + else + echo "File $target/$file already exists, skipping." + fi +done + +# For all files in bin/ dir, add +x permissions for the user. +find "$target/bin" -type f -exec chmod u+x {} \; + +# Done installing, print next steps +echo "Installation complete." +echo "***************************************************************" +echo "IMPORTANT: Please review the post-installation actions below to ensure your CUDA-Q Realtime installation is set up correctly and ready to use." +echo "***************************************************************" +echo "Post-installation Actions:" +echo "1. Environment Setup: the LD_LIBRARY_PATH variable needs to contain ${target}/lib." +echo "For example, you can run: " +echo "=============================================================== " +echo "export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:${target}/lib" +echo "=============================================================== " +echo "2. Validation [Recommended]: Run the included validate.sh script to verify your installation is working correctly." +echo "Alternatively, you can run the demo.sh script to run a demo application in a containerized environment that uses the installed CUDA-Q Realtime libraries." +# Guide users to read the `user_guide.md` file to validate their installation. +echo "Please read the user guide at $target/docs/user_guide.md to validate your installation and learn how to use CUDA-Q Realtime." diff --git a/realtime/scripts/validate_installer.sh b/realtime/scripts/validate_installer.sh new file mode 100644 index 00000000000..00a535f3361 --- /dev/null +++ b/realtime/scripts/validate_installer.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +# ============================================================================ # +# Copyright (c) 2022 - 2026 NVIDIA Corporation & Affiliates. # +# All rights reserved. 
# +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. # +# ============================================================================ # + +# Default installation location: /opt/nvidia/cudaq/realtime +install_dir=/opt/nvidia/cudaq/realtime + +# Check LD_LIBRARY_PATH contains the install_dir/lib path +if [[ ":$LD_LIBRARY_PATH:" != *":$install_dir/lib:"* ]]; then + echo "Warning: LD_LIBRARY_PATH does not contain $install_dir/lib. Please add it to your environment variables to ensure CUDA-Q Realtime works correctly." >&2 + echo "For example, you can run: export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:$install_dir/lib" +fi + +bin_dir="$install_dir/bin" +# Call the `hololink_test.sh` script to validate the installation, forward all the command line arguments to it. +bash "$install_dir/utils/hololink_test.sh" "$@" --bin-dir "$bin_dir" + +# Check the status of the hololink test script to determine if the validation was successful. +if [[ $? -ne 0 ]]; then + echo "Failed to validate hololink test application. Please refer to the documentation for troubleshooting." >&2 + exit 1 +fi diff --git a/realtime/unittests/CMakeLists.txt b/realtime/unittests/CMakeLists.txt new file mode 100644 index 00000000000..4e1431d7ecf --- /dev/null +++ b/realtime/unittests/CMakeLists.txt @@ -0,0 +1,85 @@ +# ============================================================================ # +# Copyright (c) 2024 - 2025 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. 
# +# ============================================================================ # + +# External Dependencies +# ============================================================================== + +FetchContent_Declare( + googletest + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG v1.17.0 + EXCLUDE_FROM_ALL +) + +set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) +set(INSTALL_GTEST OFF CACHE BOOL "" FORCE) + +FetchContent_MakeAvailable(googletest) + +# Bug in GCC 12 leads to spurious warnings (-Wrestrict) +# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105329 +if (CMAKE_COMPILER_IS_GNUCXX + AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.0.0 + AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 13.0.0) + target_compile_options(gtest PUBLIC --param=evrp-mode=legacy) +endif() +include(GoogleTest) + + +add_compile_options(-Wno-attributes) + +# ============================================================================== +# GPU Dispatch Kernel Tests +# ============================================================================== + +find_package(CUDAToolkit) +if(CMAKE_CUDA_COMPILER) + enable_language(CUDA) + + add_executable(test_dispatch_kernel test_dispatch_kernel.cu) + + set_target_properties(test_dispatch_kernel PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + CUDA_STANDARD 17 + ) + + target_include_directories(test_dispatch_kernel PRIVATE + ${CUDAToolkit_INCLUDE_DIRS} + ${CUDAQ_REALTIME_INCLUDE_DIR} + ) + + # Find CUDA device runtime library (required for device-side API calls like cudaGraphLaunch) + find_library(CUDADEVRT_LIBRARY cudadevrt + HINTS ${CUDAToolkit_LIBRARY_DIR} + REQUIRED + ) + + target_link_libraries(test_dispatch_kernel PRIVATE + GTest::gtest_main + CUDA::cudart + cudaq-realtime + cudaq-realtime-dispatch + ${CUDADEVRT_LIBRARY} + ) + + add_dependencies(CudaqRealtimeUnitTests test_dispatch_kernel) + gtest_discover_tests(test_dispatch_kernel + TEST_PREFIX "test_dispatch_kernel." 
+ ) + + message(STATUS " - test_dispatch_kernel (GPU dispatch infrastructure)") +endif() + +# ============================================================================== +# Hololink bridge/emulator/playback tools (optional, not CI) +# ============================================================================== + +if (CUDAQ_REALTIME_ENABLE_HOLOLINK_TOOLS) + add_subdirectory(utils) + add_subdirectory(bridge_interface/hololink) +endif() diff --git a/realtime/unittests/bridge_interface/hololink/CMakeLists.txt b/realtime/unittests/bridge_interface/hololink/CMakeLists.txt new file mode 100644 index 00000000000..cbc3e6be7f8 --- /dev/null +++ b/realtime/unittests/bridge_interface/hololink/CMakeLists.txt @@ -0,0 +1,67 @@ +# ============================================================================ # +# Copyright (c) 2026 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. # +# ============================================================================ # + +# Hololink bridge and playback tools +# ============================================================================== +# These targets are gated by CUDAQ_REALTIME_ENABLE_HOLOLINK_TOOLS + +# =========================================================================== # +# hololink_bridge (generic increment bridge) +# =========================================================================== # + +message(STATUS "Building hololink_bridge (generic increment)") + +# Demo application that uses the hololink bridge interface to implement a simple "increment" RPC function. +# This is the same as the one in cuda-quantum/realtime/unittests/utils +# (which is implemented end-to-end without using the bridge pluggable interface). 
+add_executable(hololink_app + hololink_bridge.cpp) + +set_target_properties(hololink_app PROPERTIES + LINKER_LANGUAGE CUDA + CUDA_SEPARABLE_COMPILATION ON + CUDA_RESOLVE_DEVICE_SYMBOLS ON) + +target_include_directories(hololink_app + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + ${CUDAQ_REALTIME_INCLUDE_DIR} + ${CUDAToolkit_INCLUDE_DIRS}) + +# Note: no direct dependency on hololink needed. +target_link_libraries(hololink_app + PRIVATE + cudaq-realtime-dispatch + cudaq-realtime-bridge-hololink + cudaq-realtime) + +# Find the rpc_increment_ft target built (device code for the increment function table) and link it to the app. +if (TARGET rpc_increment_ft) + add_dependencies(hololink_app rpc_increment_ft) + message(STATUS " - rpc_increment_ft target found, linking to app") + target_link_libraries(hololink_app PRIVATE rpc_increment_ft) +endif() + +# Find the `hololink_fpga_emulator` and `hololink_fpga_playback` targets built by the utils/ CMakeLists.txt, and link them to the app for easy invocation. +if (TARGET hololink_fpga_emulator) + add_dependencies(hololink_app hololink_fpga_emulator) + set(HOLOLINK_EMULATOR_BIN ${CMAKE_BINARY_DIR}/unittests/utils/hololink_fpga_emulator) +else() + message(WARNING "hololink_fpga_emulator target not found. " + "FPGA emulation will not be available for hololink_app.") +endif() +if (TARGET hololink_fpga_playback) + add_dependencies(hololink_app hololink_fpga_playback) + set(HOLOLINK_PLAYBACK_BIN ${CMAKE_BINARY_DIR}/unittests/utils/hololink_fpga_playback) +else() + message(WARNING "hololink_fpga_playback target not found. 
" + "FPGA playback will not be available for hololink_app.") +endif() + + +configure_file(hololink_test.sh.in hololink_test.sh @ONLY) diff --git a/realtime/unittests/bridge_interface/hololink/hololink_bridge.cpp b/realtime/unittests/bridge_interface/hololink/hololink_bridge.cpp new file mode 100644 index 00000000000..5df135ef0c9 --- /dev/null +++ b/realtime/unittests/bridge_interface/hololink/hololink_bridge.cpp @@ -0,0 +1,378 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +/// @file hololink_bridge.cpp +/// @brief Generic Hololink bridge tool for testing libcudaq-realtime dispatch. +/// +/// Registers a simple increment RPC handler (adds 1 to each byte) and wires +/// it through the Hololink GPU-RoCE Transceiver. No QEC or decoder dependency. +/// +/// Usage: +/// ./hololink_app \ +/// --device=mlx5_1 \ +/// --peer-ip=10.0.0.2 \ +/// --remote-qp=0x2 \ +/// --gpu=0 \ +/// --timeout=60 + +#include "cudaq/realtime/daemon/bridge/bridge_interface.h" +#include "cudaq/realtime/daemon/bridge/hololink/hololink_doca_transport_ctx.h" +#include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" +#include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +//============================================================================== +// Increment RPC Handler Function Table +//============================================================================== + +// The actual __device__ rpc_increment_handler lives in +// init_rpc_increment_function_table.cu (compiled by nvcc). 
We declare the +// host-callable setup function here so this .cpp can be compiled by g++. + +extern "C" void +setup_rpc_increment_function_table(cudaq_function_entry_t *d_entries); + +/// @brief Configuration for dispatch kernel setup. +struct DispatchConfig { + int gpu_id = 0; ///< GPU device ID + int timeout_sec = 60; ///< Runtime timeout in seconds + // Ring buffer sizing + size_t frame_size = 256; ///< Minimum frame size (RPCHeader + payload) + size_t page_size = + 384; ///< Ring buffer slot size (>= frame_size, 128-aligned) + unsigned num_pages = 64; ///< Number of ring buffer slots + /// @brief Dispatch kernel grid configuration. + /// Defaults match the regular (non-cooperative) kernel. + cudaq_kernel_type_t kernel_type = CUDAQ_KERNEL_REGULAR; + uint32_t num_blocks = 1; + uint32_t threads_per_block = 32; + // Forward mode: use Hololink's built-in forward kernel (echo) instead of + // separate RX + dispatch + TX kernels. Useful for baseline latency testing. + bool forward = false; + + // Unified dispatch mode: single kernel combines RDMA RX, RPC dispatch, and + // RDMA TX via direct DOCA verbs calls. Eliminates the inter-kernel flag + // handoff overhead of the 3-kernel path. Regular handlers only. 
+ bool unified = false; +}; + +void parse_bridge_args(int argc, char *argv[], DispatchConfig &config) { + for (int i = 1; i < argc; i++) { + std::string arg = argv[i]; + if (arg.find("--gpu=") == 0) + config.gpu_id = std::stoi(arg.substr(6)); + else if (arg.find("--timeout=") == 0) + config.timeout_sec = std::stoi(arg.substr(10)); + else if (arg == "--forward") + config.forward = true; + else if (arg == "--unified") + config.unified = true; + } +} + +#define HANDLE_CUDAQ_REALTIME_ERROR(x) \ + { \ + const auto err = x; \ + if (err != CUDAQ_OK) { \ + std::stringstream ss; \ + ss << "CUDAQ realtime error at " << __FILE__ << ":" << __LINE__ << ": " \ + << err << std::endl; \ + throw std::runtime_error(ss.str()); \ + } \ + } + +//============================================================================== +// CUDA Error Checking +//============================================================================== + +#ifndef BRIDGE_CUDA_CHECK +#define BRIDGE_CUDA_CHECK(call) \ + do { \ + cudaError_t err = call; \ + if (err != cudaSuccess) { \ + std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ << ": " \ + << cudaGetErrorString(err) << std::endl; \ + return 1; \ + } \ + } while (0) +#endif + +std::atomic &bridge_shutdown_flag() { + static std::atomic flag{false}; + return flag; +} +void bridge_signal_handler(int) { bridge_shutdown_flag() = true; } + +//============================================================================== +// Main +//============================================================================== + +int main(int argc, char *argv[]) { + // Check for help + for (int i = 1; i < argc; i++) { + std::string arg = argv[i]; + if (arg == "--help" || arg == "-h") { + std::cout + << "Usage: " << argv[0] << " [options]\n" + << "\n" + << "Generic Hololink bridge for testing libcudaq-realtime dispatch.\n" + << "Registers increment handler (adds 1 to each byte of the RPC " + "payload).\n" + << "\n" + << "Options:\n" + << " --device=NAME IB device 
(default: rocep1s0f0)\n" + << " --peer-ip=ADDR FPGA/emulator IP (default: 10.0.0.2)\n" + << " --remote-qp=N Remote QP number (default: 0x2)\n" + << " --gpu=N GPU device ID (default: 0)\n" + << " --timeout=N Timeout in seconds (default: 60)\n" + << " --payload-size=N RPC payload size in bytes (default: 8)\n" + << " --page-size=N Ring buffer slot size (default: 384)\n" + << " --num-pages=N Number of ring buffer slots (default: " + "64)\n" + << " --exchange-qp Enable QP exchange protocol\n" + << " --exchange-port=N TCP port for QP exchange (default: " + "12345)\n" + << " --forward Use Hololink forward kernel (echo) " + "instead of dispatch\n" + << " --unified Use unified dispatch kernel (RX + " + "dispatch + TX in one kernel)\n"; + return 0; + } + } + + try { + signal(SIGINT, bridge_signal_handler); + signal(SIGTERM, bridge_signal_handler); + auto &g_shutdown = bridge_shutdown_flag(); + std::cout << "=== Hololink Generic Bridge ===" << std::endl; + // Allocate control variables (shutdown flag) + void *tmp_shutdown = nullptr; + BRIDGE_CUDA_CHECK( + cudaHostAlloc(&tmp_shutdown, sizeof(int), cudaHostAllocMapped)); + void *tmp_d_shutdown = nullptr; + BRIDGE_CUDA_CHECK( + cudaHostGetDevicePointer(&tmp_d_shutdown, tmp_shutdown, 0)); + volatile int *d_shutdown_flag = static_cast(tmp_d_shutdown); + volatile int *shutdown_flag = static_cast(tmp_shutdown); + + // CUDA-Q realtime variables + uint64_t *d_stats = nullptr; + BRIDGE_CUDA_CHECK(cudaMalloc(&d_stats, sizeof(uint64_t))); + BRIDGE_CUDA_CHECK(cudaMemset(d_stats, 0, sizeof(uint64_t))); + cudaq_function_entry_t *d_function_entries = nullptr; + cudaq_dispatch_manager_t *manager = nullptr; + cudaq_dispatcher_t *dispatcher = nullptr; + + //============================================================================ + // Parse configurations from args + //============================================================================ + DispatchConfig config; + parse_bridge_args(argc, argv, config); + 
//============================================================================ + // Set up the Hololink bridge + //============================================================================ + cudaq_realtime_bridge_handle_t bridge_handle = nullptr; + HANDLE_CUDAQ_REALTIME_ERROR(cudaq_bridge_create( + &bridge_handle, CUDAQ_PROVIDER_HOLOLINK, argc, argv)); + std::cout << "Bridge created successfully. Connecting..." << std::endl; + + if (!config.forward) { + if (!config.unified) { + int dispatch_blocks = 0; + cudaError_t occ_err; + if (config.kernel_type == CUDAQ_KERNEL_COOPERATIVE) { + occ_err = cudaq_dispatch_kernel_cooperative_query_occupancy( + &dispatch_blocks, config.threads_per_block); + } else { + occ_err = cudaq_dispatch_kernel_query_occupancy(&dispatch_blocks, 1); + } + if (occ_err != cudaSuccess) { + std::cerr << "ERROR: Dispatch kernel occupancy query failed: " + << cudaGetErrorString(occ_err) << std::endl; + return 1; + } + std::cout << " Dispatch kernel occupancy: " << dispatch_blocks + << " blocks/SM" << std::endl; + } + + HANDLE_CUDAQ_REALTIME_ERROR(cudaq_bridge_connect(bridge_handle)); + + std::cout << "\nWiring dispatch kernel (" + << (config.unified ? "unified" : "3-kernel") << ")..." 
+ << std::endl; + + *shutdown_flag = 0; + int zero = 0; + BRIDGE_CUDA_CHECK(cudaMemcpy(const_cast(d_shutdown_flag), &zero, + sizeof(int), cudaMemcpyHostToDevice)); + + // Create CUDA-Q dispatcher manager + HANDLE_CUDAQ_REALTIME_ERROR(cudaq_dispatch_manager_create(&manager)); + cudaq_dispatcher_config_t dconfig{}; + dconfig.device_id = config.gpu_id; + dconfig.vp_id = 0; + dconfig.dispatch_mode = CUDAQ_DISPATCH_DEVICE_CALL; + + if (config.unified) { + dconfig.kernel_type = CUDAQ_KERNEL_UNIFIED; + dconfig.num_blocks = 1; + dconfig.threads_per_block = 1; + dconfig.num_slots = 0; + dconfig.slot_size = 0; + } else { + dconfig.kernel_type = config.kernel_type; + dconfig.num_blocks = config.num_blocks; + dconfig.threads_per_block = config.threads_per_block; + dconfig.num_slots = static_cast(config.num_pages); + dconfig.slot_size = static_cast(config.page_size); + } + + // Create dispatcher with the above config + HANDLE_CUDAQ_REALTIME_ERROR( + cudaq_dispatcher_create(manager, &dconfig, &dispatcher)); + + // Transport context for unified mode (must outlive the dispatcher) + cudaq_unified_dispatch_ctx_t unified_dispatch{}; + + if (config.unified) { + std::cout << "Retrieving the unified dispatch context ..." << std::endl; + HANDLE_CUDAQ_REALTIME_ERROR(cudaq_bridge_get_transport_context( + bridge_handle, UNIFIED, &unified_dispatch)); + if (cudaq_dispatcher_set_unified_launch( + dispatcher, unified_dispatch.launch_fn, + unified_dispatch.transport_ctx) != CUDAQ_OK) { + std::cerr << "ERROR: Failed to set unified launch function" + << std::endl; + return 1; + } + } else { + std::cout << "Retrieving the ring buffer ..." 
<< std::endl; + + cudaq_ringbuffer_t ringbuffer{}; + HANDLE_CUDAQ_REALTIME_ERROR(cudaq_bridge_get_transport_context( + bridge_handle, RING_BUFFER, &ringbuffer)); + if (cudaq_dispatcher_set_ringbuffer(dispatcher, &ringbuffer) != + CUDAQ_OK) { + std::cerr << "ERROR: Failed to set ringbuffer" << std::endl; + return 1; + } + + if (cudaq_dispatcher_set_launch_fn( + dispatcher, &cudaq_launch_dispatch_kernel_regular) != + CUDAQ_OK) { + std::cerr << "ERROR: Failed to set launch function" << std::endl; + return 1; + } + } + + // Set up the function table with the increment handler entries + // Populate the GPU function table with the increment handler entry + BRIDGE_CUDA_CHECK( + cudaMalloc(&d_function_entries, sizeof(cudaq_function_entry_t))); + setup_rpc_increment_function_table(d_function_entries); + // Create a function table struct to pass to the dispatcher + cudaq_function_table_t table{}; + table.entries = d_function_entries; + table.count = 1; // Only one handler (increment) + // Set the function table for the dispatcher + HANDLE_CUDAQ_REALTIME_ERROR( + cudaq_dispatcher_set_function_table(dispatcher, &table)); + // Set the control variables (shutdown flag and stats pointer) for the + // dispatcher + HANDLE_CUDAQ_REALTIME_ERROR( + cudaq_dispatcher_set_control(dispatcher, d_shutdown_flag, d_stats)); + + // Start the dispatcher (launches the dispatch kernel) + HANDLE_CUDAQ_REALTIME_ERROR(cudaq_dispatcher_start(dispatcher)); + std::cout << " Dispatch kernel launched" << std::endl; + } else { + std::cout << "\n[4/5] Forward mode -- skipping dispatch kernel" + << std::endl; + } + + // Launch Hololink kernels and run + std::cout << "\n[5/5] Launching Hololink kernels...\n"; + HANDLE_CUDAQ_REALTIME_ERROR(cudaq_bridge_launch(bridge_handle)); + //============================================================================ + // Main run loop + //============================================================================ + cudaStream_t diag_stream = nullptr; + 
BRIDGE_CUDA_CHECK( + cudaStreamCreateWithFlags(&diag_stream, cudaStreamNonBlocking)); + + auto start_time = std::chrono::steady_clock::now(); + uint64_t last_processed = 0; + + while (!g_shutdown) { + auto elapsed = std::chrono::duration_cast( + std::chrono::steady_clock::now() - start_time) + .count(); + if (elapsed > config.timeout_sec) { + std::cout << "\nTimeout reached (" << config.timeout_sec << "s)" + << std::endl; + break; + } + + // Progress report every 5 seconds + if (!config.forward && d_stats && elapsed > 0 && elapsed % 5 == 0) { + uint64_t processed = 0; + cudaMemcpyAsync(&processed, d_stats, sizeof(uint64_t), + cudaMemcpyDeviceToHost, diag_stream); + cudaStreamSynchronize(diag_stream); + if (processed != last_processed) { + std::cout << " [" << elapsed << "s] Processed " << processed + << " packets" << std::endl; + last_processed = processed; + } + } + + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + } + + //============================================================================ + // Shutdown + //============================================================================ + std::cout << "\n=== Shutting down ===" << std::endl; + if (!config.forward) { + *shutdown_flag = 1; + __sync_synchronize(); + cudaq_dispatcher_stop(dispatcher); + + uint64_t total_processed = 0; + cudaq_dispatcher_get_processed(dispatcher, &total_processed); + std::cout << " Total packets processed (dispatch RX): " + << total_processed << std::endl; + } + std::cout << " Disconnecting bridge..." << std::endl; + HANDLE_CUDAQ_REALTIME_ERROR(cudaq_bridge_disconnect(bridge_handle)); + // Clean up + BRIDGE_CUDA_CHECK(cudaFree(d_function_entries)); + BRIDGE_CUDA_CHECK(cudaFree(d_stats)); + if (dispatcher) + cudaq_dispatcher_destroy(dispatcher); + if (manager) + cudaq_dispatch_manager_destroy(manager); + std::cout << " Destroying bridge..." << std::endl; + HANDLE_CUDAQ_REALTIME_ERROR(cudaq_bridge_destroy(bridge_handle)); + std::cout << "Bridge shut down successfully." 
<< std::endl; + if (shutdown_flag) + BRIDGE_CUDA_CHECK(cudaFreeHost(const_cast(shutdown_flag))); + } catch (const std::exception &e) { + std::cerr << "ERROR: " << e.what() << std::endl; + return 1; + } +} diff --git a/realtime/unittests/bridge_interface/hololink/hololink_test.sh.in b/realtime/unittests/bridge_interface/hololink/hololink_test.sh.in new file mode 100755 index 00000000000..fa00e6d169b --- /dev/null +++ b/realtime/unittests/bridge_interface/hololink/hololink_test.sh.in @@ -0,0 +1,231 @@ +#!/bin/bash +# ============================================================================ # +# Copyright (c) 2026 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. # +# ============================================================================ # +# +# hololink_test.sh +# +# Orchestration script for end-to-end Hololink RPC dispatch testing. +# Tests libcudaq-realtime dispatch kernel over Hololink RDMA with a +# simple increment RPC handler (no QEC or decoder dependency). 
+# + +set -euo pipefail + +# ============================================================================ +# Defaults +# ============================================================================ + +VERIFY=true + +# Network defaults +IB_DEVICE="" # auto-detect +BRIDGE_IP="10.0.0.1" +EMULATOR_IP="10.0.0.2" +MTU=4096 + +# Run defaults +GPU_ID=0 +TIMEOUT=60 +NUM_SHOTS=100 +PAYLOAD_SIZE=8 +PAGE_SIZE=384 +NUM_PAGES=64 +CONTROL_PORT=8193 + +# Build parallelism +JOBS=$(nproc 2>/dev/null || echo 8) + + +# ============================================================================ +# Auto-detect IB device +# ============================================================================ + +detect_ib_device() { + if [[ -n "$IB_DEVICE" ]]; then + echo "$IB_DEVICE" + return + fi + local dev + dev=$(ibstat -l 2>/dev/null | head -1 || true) + if [[ -z "$dev" ]]; then + dev=$(ls /sys/class/infiniband/ 2>/dev/null | head -1 || true) + fi + if [[ -z "$dev" ]]; then + echo "ERROR: Could not auto-detect IB device. Use --device." 
>&2 + exit 1 + fi + echo "$dev" +} + +# ============================================================================ +# Network interface name from IB device +# ============================================================================ + +get_netdev() { + local ib_dev=$1 + local netdev + netdev=$(ls "/sys/class/infiniband/$ib_dev/device/net/" 2>/dev/null | head -1 || true) + echo "$netdev" +} + +# ============================================================================ +# Network setup +# ============================================================================ + +do_setup_network() { + IB_DEVICE=$(detect_ib_device) + local netdev + netdev=$(get_netdev "$IB_DEVICE") + + echo "=== Setting up network ===" + echo " IB device: $IB_DEVICE" + echo " Net device: $netdev" + + if [[ -z "$netdev" ]]; then + echo "ERROR: No network device found for $IB_DEVICE" >&2 + exit 1 + fi + + sudo ip link set "$netdev" up mtu "$MTU" || true + sudo ip addr add "$BRIDGE_IP/24" dev "$netdev" 2>/dev/null || true + + sudo ip addr add "$EMULATOR_IP/24" dev "$netdev" 2>/dev/null || true + # Add static ARP entries + sudo ip neigh replace "$BRIDGE_IP" lladdr "$(cat /sys/class/net/$netdev/address)" dev "$netdev" nud permanent 2>/dev/null || true + sudo ip neigh replace "$EMULATOR_IP" lladdr "$(cat /sys/class/net/$netdev/address)" dev "$netdev" nud permanent 2>/dev/null || true + + + echo "=== Network setup complete ===" +} + +# ============================================================================ +# Run +# ============================================================================ + +cleanup_pids() { + for pid in "${PIDS[@]}"; do + if kill -0 "$pid" 2>/dev/null; then + kill "$pid" 2>/dev/null || true + wait "$pid" 2>/dev/null || true + fi + done +} + +do_run() { + IB_DEVICE=$(detect_ib_device) + + local bridge_bin="@CMAKE_CURRENT_BINARY_DIR@/hololink_app" + local emulator_bin="@HOLOLINK_EMULATOR_BIN@" + local playback_bin="@HOLOLINK_PLAYBACK_BIN@" + + + PIDS=() + trap 
cleanup_pids EXIT
+
+
+  echo "=== Emulated mode ==="
+
+  # Start emulator
+  echo "--- Starting emulator ---"
+  "$emulator_bin" \
+    --device="$IB_DEVICE" \
+    --port="$CONTROL_PORT" \
+    --bridge-ip="$BRIDGE_IP" \
+    --page-size="$PAGE_SIZE" \
+    2>&1 | tee /tmp/emulator.log &
+  PIDS+=($!)
+
+  # Wait for emulator to print QP number
+  sleep 2
+  FPGA_QP=$(grep -oP 'QP Number: 0x\K[0-9a-fA-F]+' /tmp/emulator.log | head -1)
+  if [[ -z "$FPGA_QP" ]]; then
+    echo "ERROR: Could not parse emulator QP from log" >&2
+    exit 1
+  fi
+  FPGA_QP="0x$FPGA_QP"
+  FPGA_TARGET_IP="$EMULATOR_IP"
+
+  echo "  Emulator QP: $FPGA_QP"
+
+
+  # Start bridge
+  echo "--- Starting bridge ---"
+  export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+$LD_LIBRARY_PATH:}@CMAKE_BINARY_DIR@/lib"
+  "$bridge_bin" \
+    --device="$IB_DEVICE" \
+    --peer-ip="$FPGA_TARGET_IP" \
+    --remote-qp="$FPGA_QP" \
+    --gpu="$GPU_ID" \
+    --timeout="$TIMEOUT" \
+    --page-size="$PAGE_SIZE" \
+    --num-pages="$NUM_PAGES" \
+    2>&1 | tee /tmp/bridge.log &
+  PIDS+=($!)
+
+  # Wait for bridge to print QP info
+  sleep 3
+  local BRIDGE_QP BRIDGE_RKEY BRIDGE_BUFFER
+  BRIDGE_QP=$(grep -oP 'QP Number: 0x\K[0-9a-fA-F]+' /tmp/bridge.log | tail -1)
+  BRIDGE_RKEY=$(grep -oP 'RKey: \K[0-9]+' /tmp/bridge.log | tail -1)
+  BRIDGE_BUFFER=$(grep -oP 'Buffer Addr: 0x\K[0-9a-fA-F]+' /tmp/bridge.log | tail -1)
+
+  if [[ -z "$BRIDGE_QP" || -z "$BRIDGE_RKEY" || -z "$BRIDGE_BUFFER" ]]; then
+    echo "ERROR: Could not parse bridge QP info from log" >&2
+    echo "  QP=$BRIDGE_QP RKEY=$BRIDGE_RKEY BUFFER=$BRIDGE_BUFFER" >&2
+    exit 1
+  fi
+
+  echo "  Bridge QP: 0x$BRIDGE_QP"
+  echo "  Bridge RKey: $BRIDGE_RKEY"
+  echo "  Bridge Buffer: 0x$BRIDGE_BUFFER"
+
+  # Start playback
+  echo "--- Starting playback ---"
+  local verify_flag=""
+  if ! 
$VERIFY; then + verify_flag="--no-verify" + fi + + "$playback_bin" \ + --control-ip="$FPGA_TARGET_IP" \ + --control-port="$CONTROL_PORT" \ + --bridge-qp="0x$BRIDGE_QP" \ + --bridge-rkey="$BRIDGE_RKEY" \ + --bridge-buffer="0x$BRIDGE_BUFFER" \ + --page-size="$PAGE_SIZE" \ + --num-pages="$NUM_PAGES" \ + --num-shots="$NUM_SHOTS" \ + --payload-size="$PAYLOAD_SIZE" \ + --bridge-ip="$BRIDGE_IP" \ + $verify_flag + PLAYBACK_EXIT=$? + + # Wait for bridge to finish + sleep 2 + + # Cleanup + cleanup_pids + + echo "" + if [[ $PLAYBACK_EXIT -eq 0 ]]; then + echo "*** TEST PASSED ***" + else + echo "*** TEST FAILED ***" + fi + exit $PLAYBACK_EXIT +} + +# ============================================================================ +# Main +# ============================================================================ + +echo "=== Hololink Generic RPC Test ===" +echo "Mode: emulated" + +do_run +echo "Done." diff --git a/realtime/unittests/test_dispatch_kernel.cu b/realtime/unittests/test_dispatch_kernel.cu new file mode 100644 index 00000000000..bef7e049d89 --- /dev/null +++ b/realtime/unittests/test_dispatch_kernel.cu @@ -0,0 +1,727 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
* + ******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include + +#include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" +#include "cudaq/realtime/daemon/dispatcher/kernel_types.h" +#include "cudaq/realtime/daemon/dispatcher/dispatch_modes.h" +#include "cudaq/realtime/daemon/dispatcher/dispatch_kernel.cuh" + +// Helper macro for CUDA error checking +#define CUDA_CHECK(call) \ + do { \ + cudaError_t err = call; \ + ASSERT_EQ(err, cudaSuccess) << "CUDA error: " << cudaGetErrorString(err); \ + } while (0) + +namespace { + +//============================================================================== +// Test Handler: Simple noop that copies input to output +//============================================================================== + +/// @brief Test handler that adds 1 to each byte. +__device__ int increment_handler(const void* input, void* output, + std::uint32_t arg_len, + std::uint32_t max_result_len, + std::uint32_t* result_len) { + const std::uint8_t* in_data = static_cast(input); + std::uint8_t* out_data = static_cast(output); + for (std::uint32_t i = 0; i < arg_len && i < max_result_len; ++i) { + out_data[i] = in_data[i] + 1; + } + *result_len = arg_len; + return 0; +} + +//============================================================================== +// Host API Dispatch Kernel Test Helpers +//============================================================================== + +constexpr std::uint32_t RPC_INCREMENT_FUNCTION_ID = + cudaq::realtime::fnv1a_hash("rpc_increment"); + +__device__ int rpc_increment_handler(const void* input, void* output, + std::uint32_t arg_len, + std::uint32_t max_result_len, + std::uint32_t* result_len) { + const std::uint8_t* in_data = static_cast(input); + std::uint8_t* out_data = static_cast(output); + for (std::uint32_t i = 0; i < arg_len && i < max_result_len; ++i) { + out_data[i] = static_cast(in_data[i] + 1); + } + 
*result_len = arg_len; + return 0; +} + +__global__ void init_rpc_function_table(cudaq_function_entry_t* entries) { + if (threadIdx.x == 0 && blockIdx.x == 0) { + entries[0].handler.device_fn_ptr = reinterpret_cast(&rpc_increment_handler); + entries[0].function_id = RPC_INCREMENT_FUNCTION_ID; + entries[0].dispatch_mode = CUDAQ_DISPATCH_DEVICE_CALL; + entries[0].reserved[0] = 0; + entries[0].reserved[1] = 0; + entries[0].reserved[2] = 0; + + // Schema: 1 array argument (uint8), 1 array result (uint8) + entries[0].schema.num_args = 1; + entries[0].schema.num_results = 1; + entries[0].schema.reserved = 0; + entries[0].schema.args[0].type_id = CUDAQ_TYPE_ARRAY_UINT8; + entries[0].schema.args[0].reserved[0] = 0; + entries[0].schema.args[0].reserved[1] = 0; + entries[0].schema.args[0].reserved[2] = 0; + entries[0].schema.args[0].size_bytes = 0; // Variable size + entries[0].schema.args[0].num_elements = 0; // Variable size + entries[0].schema.results[0].type_id = CUDAQ_TYPE_ARRAY_UINT8; + entries[0].schema.results[0].reserved[0] = 0; + entries[0].schema.results[0].reserved[1] = 0; + entries[0].schema.results[0].reserved[2] = 0; + entries[0].schema.results[0].size_bytes = 0; // Variable size + entries[0].schema.results[0].num_elements = 0; // Variable size + } +} + +bool allocate_ring_buffer(std::size_t num_slots, std::size_t slot_size, + volatile uint64_t** host_flags_out, + volatile uint64_t** device_flags_out, + std::uint8_t** host_data_out, + std::uint8_t** device_data_out) { + void* host_flags_ptr = nullptr; + cudaError_t err = cudaHostAlloc(&host_flags_ptr, + num_slots * sizeof(uint64_t), + cudaHostAllocMapped); + if (err != cudaSuccess) + return false; + + void* device_flags_ptr = nullptr; + err = cudaHostGetDevicePointer(&device_flags_ptr, host_flags_ptr, 0); + if (err != cudaSuccess) { + cudaFreeHost(host_flags_ptr); + return false; + } + + void* host_data_ptr = nullptr; + err = cudaHostAlloc(&host_data_ptr, + num_slots * slot_size, + cudaHostAllocMapped); + if 
(err != cudaSuccess) { + cudaFreeHost(host_flags_ptr); + return false; + } + + void* device_data_ptr = nullptr; + err = cudaHostGetDevicePointer(&device_data_ptr, host_data_ptr, 0); + if (err != cudaSuccess) { + cudaFreeHost(host_flags_ptr); + cudaFreeHost(host_data_ptr); + return false; + } + + memset(host_flags_ptr, 0, num_slots * sizeof(uint64_t)); + + *host_flags_out = static_cast(host_flags_ptr); + *device_flags_out = static_cast(device_flags_ptr); + *host_data_out = static_cast(host_data_ptr); + *device_data_out = static_cast(device_data_ptr); + return true; +} + +void free_ring_buffer(volatile uint64_t* host_flags, + std::uint8_t* host_data) { + if (host_flags) + cudaFreeHost(const_cast(host_flags)); + if (host_data) + cudaFreeHost(host_data); +} + +extern "C" void launch_dispatch_kernel_wrapper( + volatile std::uint64_t* rx_flags, + volatile std::uint64_t* tx_flags, + std::uint8_t* rx_data, + std::uint8_t* tx_data, + std::size_t rx_stride_sz, + std::size_t tx_stride_sz, + cudaq_function_entry_t* function_table, + std::size_t func_count, + volatile int* shutdown_flag, + std::uint64_t* stats, + std::size_t num_slots, + std::uint32_t num_blocks, + std::uint32_t threads_per_block, + cudaStream_t stream) { + cudaq_launch_dispatch_kernel_regular( + rx_flags, tx_flags, rx_data, tx_data, rx_stride_sz, tx_stride_sz, + function_table, func_count, + shutdown_flag, stats, num_slots, num_blocks, threads_per_block, stream); +} + +//============================================================================== +// Test Kernel for DeviceCallMode +//============================================================================== + +using HandlerFunc = int (*)(const void*, void*, std::uint32_t, std::uint32_t, std::uint32_t*); + +__device__ HandlerFunc d_increment_handler = increment_handler; + +/// @brief Test kernel that dispatches to a handler using DeviceCallMode. 
+template +__global__ void test_dispatch_kernel( + HandlerFunc handler, + const void* input, + void* output, + std::uint32_t arg_len, + std::uint32_t max_result_len, + std::uint32_t* result_len, + int* status) { + + if (threadIdx.x == 0 && blockIdx.x == 0) { + *status = handler(input, output, arg_len, max_result_len, result_len); + } + + KernelType::sync(); +} + +//============================================================================== +// Test Fixture +//============================================================================== + +class DispatchKernelTest : public ::testing::Test { +protected: + void SetUp() override { + CUDA_CHECK(cudaMalloc(&d_buffer_, 1024)); + CUDA_CHECK(cudaMalloc(&d_result_len_, sizeof(std::uint32_t))); + CUDA_CHECK(cudaMalloc(&d_status_, sizeof(int))); + } + + void TearDown() override { + if (d_buffer_) cudaFree(d_buffer_); + if (d_result_len_) cudaFree(d_result_len_); + if (d_status_) cudaFree(d_status_); + } + + void* d_buffer_ = nullptr; + std::uint32_t* d_result_len_ = nullptr; + int* d_status_ = nullptr; +}; + +//============================================================================== +// Tests +//============================================================================== + +TEST_F(DispatchKernelTest, IncrementHandlerBasic) { + // Prepare test data - separate input and output buffers + std::vector input = {0, 1, 2, 3, 4}; + std::vector expected = {1, 2, 3, 4, 5}; + + void* d_input = nullptr; + CUDA_CHECK(cudaMalloc(&d_input, 1024)); + CUDA_CHECK(cudaMemcpy(d_input, input.data(), input.size(), + cudaMemcpyHostToDevice)); + + // Get device function pointer + HandlerFunc h_handler; + CUDA_CHECK(cudaMemcpyFromSymbol(&h_handler, d_increment_handler, + sizeof(HandlerFunc))); + + // Launch kernel with separate input/output buffers + test_dispatch_kernel<<<1, 32>>>( + h_handler, d_input, d_buffer_, input.size(), 1024, d_result_len_, d_status_); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); + + // 
Check results + int status; + std::uint32_t result_len; + CUDA_CHECK(cudaMemcpy(&status, d_status_, sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(&result_len, d_result_len_, sizeof(std::uint32_t), + cudaMemcpyDeviceToHost)); + + EXPECT_EQ(status, 0) << "Handler should return success"; + EXPECT_EQ(result_len, input.size()) << "Result length should match input"; + + // Verify output buffer has incremented data + std::vector output(input.size()); + CUDA_CHECK(cudaMemcpy(output.data(), d_buffer_, output.size(), + cudaMemcpyDeviceToHost)); + EXPECT_EQ(expected, output) << "Increment handler should add 1 to each byte"; + + // Verify input buffer is unchanged + std::vector input_readback(input.size()); + CUDA_CHECK(cudaMemcpy(input_readback.data(), d_input, input.size(), + cudaMemcpyDeviceToHost)); + EXPECT_EQ(input, input_readback) << "Input buffer should be unchanged"; + + cudaFree(d_input); +} + +TEST_F(DispatchKernelTest, LargeBuffer) { + // Test with larger data - separate input/output buffers + const std::size_t size = 512; + std::vector input(size); + for (std::size_t i = 0; i < size; ++i) { + input[i] = static_cast(i & 0xFF); + } + + void* d_input = nullptr; + CUDA_CHECK(cudaMalloc(&d_input, 1024)); + CUDA_CHECK(cudaMemcpy(d_input, input.data(), input.size(), + cudaMemcpyHostToDevice)); + + HandlerFunc h_handler; + CUDA_CHECK(cudaMemcpyFromSymbol(&h_handler, d_increment_handler, + sizeof(HandlerFunc))); + + test_dispatch_kernel<<<1, 256>>>( + h_handler, d_input, d_buffer_, input.size(), 1024, d_result_len_, d_status_); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); + + std::uint32_t result_len; + CUDA_CHECK(cudaMemcpy(&result_len, d_result_len_, sizeof(std::uint32_t), + cudaMemcpyDeviceToHost)); + EXPECT_EQ(result_len, size) << "Should process all bytes"; + + // Verify all bytes incremented in output buffer + std::vector output(size); + CUDA_CHECK(cudaMemcpy(output.data(), d_buffer_, output.size(), + 
cudaMemcpyDeviceToHost)); + + for (std::size_t i = 0; i < size; ++i) { + uint8_t expected = static_cast((i + 1) & 0xFF); + EXPECT_EQ(output[i], expected) << "Mismatch at index " << i; + } + + cudaFree(d_input); +} + +class HostApiDispatchTest : public ::testing::Test { +protected: + void SetUp() override { + ASSERT_TRUE(allocate_ring_buffer(num_slots_, slot_size_, &rx_flags_host_, + &rx_flags_, &rx_data_host_, &rx_data_)); + ASSERT_TRUE(allocate_ring_buffer(num_slots_, slot_size_, &tx_flags_host_, + &tx_flags_, &tx_data_host_, &tx_data_)); + + void* tmp_shutdown = nullptr; + CUDA_CHECK(cudaHostAlloc(&tmp_shutdown, sizeof(int), cudaHostAllocMapped)); + shutdown_flag_ = static_cast(tmp_shutdown); + void* tmp_d_shutdown = nullptr; + CUDA_CHECK(cudaHostGetDevicePointer(&tmp_d_shutdown, tmp_shutdown, 0)); + d_shutdown_flag_ = static_cast(tmp_d_shutdown); + *shutdown_flag_ = 0; + int zero = 0; + CUDA_CHECK(cudaMemcpy(const_cast(d_shutdown_flag_), &zero, + sizeof(int), cudaMemcpyHostToDevice)); + + CUDA_CHECK(cudaMalloc(&d_stats_, sizeof(uint64_t))); + CUDA_CHECK(cudaMemset(d_stats_, 0, sizeof(uint64_t))); + + CUDA_CHECK(cudaMalloc(&d_function_entries_, sizeof(cudaq_function_entry_t))); + init_rpc_function_table<<<1, 1>>>(d_function_entries_); + CUDA_CHECK(cudaDeviceSynchronize()); + func_count_ = 1; + + ASSERT_EQ(cudaq_dispatch_manager_create(&manager_), CUDAQ_OK); + cudaq_dispatcher_config_t config{}; + config.device_id = 0; + config.num_blocks = 1; + config.threads_per_block = 64; + config.num_slots = static_cast(num_slots_); + config.slot_size = static_cast(slot_size_); + config.vp_id = 0; + config.kernel_type = CUDAQ_KERNEL_REGULAR; + config.dispatch_mode = CUDAQ_DISPATCH_DEVICE_CALL; + ASSERT_EQ(cudaq_dispatcher_create(manager_, &config, &dispatcher_), CUDAQ_OK); + + cudaq_ringbuffer_t ringbuffer{}; + ringbuffer.rx_flags = rx_flags_; + ringbuffer.tx_flags = tx_flags_; + ringbuffer.rx_data = rx_data_; + ringbuffer.tx_data = tx_data_; + ringbuffer.rx_stride_sz = 
slot_size_; + ringbuffer.tx_stride_sz = slot_size_; + ASSERT_EQ(cudaq_dispatcher_set_ringbuffer(dispatcher_, &ringbuffer), CUDAQ_OK); + + cudaq_function_table_t table{}; + table.entries = d_function_entries_; + table.count = func_count_; + ASSERT_EQ(cudaq_dispatcher_set_function_table(dispatcher_, &table), CUDAQ_OK); + + ASSERT_EQ( + cudaq_dispatcher_set_control(dispatcher_, d_shutdown_flag_, d_stats_), + CUDAQ_OK); + ASSERT_EQ(cudaq_dispatcher_set_launch_fn(dispatcher_, + &launch_dispatch_kernel_wrapper), + CUDAQ_OK); + ASSERT_EQ(cudaq_dispatcher_start(dispatcher_), CUDAQ_OK); + } + + void TearDown() override { + if (shutdown_flag_) { + *shutdown_flag_ = 1; + __sync_synchronize(); + } + if (dispatcher_) { + cudaq_dispatcher_stop(dispatcher_); + cudaq_dispatcher_destroy(dispatcher_); + dispatcher_ = nullptr; + } + if (manager_) { + cudaq_dispatch_manager_destroy(manager_); + manager_ = nullptr; + } + free_ring_buffer(rx_flags_host_, rx_data_host_); + free_ring_buffer(tx_flags_host_, tx_data_host_); + + if (shutdown_flag_) + cudaFreeHost(const_cast(shutdown_flag_)); + if (d_stats_) + cudaFree(d_stats_); + if (d_function_entries_) + cudaFree(d_function_entries_); + } + + void write_rpc_request(std::size_t slot, + const std::vector& payload) { + std::uint8_t* slot_data = + const_cast(rx_data_host_) + slot * slot_size_; + auto* header = reinterpret_cast(slot_data); + header->magic = cudaq::realtime::RPC_MAGIC_REQUEST; + header->function_id = RPC_INCREMENT_FUNCTION_ID; + header->arg_len = static_cast(payload.size()); + memcpy(slot_data + sizeof(cudaq::realtime::RPCHeader), payload.data(), + payload.size()); + } + + bool read_rpc_response(std::size_t slot, + std::vector& payload, + std::int32_t* status_out = nullptr, + std::uint32_t* result_len_out = nullptr) { + __sync_synchronize(); + // Read from TX buffer (dispatch kernel writes response to symmetric TX) + const std::uint8_t* slot_data = + const_cast(tx_data_host_) + slot * slot_size_; + auto* response = + 
reinterpret_cast(slot_data); + + if (response->magic != cudaq::realtime::RPC_MAGIC_RESPONSE) + return false; + if (status_out) + *status_out = response->status; + if (result_len_out) + *result_len_out = response->result_len; + if (response->status != 0) + return false; + + payload.resize(response->result_len); + memcpy(payload.data(), + slot_data + sizeof(cudaq::realtime::RPCResponse), + response->result_len); + return true; + } + + static constexpr std::size_t num_slots_ = 2; + std::size_t slot_size_ = 256; + volatile uint64_t* rx_flags_host_ = nullptr; + volatile uint64_t* tx_flags_host_ = nullptr; + volatile uint64_t* rx_flags_ = nullptr; + volatile uint64_t* tx_flags_ = nullptr; + std::uint8_t* rx_data_host_ = nullptr; + std::uint8_t* tx_data_host_ = nullptr; + std::uint8_t* rx_data_ = nullptr; + std::uint8_t* tx_data_ = nullptr; + + volatile int* shutdown_flag_ = nullptr; + volatile int* d_shutdown_flag_ = nullptr; + uint64_t* d_stats_ = nullptr; + + cudaq_function_entry_t* d_function_entries_ = nullptr; + std::size_t func_count_ = 0; + + cudaq_dispatch_manager_t* manager_ = nullptr; + cudaq_dispatcher_t* dispatcher_ = nullptr; +}; + +TEST_F(HostApiDispatchTest, RpcIncrementHandler) { + std::vector payload = {0, 1, 2, 3}; + write_rpc_request(0, payload); + + __sync_synchronize(); + const_cast(rx_flags_host_)[0] = + reinterpret_cast(rx_data_); + + int timeout = 50; + while (tx_flags_host_[0] == 0 && timeout-- > 0) { + usleep(1000); + } + ASSERT_GT(timeout, 0) << "Timeout waiting for dispatch kernel response"; + + std::vector response; + std::int32_t status = -1; + std::uint32_t result_len = 0; + ASSERT_TRUE(read_rpc_response(0, response, &status, &result_len)); + EXPECT_EQ(status, 0); + ASSERT_EQ(result_len, payload.size()); + + std::vector expected = {1, 2, 3, 4}; + EXPECT_EQ(response, expected); +} + +//============================================================================== +// Graph Launch Test 
+//============================================================================== + +// Graph kernel that processes RPC buffer via pointer indirection +__global__ void graph_increment_kernel(void** buffer_ptr) { + if (threadIdx.x == 0 && blockIdx.x == 0) { + void* buffer = *buffer_ptr; + cudaq::realtime::RPCHeader* header = static_cast(buffer); + + std::uint32_t arg_len = header->arg_len; + void* arg_buffer = static_cast(header + 1); + std::uint8_t* data = static_cast(arg_buffer); + + // Increment each byte + for (std::uint32_t i = 0; i < arg_len; ++i) { + data[i] = data[i] + 1; + } + + // Write response + cudaq::realtime::RPCResponse* response = static_cast(buffer); + response->magic = cudaq::realtime::RPC_MAGIC_RESPONSE; + response->status = 0; + response->result_len = arg_len; + } +} + +constexpr std::uint32_t RPC_GRAPH_INCREMENT_FUNCTION_ID = + cudaq::realtime::fnv1a_hash("rpc_graph_increment"); + +__global__ void init_graph_function_table(cudaq_function_entry_t* entries, + cudaGraphExec_t graph_exec) { + if (threadIdx.x == 0 && blockIdx.x == 0) { + entries[0].handler.graph_exec = graph_exec; + entries[0].function_id = RPC_GRAPH_INCREMENT_FUNCTION_ID; + entries[0].dispatch_mode = CUDAQ_DISPATCH_GRAPH_LAUNCH; + entries[0].reserved[0] = 0; + entries[0].reserved[1] = 0; + entries[0].reserved[2] = 0; + } +} + +TEST(GraphLaunchTest, DispatchKernelGraphLaunch) { + // Check compute capability + int device; + CUDA_CHECK(cudaGetDevice(&device)); + cudaDeviceProp prop; + CUDA_CHECK(cudaGetDeviceProperties(&prop, device)); + + if (prop.major < 9) { + GTEST_SKIP() << "Graph device launch requires compute capability 9.0+, found " + << prop.major << "." 
<< prop.minor; + } + + // Allocate graph buffer pointer (for pointer indirection pattern) + void** d_graph_buffer_ptr; + CUDA_CHECK(cudaMalloc(&d_graph_buffer_ptr, sizeof(void*))); + CUDA_CHECK(cudaMemset(d_graph_buffer_ptr, 0, sizeof(void*))); + + // Allocate test buffer + constexpr size_t buffer_size = 1024; + void* d_buffer; + CUDA_CHECK(cudaMalloc(&d_buffer, buffer_size)); + + // Create the child graph (the one that will be launched from device) + cudaGraph_t child_graph; + cudaGraphExec_t child_graph_exec; + + CUDA_CHECK(cudaGraphCreate(&child_graph, 0)); + + // Add kernel node to child graph + cudaKernelNodeParams kernel_params = {}; + void* kernel_args[] = {&d_graph_buffer_ptr}; + kernel_params.func = reinterpret_cast(&graph_increment_kernel); + kernel_params.gridDim = dim3(1, 1, 1); + kernel_params.blockDim = dim3(32, 1, 1); + kernel_params.sharedMemBytes = 0; + kernel_params.kernelParams = kernel_args; + kernel_params.extra = nullptr; + + cudaGraphNode_t kernel_node; + CUDA_CHECK(cudaGraphAddKernelNode(&kernel_node, child_graph, nullptr, 0, &kernel_params)); + + // Instantiate CHILD graph with DEVICE LAUNCH FLAG + CUDA_CHECK(cudaGraphInstantiate(&child_graph_exec, child_graph, + cudaGraphInstantiateFlagDeviceLaunch)); + + // Create stream for operations + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + + // Upload the child graph to device + CUDA_CHECK(cudaGraphUpload(child_graph_exec, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // Set up function table with graph launch entry + cudaq_function_entry_t* d_function_entries; + CUDA_CHECK(cudaMalloc(&d_function_entries, sizeof(cudaq_function_entry_t))); + init_graph_function_table<<<1, 1>>>(d_function_entries, child_graph_exec); + CUDA_CHECK(cudaDeviceSynchronize()); + + // Set up RPC buffer on host + std::uint8_t* h_buffer = new std::uint8_t[buffer_size]; + cudaq::realtime::RPCHeader* h_header = reinterpret_cast(h_buffer); + h_header->magic = cudaq::realtime::RPC_MAGIC_REQUEST; 
+ h_header->function_id = RPC_GRAPH_INCREMENT_FUNCTION_ID; + h_header->arg_len = 4; + + std::uint8_t* h_data = h_buffer + sizeof(cudaq::realtime::RPCHeader); + h_data[0] = 0; + h_data[1] = 1; + h_data[2] = 2; + h_data[3] = 3; + + // Copy to device + CUDA_CHECK(cudaMemcpy(d_buffer, h_buffer, buffer_size, cudaMemcpyHostToDevice)); + + // Set up fake RX/TX flags for single-shot test + volatile uint64_t* d_rx_flags; + volatile uint64_t* d_tx_flags; + CUDA_CHECK(cudaMalloc(&d_rx_flags, sizeof(uint64_t))); + CUDA_CHECK(cudaMalloc(&d_tx_flags, sizeof(uint64_t))); + CUDA_CHECK(cudaMemset((void*)d_rx_flags, 0, sizeof(uint64_t))); + CUDA_CHECK(cudaMemset((void*)d_tx_flags, 0, sizeof(uint64_t))); + + // Set RX flag to point to our buffer (simulating incoming RPC) + uint64_t buffer_addr = reinterpret_cast(d_buffer); + CUDA_CHECK(cudaMemcpy((void*)d_rx_flags, &buffer_addr, sizeof(uint64_t), cudaMemcpyHostToDevice)); + + // Set up shutdown flag using pinned mapped memory so the dispatch kernel + // can see host updates immediately + volatile int* h_shutdown; + volatile int* d_shutdown; + { + void* tmp_shutdown; + CUDA_CHECK(cudaHostAlloc(&tmp_shutdown, sizeof(int), cudaHostAllocMapped)); + h_shutdown = static_cast(tmp_shutdown); + *h_shutdown = 0; + + void* tmp_d_shutdown; + CUDA_CHECK(cudaHostGetDevicePointer(&tmp_d_shutdown, tmp_shutdown, 0)); + d_shutdown = static_cast(tmp_d_shutdown); + } + + // Set up stats + uint64_t* d_stats; + CUDA_CHECK(cudaMalloc(&d_stats, sizeof(uint64_t))); + CUDA_CHECK(cudaMemset(d_stats, 0, sizeof(uint64_t))); + + // Create dispatch graph context - THIS WRAPS THE DISPATCH KERNEL IN A GRAPH + // so that device-side cudaGraphLaunch() can work! 
+ cudaq_dispatch_graph_context* dispatch_ctx = nullptr; + cudaError_t err = cudaq_create_dispatch_graph_regular( + d_rx_flags, d_tx_flags, + reinterpret_cast(d_buffer), // rx_data + reinterpret_cast(d_buffer), // tx_data (same buffer for single-slot test) + buffer_size, // rx_stride_sz + buffer_size, // tx_stride_sz + d_function_entries, 1, + d_graph_buffer_ptr, d_shutdown, d_stats, 1, + 1, 32, stream, &dispatch_ctx); + + if (err != cudaSuccess) { + GTEST_SKIP() << "Device-side graph launch not supported: " + << cudaGetErrorString(err) << " (" << err << ")"; + } + + // Launch dispatch graph - now device-side cudaGraphLaunch will work! + CUDA_CHECK(cudaq_launch_dispatch_graph(dispatch_ctx, stream)); + + // Poll for the response using pinned memory and async operations + // The child graph runs asynchronously (fire-and-forget) so we need to poll + std::uint8_t* h_poll_buffer; + CUDA_CHECK(cudaHostAlloc(&h_poll_buffer, sizeof(cudaq::realtime::RPCResponse), cudaHostAllocDefault)); + memset(h_poll_buffer, 0, sizeof(cudaq::realtime::RPCResponse)); + + cudaStream_t poll_stream; + CUDA_CHECK(cudaStreamCreate(&poll_stream)); + + int timeout_ms = 5000; + int poll_interval_ms = 100; + bool got_response = false; + + for (int elapsed = 0; elapsed < timeout_ms; elapsed += poll_interval_ms) { + CUDA_CHECK(cudaMemcpyAsync(h_poll_buffer, d_buffer, sizeof(cudaq::realtime::RPCResponse), + cudaMemcpyDeviceToHost, poll_stream)); + CUDA_CHECK(cudaStreamSynchronize(poll_stream)); + + cudaq::realtime::RPCResponse* peek = reinterpret_cast(h_poll_buffer); + if (peek->magic == cudaq::realtime::RPC_MAGIC_RESPONSE) { + got_response = true; + break; + } + + usleep(poll_interval_ms * 1000); + } + + // Signal shutdown to allow kernel to exit + *h_shutdown = 1; + __sync_synchronize(); + usleep(100000); // Give kernel time to see shutdown flag + + // Copy final results + CUDA_CHECK(cudaMemcpyAsync(h_buffer, d_buffer, buffer_size, cudaMemcpyDeviceToHost, poll_stream)); + 
CUDA_CHECK(cudaStreamSynchronize(poll_stream)); + + // Clean up poll resources + CUDA_CHECK(cudaStreamDestroy(poll_stream)); + cudaFreeHost(h_poll_buffer); + + // Sync main stream (dispatch kernel should have exited) + CUDA_CHECK(cudaStreamSynchronize(stream)); + + ASSERT_TRUE(got_response) << "Timeout waiting for device-side graph launch response"; + + // Verify response + cudaq::realtime::RPCResponse* h_response = reinterpret_cast(h_buffer); + EXPECT_EQ(h_response->magic, cudaq::realtime::RPC_MAGIC_RESPONSE) + << "Expected RPC_MAGIC_RESPONSE, got 0x" << std::hex << h_response->magic; + EXPECT_EQ(h_response->status, 0) << "Handler returned error status"; + EXPECT_EQ(h_response->result_len, 4u) << "Unexpected result length"; + + // Verify data was incremented by graph kernel launched from dispatch kernel + std::uint8_t* h_result = h_buffer + sizeof(cudaq::realtime::RPCResponse); + EXPECT_EQ(h_result[0], 1) << "Expected h_result[0]=1"; + EXPECT_EQ(h_result[1], 2) << "Expected h_result[1]=2"; + EXPECT_EQ(h_result[2], 3) << "Expected h_result[2]=3"; + EXPECT_EQ(h_result[3], 4) << "Expected h_result[3]=4"; + + // Cleanup + delete[] h_buffer; + CUDA_CHECK(cudaq_destroy_dispatch_graph(dispatch_ctx)); + CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaFree(d_stats)); + CUDA_CHECK(cudaFreeHost(const_cast(h_shutdown))); // Free mapped memory + CUDA_CHECK(cudaFree((void*)d_tx_flags)); + CUDA_CHECK(cudaFree((void*)d_rx_flags)); + CUDA_CHECK(cudaFree(d_function_entries)); + CUDA_CHECK(cudaGraphExecDestroy(child_graph_exec)); + CUDA_CHECK(cudaGraphDestroy(child_graph)); + CUDA_CHECK(cudaFree(d_graph_buffer_ptr)); + CUDA_CHECK(cudaFree(d_buffer)); +} + +} // namespace diff --git a/realtime/unittests/utils/CMakeLists.txt b/realtime/unittests/utils/CMakeLists.txt new file mode 100644 index 00000000000..aa8108b5dca --- /dev/null +++ b/realtime/unittests/utils/CMakeLists.txt @@ -0,0 +1,288 @@ +# ============================================================================ # +# 
Copyright (c) 2026 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. # +# ============================================================================ # + +# Hololink bridge and playback tools +# ============================================================================== +# These targets are gated by CUDAQ_REALTIME_ENABLE_HOLOLINK_TOOLS and require +# a pre-built hololink (holoscan-sensor-bridge) with DOCA support. +# They are NOT CI tests -- they need FPGA hardware or an FPGA emulator. + +if (NOT HOLOSCAN_SENSOR_BRIDGE_SOURCE_DIR) + message(FATAL_ERROR + "HOLOSCAN_SENSOR_BRIDGE_SOURCE_DIR must be set when building hololink tools.") +endif() +if (NOT HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR) + message(FATAL_ERROR + "HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR must be set when building hololink tools.") +endif() + +find_package(Threads REQUIRED) +find_package(CUDAToolkit REQUIRED) + +# --------------------------------------------------------------------------- # +# Find Hololink core library +# --------------------------------------------------------------------------- # + +find_library(HOLOLINK_CORE_LIB + NAMES hololink_core + PATHS + "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}" + "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/src/hololink/core" + "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/lib" + NO_DEFAULT_PATH) + +if (NOT HOLOLINK_CORE_LIB) + message(FATAL_ERROR + "Could not find hololink_core library under ${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}.") +endif() + +# --------------------------------------------------------------------------- # +# Find GPU RoCE Transceiver library +# --------------------------------------------------------------------------- # + +find_library(GPU_ROCE_TRANSCEIVER_LIB + NAMES gpu_roce_transceiver + PATHS + "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}" + 
"${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/src/hololink/operators/gpu_roce_transceiver" + "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/lib" + NO_DEFAULT_PATH) + +if (NOT GPU_ROCE_TRANSCEIVER_LIB) + message(WARNING + "Could not find gpu_roce_transceiver library. " + "hololink_bridge will not be built.") +endif() + +# --------------------------------------------------------------------------- # +# Find transitive Hololink libraries +# --------------------------------------------------------------------------- # + +find_library(HOLOLINK_COMMON_LIB + NAMES hololink + PATHS + "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}" + "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/src/hololink/common" + "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/lib" + NO_DEFAULT_PATH) + +find_library(ROCE_RECEIVER_LIB + NAMES roce_receiver + PATHS + "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}" + "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/src/hololink/operators/roce_receiver" + "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/lib" + NO_DEFAULT_PATH) + +find_library(BASE_RECEIVER_OP_LIB + NAMES base_receiver_op + PATHS + "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}" + "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/src/hololink/operators" + "${HOLOSCAN_SENSOR_BRIDGE_BUILD_DIR}/lib" + NO_DEFAULT_PATH) + +find_library(IBVERBS_LIB NAMES ibverbs) + +# --------------------------------------------------------------------------- # +# Find DOCA libraries +# --------------------------------------------------------------------------- # + +set(DOCA_PATH "/opt/mellanox/doca") + +if (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)") + set(DOCA_LIB_DIR "${DOCA_PATH}/lib/x86_64-linux-gnu") +elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "(aarch64)|(arm64)") + set(DOCA_LIB_DIR "${DOCA_PATH}/lib/aarch64-linux-gnu") +else() + set(DOCA_LIB_DIR "${DOCA_PATH}/lib") +endif() + +find_path(DOCA_INCLUDE_DIR doca_verbs.h + PATHS ${DOCA_PATH}/include + NO_DEFAULT_PATH) + +# RHEL may have DOCA libraries under ${DOCA_PATH}/lib64 +find_library(DOCA_VERBS_LIB doca_verbs + PATHS ${DOCA_LIB_DIR} ${DOCA_PATH}/lib/ 
 ${DOCA_PATH}/lib64/
+  NO_DEFAULT_PATH)
+
+find_library(DOCA_GPUNETIO_LIB doca_gpunetio
+  PATHS ${DOCA_LIB_DIR} ${DOCA_PATH}/lib/ ${DOCA_PATH}/lib64/
+  NO_DEFAULT_PATH)
+
+find_library(DOCA_COMMON_LIB doca_common
+  PATHS ${DOCA_LIB_DIR} ${DOCA_PATH}/lib/ ${DOCA_PATH}/lib64/
+  NO_DEFAULT_PATH)
+
+# --------------------------------------------------------------------------- #
+# Find Holoscan (required by gpu_roce_transceiver -> holoscan::core)
+# --------------------------------------------------------------------------- #
+
+find_package(holoscan QUIET)
+
+# --------------------------------------------------------------------------- #
+# Find fmt (transitive dependency of hololink logging)
+# --------------------------------------------------------------------------- #
+
+find_path(FMT_INCLUDE_DIR
+  NAMES fmt/format.h
+  PATHS /opt/nvidia/holoscan /usr/local/cudaq /usr /usr/local
+  PATH_SUFFIXES include
+  NO_DEFAULT_PATH)
+
+# =========================================================================== #
+# hololink_fpga_playback (uses hololink_core for FPGA control plane)
+# =========================================================================== #
+
+add_executable(hololink_fpga_playback
+  hololink_fpga_playback.cpp)
+
+target_include_directories(hololink_fpga_playback
+  PRIVATE
+    ${CUDAQ_REALTIME_INCLUDE_DIR}
+    "${HOLOSCAN_SENSOR_BRIDGE_SOURCE_DIR}/src"
+    "${CUDAToolkit_INCLUDE_DIRS}")
+
+if (FMT_INCLUDE_DIR)
+  target_include_directories(hololink_fpga_playback
+    PRIVATE "${FMT_INCLUDE_DIR}")
+endif()
+
+target_link_libraries(hololink_fpga_playback
+  PRIVATE ${HOLOLINK_CORE_LIB} CUDA::cuda_driver Threads::Threads)
+
+# Install this so it's available for testing the installer
+install(TARGETS hololink_fpga_playback
+  DESTINATION ${CMAKE_INSTALL_BINDIR}
+  COMPONENT hololink-tools)
+
+# =========================================================================== #
+# hololink_bridge (generic increment bridge)
+# 
=========================================================================== # + +if (GPU_ROCE_TRANSCEIVER_LIB AND + DOCA_INCLUDE_DIR AND DOCA_VERBS_LIB AND DOCA_COMMON_LIB AND + DOCA_GPUNETIO_LIB) + + message(STATUS "Building hololink_bridge (generic increment)") + message(STATUS " GPU RoCE Transceiver: ${GPU_ROCE_TRANSCEIVER_LIB}") + + # Hololink wrapper static library (compiled by g++, isolates fmt) + add_library(hololink_wrapper_generic STATIC + ${CMAKE_SOURCE_DIR}/lib/daemon/bridge/hololink/hololink_wrapper.cpp) + + target_include_directories(hololink_wrapper_generic + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + ${CUDAQ_REALTIME_INCLUDE_DIR} + "${HOLOSCAN_SENSOR_BRIDGE_SOURCE_DIR}/src" + ${DOCA_INCLUDE_DIR} + ${CUDAToolkit_INCLUDE_DIRS} + ${FMT_INCLUDE_DIR}) + + target_link_libraries(hololink_wrapper_generic + PRIVATE ${GPU_ROCE_TRANSCEIVER_LIB}) + + target_compile_options(hololink_wrapper_generic PRIVATE -Wno-deprecated-declarations) + + # Increment function table (compiled by nvcc) + add_library(rpc_increment_ft STATIC + init_rpc_increment_function_table.cu) + + set_target_properties(rpc_increment_ft PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + CUDA_STANDARD 17) + + target_include_directories(rpc_increment_ft PRIVATE + ${CUDAQ_REALTIME_INCLUDE_DIR} + ${CUDAToolkit_INCLUDE_DIRS}) + + # Bridge executable (.cpp, linked with CUDA) + add_executable(hololink_bridge + hololink_bridge.cpp) + + set_target_properties(hololink_bridge PROPERTIES + LINKER_LANGUAGE CUDA + CUDA_SEPARABLE_COMPILATION ON + CUDA_RESOLVE_DEVICE_SYMBOLS ON) + + target_include_directories(hololink_bridge + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + ${CUDAQ_REALTIME_INCLUDE_DIR} + ${CUDAToolkit_INCLUDE_DIRS}) + + # Link order: static archives first, then shared + target_link_libraries(hololink_bridge + PRIVATE + rpc_increment_ft + cudaq-realtime-dispatch + cudaq-realtime-bridge-hololink + hololink_wrapper_generic + ${GPU_ROCE_TRANSCEIVER_LIB} + ${ROCE_RECEIVER_LIB} + ${BASE_RECEIVER_OP_LIB} + 
${HOLOLINK_CORE_LIB} + ${HOLOLINK_COMMON_LIB} + cudaq-realtime + CUDA::cudart + CUDA::cuda_driver + ${DOCA_VERBS_LIB} + ${DOCA_GPUNETIO_LIB} + ${DOCA_COMMON_LIB} + ${IBVERBS_LIB} + Threads::Threads + ${CMAKE_DL_LIBS}) + + if (holoscan_FOUND) + target_link_libraries(hololink_bridge PRIVATE holoscan::core) + target_link_libraries(hololink_wrapper_generic PRIVATE holoscan::core) + endif() + + # Set RPATH for shared libraries + set_target_properties(hololink_bridge PROPERTIES + BUILD_RPATH "${DOCA_LIB_DIR}" + INSTALL_RPATH "${DOCA_LIB_DIR}") + + # Install the bridge executable + install(TARGETS hololink_bridge + DESTINATION ${CMAKE_INSTALL_BINDIR} + COMPONENT hololink-tools) +else() + if (NOT GPU_ROCE_TRANSCEIVER_LIB) + message(WARNING "gpu_roce_transceiver library not found. " + "hololink_bridge will not be built.") + endif() + if (NOT DOCA_INCLUDE_DIR OR NOT DOCA_VERBS_LIB) + message(WARNING "DOCA libraries not found. " + "hololink_bridge requires DOCA.") + endif() +endif() + +# =========================================================================== # +# hololink_fpga_emulator (software FPGA, libibverbs only) +# =========================================================================== # + +if (IBVERBS_LIB) + message(STATUS "Building hololink_fpga_emulator") + + add_executable(hololink_fpga_emulator + hololink_fpga_emulator.cpp) + + target_link_libraries(hololink_fpga_emulator + PRIVATE + ${IBVERBS_LIB} + Threads::Threads) + # install the emulator binary for testing the installer + install(TARGETS hololink_fpga_emulator + DESTINATION ${CMAKE_INSTALL_BINDIR} + COMPONENT hololink-tools) +else() + message(WARNING "libibverbs not found. 
hololink_fpga_emulator will not be built.") +endif() diff --git a/realtime/unittests/utils/hololink_bridge.cpp b/realtime/unittests/utils/hololink_bridge.cpp new file mode 100644 index 00000000000..98c98a23b8e --- /dev/null +++ b/realtime/unittests/utils/hololink_bridge.cpp @@ -0,0 +1,126 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +/// @file hololink_bridge.cpp +/// @brief Generic Hololink bridge tool for testing libcudaq-realtime dispatch. +/// +/// Registers a simple increment RPC handler (adds 1 to each byte) and wires +/// it through the Hololink GPU-RoCE Transceiver. No QEC or decoder dependency. +/// +/// Usage: +/// ./hololink_bridge \ +/// --device=rocep1s0f0 \ +/// --peer-ip=10.0.0.2 \ +/// --remote-qp=0x2 \ +/// --gpu=0 \ +/// --timeout=60 + +#include +#include +#include +#include + +#include + +#include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" +#include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h" +#include "cudaq/realtime/hololink_bridge_common.h" + +//============================================================================== +// Increment RPC Handler Function Table +//============================================================================== + +// The actual __device__ rpc_increment_handler lives in +// init_rpc_increment_function_table.cu (compiled by nvcc). We declare the +// host-callable setup function here so this .cpp can be compiled by g++. 
+ +extern "C" void +setup_rpc_increment_function_table(cudaq_function_entry_t *d_entries); + +//============================================================================== +// Main +//============================================================================== + +int main(int argc, char *argv[]) { + // Check for help + for (int i = 1; i < argc; i++) { + std::string arg = argv[i]; + if (arg == "--help" || arg == "-h") { + std::cout + << "Usage: " << argv[0] << " [options]\n" + << "\n" + << "Generic Hololink bridge for testing libcudaq-realtime dispatch.\n" + << "Registers increment handler (adds 1 to each byte of the RPC " + "payload).\n" + << "\n" + << "Options:\n" + << " --device=NAME IB device (default: rocep1s0f0)\n" + << " --peer-ip=ADDR FPGA/emulator IP (default: 10.0.0.2)\n" + << " --remote-qp=N Remote QP number (default: 0x2)\n" + << " --gpu=N GPU device ID (default: 0)\n" + << " --timeout=N Timeout in seconds (default: 60)\n" + << " --payload-size=N RPC payload size in bytes (default: 8)\n" + << " --page-size=N Ring buffer slot size (default: 384)\n" + << " --num-pages=N Number of ring buffer slots (default: " + "64)\n" + << " --exchange-qp Enable QP exchange protocol\n" + << " --exchange-port=N TCP port for QP exchange (default: " + "12345)\n" + << " --forward Use Hololink forward kernel (echo) " + "instead of dispatch\n" + << " --unified Use unified dispatch kernel (RX + " + "dispatch + TX in one kernel)\n"; + return 0; + } + } + + try { + std::cout << "=== Hololink Generic Bridge ===" << std::endl; + + // Parse common bridge args + cudaq::realtime::BridgeConfig config; + cudaq::realtime::parse_bridge_args(argc, argv, config); + + std::cout << "Device: " << config.device << std::endl; + std::cout << "Peer IP: " << config.peer_ip << std::endl; + std::cout << "Remote QP: 0x" << std::hex << config.remote_qp << std::dec + << std::endl; + std::cout << "GPU: " << config.gpu_id << std::endl; + + // Initialize CUDA early to allocate function table + 
cudaError_t err = cudaSetDevice(config.gpu_id); + if (err != cudaSuccess) { + std::cerr << "ERROR: cudaSetDevice failed: " << cudaGetErrorString(err) + << std::endl; + return 1; + } + + // Set up increment RPC function table on GPU + cudaq_function_entry_t *d_function_entries = nullptr; + err = cudaMalloc(&d_function_entries, sizeof(cudaq_function_entry_t)); + if (err != cudaSuccess) { + std::cerr << "ERROR: cudaMalloc failed: " << cudaGetErrorString(err) + << std::endl; + return 1; + } + setup_rpc_increment_function_table(d_function_entries); + + config.d_function_entries = d_function_entries; + config.func_count = 1; + config.launch_fn = &cudaq::realtime::bridge_launch_dispatch_kernel; + config.cleanup_fn = [d_function_entries]() { + cudaFree(d_function_entries); + }; + + return cudaq::realtime::bridge_run(config); + + } catch (const std::exception &e) { + std::cerr << "ERROR: " << e.what() << std::endl; + return 1; + } +} diff --git a/realtime/unittests/utils/hololink_fpga_emulator.cpp b/realtime/unittests/utils/hololink_fpga_emulator.cpp new file mode 100644 index 00000000000..1461991d980 --- /dev/null +++ b/realtime/unittests/utils/hololink_fpga_emulator.cpp @@ -0,0 +1,1218 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +/// @file hololink_fpga_emulator.cpp +/// @brief Software FPGA emulator for Hololink RPC testing. +/// +/// Emulates the FPGA's role in the RPC pipeline: +/// 1. Hololink UDP control plane server (register read/write) +/// 2. Playback BRAM (receives payloads from playback tool) +/// 3. RDMA transmit (sends RPC requests to bridge) +/// 4. 
RDMA receive (receives RPC responses from bridge) +/// 5. ILA capture RAM (stores responses for verification readback) +/// +/// Three-tool workflow: +/// 1. Start this emulator (prints QP number) +/// 2. Start hololink_mock_decoder_bridge with --remote-qp= +/// 3. Start hololink_fpga_syndrome_playback --control-port= +/// with bridge's QP/RKEY/buffer-addr +/// +/// The playback tool drives the emulator via UDP just as it would a real FPGA. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +//============================================================================== +// Global shutdown flag +//============================================================================== + +static std::atomic g_shutdown{false}; +static void signal_handler(int) { g_shutdown = true; } + +//============================================================================== +// Hololink Protocol Constants +//============================================================================== + +static constexpr uint8_t WR_DWORD = 0x04; +static constexpr uint8_t WR_BLOCK = 0x09; +static constexpr uint8_t RD_DWORD = 0x14; +static constexpr uint8_t RD_BLOCK = 0x19; + +static constexpr uint8_t REQUEST_FLAGS_ACK_REQUEST = 0x01; +static constexpr uint8_t RESPONSE_SUCCESS = 0x00; + +// VP register offsets (relative to vp_address) +static constexpr uint32_t DP_QP = 0x00; +static constexpr uint32_t DP_RKEY = 0x04; +static constexpr uint32_t DP_PAGE_LSB = 0x08; +static constexpr uint32_t DP_PAGE_MSB = 0x0C; +static constexpr uint32_t DP_PAGE_INC = 0x10; +static constexpr uint32_t DP_MAX_BUFF = 0x14; +static constexpr uint32_t DP_BUFFER_LENGTH = 0x18; + +// HIF register offsets (relative to hif_address) +static constexpr uint32_t DP_VP_MASK = 0x0C; + +// Player registers +static constexpr uint32_t PLAYER_BASE = 0x50000000; +static constexpr uint32_t 
PLAYER_ENABLE = PLAYER_BASE + 0x04; +static constexpr uint32_t PLAYER_TIMER = PLAYER_BASE + 0x08; +static constexpr uint32_t PLAYER_WIN_SIZE = PLAYER_BASE + 0x0C; +static constexpr uint32_t PLAYER_WIN_NUM = PLAYER_BASE + 0x10; + +// Playback BRAM +static constexpr uint32_t RAM_BASE = 0x50100000; +static constexpr int BRAM_NUM_BANKS = 16; +static constexpr int BRAM_W_SAMPLE_ADDR = 9; // log2(512 entries) +static constexpr int BRAM_BANK_STRIDE = 1 << (BRAM_W_SAMPLE_ADDR + 2); // 2048 + +// ILA capture +static constexpr uint32_t ILA_BASE = 0x40000000; +static constexpr uint32_t ILA_CTRL = ILA_BASE + 0x00; +static constexpr uint32_t ILA_STATUS = ILA_BASE + 0x80; +static constexpr uint32_t ILA_SAMPLE_ADDR = ILA_BASE + 0x84; +static constexpr uint32_t ILA_DATA_BASE = 0x40100000; +static constexpr int ILA_NUM_BANKS = 19; +static constexpr int ILA_W_ADDR = 13; // log2(8192 entries) +static constexpr int ILA_BANK_STRIDE = 1 << (ILA_W_ADDR + 2); // 32768 + +// Ring buffer +static constexpr int NUM_BUFFERS = 64; + +//============================================================================== +// RDMA Context (adapted from cuda-qx rdma_utils.hpp) +//============================================================================== + +class RdmaContext { +public: + ~RdmaContext() { cleanup(); } + + bool open(const std::string &device_name, int port = 1) { + int num_devices; + ibv_device **devices = ibv_get_device_list(&num_devices); + if (!devices || num_devices == 0) + return false; + + ibv_device *target = nullptr; + for (int i = 0; i < num_devices; i++) { + if (device_name == ibv_get_device_name(devices[i])) { + target = devices[i]; + break; + } + } + if (!target) { + ibv_free_device_list(devices); + return false; + } + + ctx_ = ibv_open_device(target); + ibv_free_device_list(devices); + if (!ctx_) + return false; + + port_ = port; + pd_ = ibv_alloc_pd(ctx_); + if (!pd_) { + cleanup(); + return false; + } + + if (ibv_query_port(ctx_, port_, &port_attr_) != 0) { + cleanup(); + 
return false; + } + + gid_index_ = find_roce_v2_gid_index(); + return true; + } + + ibv_cq *create_cq(int size) { + return ibv_create_cq(ctx_, size, nullptr, nullptr, 0); + } + + ibv_mr *register_memory(void *addr, size_t size, + int access = IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_WRITE) { + return ibv_reg_mr(pd_, addr, size, access); + } + + ibv_qp *create_qp(ibv_cq *send_cq, ibv_cq *recv_cq, uint32_t max_send_wr = 64, + uint32_t max_recv_wr = 64) { + ibv_qp_init_attr init_attr{}; + init_attr.qp_type = IBV_QPT_UC; // Unreliable Connected - matches FPGA + init_attr.send_cq = send_cq; + init_attr.recv_cq = recv_cq; + init_attr.cap.max_send_wr = max_send_wr; + init_attr.cap.max_recv_wr = max_recv_wr; + init_attr.cap.max_send_sge = 1; + init_attr.cap.max_recv_sge = 1; + return ibv_create_qp(pd_, &init_attr); + } + + bool qp_to_init(ibv_qp *qp) { + ibv_qp_attr attr{}; + attr.qp_state = IBV_QPS_INIT; + attr.port_num = port_; + attr.pkey_index = 0; + attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE; + return ibv_modify_qp(qp, &attr, + IBV_QP_STATE | IBV_QP_PORT | IBV_QP_PKEY_INDEX | + IBV_QP_ACCESS_FLAGS) == 0; + } + + bool qp_to_rtr(ibv_qp *qp, const ibv_gid &remote_gid, uint32_t remote_qp_num, + uint32_t psn = 0) { + ibv_qp_attr attr{}; + attr.qp_state = IBV_QPS_RTR; + attr.path_mtu = port_attr_.active_mtu; + attr.dest_qp_num = remote_qp_num; + attr.rq_psn = psn; + attr.ah_attr.is_global = 1; + attr.ah_attr.grh.dgid = remote_gid; + attr.ah_attr.grh.sgid_index = gid_index_; + attr.ah_attr.grh.hop_limit = 64; + attr.ah_attr.grh.traffic_class = 0; + attr.ah_attr.dlid = 0; + attr.ah_attr.sl = 0; + attr.ah_attr.src_path_bits = 0; + attr.ah_attr.port_num = port_; + return ibv_modify_qp(qp, &attr, + IBV_QP_STATE | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | + IBV_QP_RQ_PSN | IBV_QP_AV) == 0; + } + + bool qp_to_rts(ibv_qp *qp, uint32_t psn = 0) { + ibv_qp_attr attr{}; + attr.qp_state = IBV_QPS_RTS; + attr.sq_psn = psn; + return ibv_modify_qp(qp, &attr, 
IBV_QP_STATE | IBV_QP_SQ_PSN) == 0; + } + + bool post_recv(ibv_qp *qp, uint64_t wr_id, void *addr, uint32_t length, + uint32_t lkey) { + ibv_sge sge{}; + sge.addr = reinterpret_cast(addr); + sge.length = length; + sge.lkey = lkey; + + ibv_recv_wr wr{}; + wr.wr_id = wr_id; + wr.sg_list = &sge; + wr.num_sge = 1; + wr.next = nullptr; + + ibv_recv_wr *bad_wr = nullptr; + return ibv_post_recv(qp, &wr, &bad_wr) == 0; + } + + bool post_rdma_write_imm(ibv_qp *qp, uint64_t wr_id, void *local_addr, + uint32_t length, uint32_t lkey, uint64_t remote_addr, + uint32_t rkey, uint32_t imm_data) { + ibv_sge sge{}; + sge.addr = reinterpret_cast(local_addr); + sge.length = length; + sge.lkey = lkey; + + ibv_send_wr wr{}; + wr.wr_id = wr_id; + wr.sg_list = &sge; + wr.num_sge = 1; + wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM; + wr.send_flags = IBV_SEND_SIGNALED; + wr.imm_data = htonl(imm_data); + wr.wr.rdma.remote_addr = remote_addr; + wr.wr.rdma.rkey = rkey; + wr.next = nullptr; + + ibv_send_wr *bad_wr = nullptr; + return ibv_post_send(qp, &wr, &bad_wr) == 0; + } + + int poll_cq(ibv_cq *cq, ibv_wc *wc, int max_wc = 1) { + return ibv_poll_cq(cq, max_wc, wc); + } + + int get_gid_index() const { return gid_index_; } + +private: + void cleanup() { + if (pd_) { + ibv_dealloc_pd(pd_); + pd_ = nullptr; + } + if (ctx_) { + ibv_close_device(ctx_); + ctx_ = nullptr; + } + } + + int find_roce_v2_gid_index() { + int best_gid = -1; + for (int i = 0; i < port_attr_.gid_tbl_len; i++) { + ibv_gid gid; + if (ibv_query_gid(ctx_, port_, i, &gid) == 0) { + if (gid.raw[10] == 0xff && gid.raw[11] == 0xff) { + best_gid = i; // Last match = RoCE v2 + } + } + } + return (best_gid >= 0) ? 
best_gid : 0; + } + + ibv_context *ctx_ = nullptr; + ibv_pd *pd_ = nullptr; + ibv_port_attr port_attr_{}; + int port_ = 1; + int gid_index_ = 0; +}; + +//============================================================================== +// RDMA Buffer +//============================================================================== + +class RdmaBuffer { +public: + ~RdmaBuffer() { release(); } + + bool allocate(RdmaContext &ctx, size_t size) { + size_t page_size = 4096; + size_t aligned = ((size + page_size - 1) / page_size) * page_size; + data_ = aligned_alloc(page_size, aligned); + if (!data_) + return false; + size_ = size; + memset(data_, 0, aligned); + mr_ = ctx.register_memory(data_, aligned, + IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE); + if (!mr_) { + ::free(data_); + data_ = nullptr; + return false; + } + return true; + } + + void release() { + if (mr_) { + ibv_dereg_mr(mr_); + mr_ = nullptr; + } + if (data_) { + ::free(data_); + data_ = nullptr; + } + } + + void *data() const { return data_; } + size_t size() const { return size_; } + uint32_t lkey() const { return mr_ ? mr_->lkey : 0; } + uint32_t rkey() const { return mr_ ? mr_->rkey : 0; } + +private: + void *data_ = nullptr; + size_t size_ = 0; + ibv_mr *mr_ = nullptr; +}; + +//============================================================================== +// Emulated Register File +//============================================================================== + +class RegisterFile { +public: + void write(uint32_t addr, uint32_t value) { + std::lock_guard lock(mu_); + regs_[addr] = value; + } + + uint32_t read(uint32_t addr) const { + std::lock_guard lock(mu_); + auto it = regs_.find(addr); + return (it != regs_.end()) ? it->second : 0; + } + + /// Batch write (for BRAM loading efficiency). + void write_batch(const std::vector> &writes) { + std::lock_guard lock(mu_); + for (auto &[addr, val] : writes) { + regs_[addr] = val; + } + } + + /// Read a range of contiguous 32-bit registers. 
+ std::vector read_range(uint32_t base_addr, uint32_t count) const { + std::lock_guard lock(mu_); + std::vector result(count); + for (uint32_t i = 0; i < count; i++) { + auto it = regs_.find(base_addr + i * 4); + result[i] = (it != regs_.end()) ? it->second : 0; + } + return result; + } + +private: + mutable std::mutex mu_; + std::unordered_map regs_; +}; + +//============================================================================== +// RDMA Target Config (decoded from VP register writes) +//============================================================================== + +struct RdmaTargetConfig { + uint32_t qp_number = 0; + uint32_t rkey = 0; + uint64_t buffer_addr = 0; + uint32_t page_inc = 0; // bytes + uint32_t max_buff = 0; // max buffer index + uint32_t buffer_length = 0; + + // Temporary storage for two-part address + uint32_t page_lsb = 0; + uint32_t page_msb = 0; + + // Track whether key fields were explicitly set (buffer_addr=0 is valid + // when Hololink uses IOVA with dmabuf). + bool qp_set = false; + bool rkey_set = false; + + void update_addr() { + // Hololink encodes: PAGE_LSB = addr >> 7, PAGE_MSB = addr >> 32 + // Reconstruct: addr = (MSB << 32) | (LSB << 7) + buffer_addr = (static_cast(page_msb) << 32) | + (static_cast(page_lsb) << 7); + } + + bool is_complete() const { + // buffer_addr=0 is valid (Hololink IOVA/dmabuf), so we only check + // that QP and RKEY were explicitly set. 
+ return qp_set && rkey_set; + } + + void print() const { + std::cout << " RDMA Target Config:" << std::endl; + std::cout << " QP: 0x" << std::hex << qp_number << std::dec << std::endl; + std::cout << " RKEY: 0x" << std::hex << rkey << std::dec << std::endl; + std::cout << " Buffer addr: 0x" << std::hex << buffer_addr << std::dec + << std::endl; + std::cout << " Page inc: " << page_inc << " bytes" << std::endl; + std::cout << " Max buff: " << max_buff << std::endl; + } +}; + +//============================================================================== +// UDP Control Plane Server +//============================================================================== + +class ControlPlaneServer { +public: + ControlPlaneServer(uint16_t port, uint32_t vp_address, uint32_t hif_address, + RegisterFile ®s) + : port_(port), vp_addr_(vp_address), hif_addr_(hif_address), regs_(regs) { + } + + ~ControlPlaneServer() { stop(); } + + void set_my_qp(uint32_t qp) { my_qp_ = qp; } + + bool start() { + fd_ = socket(AF_INET, SOCK_DGRAM, 0); + if (fd_ < 0) + return false; + + int opt = 1; + setsockopt(fd_, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)); + + sockaddr_in addr{}; + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = INADDR_ANY; + addr.sin_port = htons(port_); + if (bind(fd_, reinterpret_cast(&addr), sizeof(addr)) < 0) { + ::close(fd_); + fd_ = -1; + return false; + } + + running_ = true; + thread_ = std::thread(&ControlPlaneServer::run, this); + return true; + } + + void stop() { + running_ = false; + if (fd_ >= 0) { + shutdown(fd_, SHUT_RDWR); + ::close(fd_); + fd_ = -1; + } + if (thread_.joinable()) + thread_.join(); + } + + /// Block until RDMA config is complete or timeout. 
+ bool wait_for_config(int timeout_ms = 60000) { + auto start = std::chrono::steady_clock::now(); + while (!target_.is_complete() && !g_shutdown) { + auto elapsed = std::chrono::duration_cast( + std::chrono::steady_clock::now() - start) + .count(); + if (elapsed >= timeout_ms) + return false; + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } + return target_.is_complete(); + } + + const RdmaTargetConfig &target() const { return target_; } + + /// Check if player_enable was set to 1. + bool playback_triggered() const { return playback_triggered_.load(); } + void clear_playback_trigger() { playback_triggered_ = false; } + + /// Get player config. + uint32_t window_size() const { return regs_.read(PLAYER_WIN_SIZE); } + uint32_t window_number() const { return regs_.read(PLAYER_WIN_NUM); } + uint32_t timer_spacing() const { return regs_.read(PLAYER_TIMER); } + +private: + void run() { + std::vector buf(4096); + while (running_ && !g_shutdown) { + fd_set fds; + FD_ZERO(&fds); + FD_SET(fd_, &fds); + timeval tv{0, 100000}; // 100ms + + int ready = select(fd_ + 1, &fds, nullptr, nullptr, &tv); + if (ready <= 0) + continue; + + sockaddr_in client{}; + socklen_t clen = sizeof(client); + ssize_t len = recvfrom(fd_, buf.data(), buf.size(), 0, + reinterpret_cast(&client), &clen); + if (len < 6) + continue; + + handle_packet(buf.data(), static_cast(len), client); + } + } + + // --- Packet helpers --- + + static uint32_t read_be32(const uint8_t *p) { + return (uint32_t(p[0]) << 24) | (uint32_t(p[1]) << 16) | + (uint32_t(p[2]) << 8) | p[3]; + } + + static uint16_t read_be16(const uint8_t *p) { + return (uint16_t(p[0]) << 8) | p[1]; + } + + static void write_be32(uint8_t *p, uint32_t v) { + p[0] = (v >> 24) & 0xFF; + p[1] = (v >> 16) & 0xFF; + p[2] = (v >> 8) & 0xFF; + p[3] = v & 0xFF; + } + + static void write_be16(uint8_t *p, uint16_t v) { + p[0] = (v >> 8) & 0xFF; + p[1] = v & 0xFF; + } + + // --- Handle incoming packet --- + + void handle_packet(const uint8_t 
*data, size_t len, + const sockaddr_in &client) { + uint8_t opcode = data[0]; + uint8_t flags = data[1]; + uint16_t seq = read_be16(data + 2); + + switch (opcode) { + case WR_DWORD: + if (len >= 14) + handle_wr_dword(data, flags, seq, client); + break; + case WR_BLOCK: + handle_wr_block(data, len, flags, seq, client); + break; + case RD_DWORD: + if (len >= 10) + handle_rd_dword(data, flags, seq, client); + break; + case RD_BLOCK: + handle_rd_block(data, len, flags, seq, client); + break; + default: + // Unknown opcode - send error ACK + if (flags & REQUEST_FLAGS_ACK_REQUEST) + send_write_ack(client, opcode, flags, seq); + break; + } + } + + void handle_wr_dword(const uint8_t *data, uint8_t flags, uint16_t seq, + const sockaddr_in &client) { + uint32_t addr = read_be32(data + 6); + uint32_t val = read_be32(data + 10); + process_register_write(addr, val); + if (flags & REQUEST_FLAGS_ACK_REQUEST) + send_write_ack(client, WR_DWORD, flags, seq); + } + + void handle_wr_block(const uint8_t *data, size_t len, uint8_t flags, + uint16_t seq, const sockaddr_in &client) { + // Pairs start at offset 6, each pair is 8 bytes + size_t offset = 6; + std::vector> batch; + while (offset + 8 <= len) { + uint32_t addr = read_be32(data + offset); + uint32_t val = read_be32(data + offset + 4); + batch.push_back({addr, val}); + offset += 8; + } + + // Batch write to register file + regs_.write_batch(batch); + + // Process VP register updates + for (auto &[addr, val] : batch) { + process_vp_update(addr, val); + check_player_enable(addr, val); + } + + if (flags & REQUEST_FLAGS_ACK_REQUEST) + send_write_ack(client, WR_BLOCK, flags, seq); + } + + void handle_rd_dword(const uint8_t *data, uint8_t flags, uint16_t seq, + const sockaddr_in &client) { + uint32_t addr = read_be32(data + 6); + uint32_t val = regs_.read(addr); + + // Response: cmd(1) + flags(1) + seq(2) + response_code(1) + reserved(1) + + // addr(4) + value(4) + latched_seq(2) = 16 bytes + uint8_t resp[16]; + resp[0] = RD_DWORD; + 
resp[1] = flags; + write_be16(resp + 2, seq); + resp[4] = RESPONSE_SUCCESS; + resp[5] = 0; // reserved + write_be32(resp + 6, addr); + write_be32(resp + 10, val); + write_be16(resp + 14, seq); // latched sequence + + sendto(fd_, resp, sizeof(resp), 0, + reinterpret_cast(&client), sizeof(client)); + } + + void handle_rd_block(const uint8_t *data, size_t len, uint8_t flags, + uint16_t seq, const sockaddr_in &client) { + // Parse addresses from request + std::vector addrs; + size_t offset = 6; + while (offset + 8 <= len) { + addrs.push_back(read_be32(data + offset)); + offset += 8; + } + + // Build response: cmd(1) + flags(1) + seq(2) + rc(1) + reserved(1) + + // N*(addr(4)+value(4)) + latched_seq(2) + size_t resp_len = 6 + addrs.size() * 8 + 2; + std::vector resp(resp_len); + resp[0] = RD_BLOCK; + resp[1] = flags; + write_be16(resp.data() + 2, seq); + resp[4] = RESPONSE_SUCCESS; + resp[5] = 0; + + size_t roff = 6; + for (uint32_t a : addrs) { + uint32_t val = regs_.read(a); + write_be32(resp.data() + roff, a); + write_be32(resp.data() + roff + 4, val); + roff += 8; + } + write_be16(resp.data() + roff, seq); // latched sequence + + sendto(fd_, resp.data(), resp.size(), 0, + reinterpret_cast(&client), sizeof(client)); + } + + // --- Write ACK for WR_DWORD / WR_BLOCK --- + + void send_write_ack(const sockaddr_in &client, uint8_t cmd, uint8_t flags, + uint16_t seq) { + uint8_t resp[5]; + resp[0] = cmd; + resp[1] = flags; + write_be16(resp + 2, seq); + resp[4] = RESPONSE_SUCCESS; + sendto(fd_, resp, sizeof(resp), 0, + reinterpret_cast(&client), sizeof(client)); + } + + // --- Register write processing --- + + void process_register_write(uint32_t addr, uint32_t val) { + regs_.write(addr, val); + process_vp_update(addr, val); + check_player_enable(addr, val); + } + + void process_vp_update(uint32_t addr, uint32_t val) { + // Check if this is a VP register (relative to vp_addr_) + if (addr < vp_addr_ || addr >= vp_addr_ + 0x100) + return; + + uint32_t offset = addr - 
vp_addr_; + switch (offset) { + case DP_QP: + target_.qp_number = val; + target_.qp_set = true; + break; + case DP_RKEY: + target_.rkey = val; + target_.rkey_set = true; + break; + case DP_PAGE_LSB: + target_.page_lsb = val; + target_.update_addr(); + break; + case DP_PAGE_MSB: + target_.page_msb = val; + target_.update_addr(); + break; + case DP_PAGE_INC: + target_.page_inc = val << 7; // PAGES encoding: value * 128 + break; + case DP_MAX_BUFF: + target_.max_buff = val; + break; + case DP_BUFFER_LENGTH: + target_.buffer_length = val; + break; + } + } + + void check_player_enable(uint32_t addr, uint32_t val) { + if (addr == PLAYER_ENABLE && (val & 1)) { + playback_triggered_ = true; + } + } + + uint16_t port_; + uint32_t vp_addr_; + uint32_t hif_addr_; + RegisterFile ®s_; + int fd_ = -1; + std::atomic running_{false}; + std::thread thread_; + uint32_t my_qp_ = 0; + RdmaTargetConfig target_; + std::atomic playback_triggered_{false}; +}; + +//============================================================================== +// BRAM Reassembly +//============================================================================== + +/// Reassemble one window from the 16-bank BRAM layout. +/// Each 64-byte beat is spread across 16 banks (4 bytes each). 
+/// @param regs Register file to read from +/// @param window_index Window number +/// @param cycles_per_window Number of 64-byte beats per window +/// @return Reassembled window payload +static std::vector reassemble_window(const RegisterFile ®s, + uint32_t window_index, + uint32_t cycles_per_window) { + std::vector payload(cycles_per_window * 64, 0); + for (uint32_t cycle = 0; cycle < cycles_per_window; cycle++) { + uint32_t sample_index = window_index * cycles_per_window + cycle; + for (int bank = 0; bank < BRAM_NUM_BANKS; bank++) { + uint32_t addr = + RAM_BASE + (bank << (BRAM_W_SAMPLE_ADDR + 2)) + (sample_index * 4); + uint32_t val = regs.read(addr); + // Store as little-endian (matching FPGA BRAM word order) + size_t byte_offset = cycle * 64 + bank * 4; + memcpy(&payload[byte_offset], &val, 4); + } + } + return payload; +} + +//============================================================================== +// ILA Capture Storage +//============================================================================== + +/// Store a correction response into the ILA capture register file. 
+/// The ILA stores each sample across 19 banks of 32-bit words (585 bits): +/// Banks 0-15 = 512-bit AXI data bus (raw correction bytes) +/// Bank 16 = control signals: +/// bit 0 = tvalid (bit 512), bit 1 = tlast (bit 513), +/// bits [8:2] = wr_tcnt (bits 520:514, 7-bit write transaction count) +/// Banks 17-18 = PTP timestamp at bits [584:521] (zero in emulator) +static void store_ila_sample(RegisterFile ®s, uint32_t sample_index, + const uint8_t *data, size_t data_len) { + // Banks 0-15: 512-bit AXI data bus + for (int bank = 0; bank < 16; bank++) { + uint32_t addr = + ILA_DATA_BASE + (bank << (ILA_W_ADDR + 2)) + (sample_index * 4); + uint32_t val = 0; + size_t byte_offset = bank * 4; + if (byte_offset < data_len) { + size_t copy_len = std::min(4, data_len - byte_offset); + memcpy(&val, data + byte_offset, copy_len); + } + regs.write(addr, val); + } + + // Bank 16: control signals (tvalid=1, tlast=1, wr_tcnt=1) + { + uint32_t ctrl_addr = + ILA_DATA_BASE + (16 << (ILA_W_ADDR + 2)) + (sample_index * 4); + uint32_t ctrl_val = 0; + ctrl_val |= (1u << 0); // tvalid (bit 512) + ctrl_val |= (1u << 1); // tlast (bit 513) + ctrl_val |= (1u << 2); // wr_tcnt = 1 (bits 514+, value 1 in 7-bit field) + regs.write(ctrl_addr, ctrl_val); + } + + // Banks 17-18: PTP timestamp placeholder (no PTP hardware in emulator) + for (int bank = 17; bank < ILA_NUM_BANKS; bank++) { + uint32_t addr = + ILA_DATA_BASE + (bank << (ILA_W_ADDR + 2)) + (sample_index * 4); + regs.write(addr, 0); + } + + // Update sample count + regs.write(ILA_SAMPLE_ADDR, sample_index + 1); +} + +//============================================================================== +// Command-Line Arguments +//============================================================================== + +struct EmulatorArgs { + std::string device = "rocep1s0f0"; + int ib_port = 1; + uint16_t control_port = 8193; + std::string bridge_ip = ""; // Bridge IP (for GID, auto-detect if empty) + uint32_t vp_address = 0x1000; + uint32_t 
hif_address = 0x0800; + size_t page_size = 256; // Default slot size for responses RX +}; + +static void print_usage(const char *prog) { + std::cout + << "Usage: " << prog << " [options]\n" + << "\nFPGA emulator for QEC decode loop testing.\n" + << "\nOptions:\n" + << " --device=NAME IB device name (default: rocep1s0f0)\n" + << " --ib-port=N IB port number (default: 1)\n" + << " --port=N UDP control plane port (default: 8193)\n" + << " --bridge-ip=ADDR Bridge tool IP for GID (default: auto)\n" + << " --vp-address=ADDR VP register base (default: 0x1000)\n" + << " --hif-address=ADDR HIF register base (default: 0x0800)\n" + << " --page-size=N Slot size for correction RX (default: 256)\n" + << " --help Show this help\n"; +} + +static EmulatorArgs parse_args(int argc, char *argv[]) { + EmulatorArgs args; + for (int i = 1; i < argc; i++) { + std::string arg = argv[i]; + if (arg.find("--device=") == 0) + args.device = arg.substr(9); + else if (arg.find("--ib-port=") == 0) + args.ib_port = std::stoi(arg.substr(10)); + else if (arg.find("--port=") == 0) + args.control_port = std::stoi(arg.substr(7)); + else if (arg.find("--bridge-ip=") == 0) + args.bridge_ip = arg.substr(12); + else if (arg.find("--vp-address=") == 0) + args.vp_address = std::stoul(arg.substr(13), nullptr, 0); + else if (arg.find("--hif-address=") == 0) + args.hif_address = std::stoul(arg.substr(14), nullptr, 0); + else if (arg.find("--page-size=") == 0) + args.page_size = std::stoull(arg.substr(12)); + else if (arg == "--help" || arg == "-h") { + print_usage(argv[0]); + exit(0); + } + } + return args; +} + +//============================================================================== +// MAIN +//============================================================================== + +int main(int argc, char *argv[]) { + signal(SIGINT, signal_handler); + signal(SIGTERM, signal_handler); + + try { + auto args = parse_args(argc, argv); + + std::cout << "=== Hololink FPGA Emulator ===" << std::endl; + std::cout << 
"IB Device: " << args.device << std::endl; + std::cout << "Control port: " << args.control_port << std::endl; + std::cout << "VP address: 0x" << std::hex << args.vp_address << std::dec + << std::endl; + + //========================================================================== + // [1/4] Initialize RDMA + //========================================================================== + std::cout << "\n[1/4] Initializing RDMA..." << std::endl; + + RdmaContext rdma; + if (!rdma.open(args.device, args.ib_port)) { + std::cerr << "ERROR: Failed to open RDMA device: " << args.device + << std::endl; + return 1; + } + std::cout << " GID index: " << rdma.get_gid_index() << std::endl; + + // TX buffer for outgoing syndromes + RdmaBuffer tx_buffer; + if (!tx_buffer.allocate(rdma, NUM_BUFFERS * args.page_size)) { + std::cerr << "ERROR: Failed to allocate TX buffer" << std::endl; + return 1; + } + + // RX buffer for incoming responses (same page_size as bridge for + // symmetry) + RdmaBuffer rx_buffer; + if (!rx_buffer.allocate(rdma, NUM_BUFFERS * args.page_size)) { + std::cerr << "ERROR: Failed to allocate RX buffer" << std::endl; + return 1; + } + + // Create CQs and QP + ibv_cq *tx_cq = rdma.create_cq(NUM_BUFFERS * 2); + ibv_cq *rx_cq = rdma.create_cq(NUM_BUFFERS * 2); + if (!tx_cq || !rx_cq) { + std::cerr << "ERROR: Failed to create CQs" << std::endl; + return 1; + } + + ibv_qp *qp = rdma.create_qp(tx_cq, rx_cq, NUM_BUFFERS, NUM_BUFFERS); + if (!qp) { + std::cerr << "ERROR: Failed to create QP" << std::endl; + return 1; + } + if (!rdma.qp_to_init(qp)) { + std::cerr << "ERROR: Failed to set QP to INIT" << std::endl; + return 1; + } + + std::cout << " QP Number: 0x" << std::hex << qp->qp_num << std::dec + << std::endl; + std::cout << " TX buffer: " << tx_buffer.size() << " bytes" << std::endl; + std::cout << " RX buffer: " << rx_buffer.size() << " bytes" << std::endl; + + //========================================================================== + // [2/4] Start UDP 
control plane server + //========================================================================== + std::cout << "\n[2/4] Starting control plane server..." << std::endl; + + RegisterFile regs; + ControlPlaneServer server(args.control_port, args.vp_address, + args.hif_address, regs); + server.set_my_qp(qp->qp_num); + + if (!server.start()) { + std::cerr << "ERROR: Failed to start control plane server" << std::endl; + return 1; + } + std::cout << " Listening on UDP port " << args.control_port << std::endl; + std::cout << " Emulator QP: 0x" << std::hex << qp->qp_num << std::dec + << std::endl; + + //========================================================================== + // [3/4] Wait for RDMA config from playback tool + //========================================================================== + std::cout << "\n[3/4] Waiting for RDMA configuration..." << std::endl; + std::cout << " (Start bridge tool, then playback tool with " + "--control-port=" + << args.control_port << ")" << std::endl; + + if (!server.wait_for_config(300000)) { // 5 minute timeout + std::cerr << "ERROR: Timeout waiting for RDMA configuration" << std::endl; + return 1; + } + + auto &target = server.target(); + target.print(); + + // Connect QP to bridge + ibv_gid remote_gid{}; + if (!args.bridge_ip.empty()) { + // Use provided IP + remote_gid.raw[10] = 0xff; + remote_gid.raw[11] = 0xff; + inet_pton(AF_INET, args.bridge_ip.c_str(), &remote_gid.raw[12]); + } else { + // Derive from VP HOST_IP register if available + uint32_t host_ip = regs.read(args.vp_address + 0x28); // DP_HOST_IP + if (host_ip != 0) { + remote_gid.raw[10] = 0xff; + remote_gid.raw[11] = 0xff; + // DP_HOST_IP is in network byte order from inet_network() + memcpy(&remote_gid.raw[12], &host_ip, 4); + } else { + std::cerr << "ERROR: No bridge IP available. Use --bridge-ip or ensure " + "configure_roce sets HOST_IP." 
+ << std::endl; + return 1; + } + } + + std::cout << " Connecting QP to bridge QP 0x" << std::hex + << target.qp_number << std::dec << "..." << std::endl; + + if (!rdma.qp_to_rtr(qp, remote_gid, target.qp_number, 0)) { + std::cerr << "ERROR: Failed QP -> RTR" << std::endl; + return 1; + } + if (!rdma.qp_to_rts(qp, 0)) { + std::cerr << "ERROR: Failed QP -> RTS" << std::endl; + return 1; + } + std::cout << " QP connected!" << std::endl; + + // Post receive WQEs for responses + for (size_t i = 0; i < NUM_BUFFERS; i++) { + void *addr = + static_cast(rx_buffer.data()) + (i * args.page_size); + if (!rdma.post_recv(qp, i, addr, args.page_size, rx_buffer.lkey())) { + std::cerr << "ERROR: Failed to post receive WQE " << i << std::endl; + return 1; + } + } + std::cout << " Posted " << NUM_BUFFERS << " receive WQEs" << std::endl; + + //========================================================================== + // [4/4] Wait for playback trigger, then run + //========================================================================== + std::cout << "\n[4/4] Waiting for playback trigger..." << std::endl; + + while (!server.playback_triggered() && !g_shutdown) { + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } + + if (g_shutdown) { + std::cout << "Shutdown requested" << std::endl; + return 0; + } + + std::cout << "\n=== Playback triggered ===" << std::endl; + + uint32_t win_size = server.window_size(); + uint32_t win_num = server.window_number(); + uint32_t timer = server.timer_spacing(); + uint32_t cycles_per_window = (win_size + 63) / 64; // 64 bytes per beat + + std::cout << " Window size: " << win_size << " bytes" << std::endl; + std::cout << " Window count: " << win_num << std::endl; + std::cout << " Timer spacing: " << timer << " (raw)" << std::endl; + std::cout << " Cycles per window: " << cycles_per_window << std::endl; + + // Compute pacing interval from timer register (timer = 322 * microseconds) + int pacing_us = (timer > 0) ? 
(timer / 322) : 10; + + // Check if ILA is armed + bool ila_armed = (regs.read(ILA_CTRL) & 0x01) != 0; + std::cout << " ILA capture: " << (ila_armed ? "armed" : "not armed") + << std::endl; + + // Determine page_size for RDMA addressing from target config + uint32_t rdma_page_size = + (target.page_inc > 0) ? target.page_inc : args.page_size; + uint32_t num_pages = target.max_buff + 1; + + std::cout << "\n=== Starting syndrome transmission ===" << std::endl; + + auto start_time = std::chrono::high_resolution_clock::now(); + uint32_t responses_received = 0; + uint32_t send_errors = 0; + uint32_t recv_timeouts = 0; + + for (uint32_t window = 0; window < win_num && !g_shutdown; window++) { + uint32_t remote_slot = window % num_pages; + uint32_t local_slot = window % NUM_BUFFERS; + + // Reassemble syndrome payload from BRAM + auto payload = reassemble_window(regs, window, cycles_per_window); + + // Copy to RDMA TX buffer slot (local buffer has NUM_BUFFERS slots) + uint8_t *tx_addr = static_cast(tx_buffer.data()) + + (local_slot * rdma_page_size); + size_t copy_len = std::min(payload.size(), rdma_page_size); + memcpy(tx_addr, payload.data(), copy_len); + + // RDMA WRITE to bridge's ring buffer (remote has num_pages slots) + uint64_t remote_addr = + target.buffer_addr + (remote_slot * rdma_page_size); + if (!rdma.post_rdma_write_imm(qp, window, tx_addr, copy_len, + tx_buffer.lkey(), remote_addr, target.rkey, + remote_slot)) { + std::cerr << "ERROR: RDMA WRITE failed for window " << window + << std::endl; + send_errors++; + continue; + } + + // Wait for send completion + bool send_ok = false; + auto t0 = std::chrono::steady_clock::now(); + while (!send_ok && !g_shutdown) { + ibv_wc wc; + int n = rdma.poll_cq(tx_cq, &wc, 1); + if (n > 0) { + send_ok = (wc.status == IBV_WC_SUCCESS); + if (!send_ok) { + std::cerr << "ERROR: Send CQE error: " + << ibv_wc_status_str(wc.status) << std::endl; + send_errors++; + } + break; + } + auto elapsed = std::chrono::duration_cast( + 
std::chrono::steady_clock::now() - t0) + .count(); + if (elapsed > 5000) { + std::cerr << "ERROR: Send timeout for window " << window << std::endl; + recv_timeouts++; + break; + } + } + if (!send_ok) + continue; + + // Wait for correction response (natural pacing) + bool corr_ok = false; + t0 = std::chrono::steady_clock::now(); + while (!corr_ok && !g_shutdown) { + ibv_wc wc; + int n = rdma.poll_cq(rx_cq, &wc, 1); + if (n > 0) { + if (wc.status == IBV_WC_SUCCESS) { + corr_ok = true; + responses_received++; + + // Store in ILA capture if armed + if (ila_armed) { + uint32_t rx_slot = wc.wr_id % NUM_BUFFERS; + uint8_t *resp_data = static_cast(rx_buffer.data()) + + (rx_slot * args.page_size); + store_ila_sample(regs, window, resp_data, wc.byte_len); + } + + // Re-post receive WQE + uint32_t rx_slot = wc.wr_id % NUM_BUFFERS; + void *rx_addr = static_cast(rx_buffer.data()) + + (rx_slot * args.page_size); + rdma.post_recv(qp, rx_slot, rx_addr, args.page_size, + rx_buffer.lkey()); + } else { + std::cerr << "ERROR: Recv CQE error: " + << ibv_wc_status_str(wc.status) << std::endl; + } + break; + } + auto elapsed = std::chrono::duration_cast( + std::chrono::steady_clock::now() - t0) + .count(); + if (elapsed > 10000) { + std::cerr << "ERROR: Correction timeout for window " << window + << std::endl; + recv_timeouts++; + break; + } + } + + // Progress + if ((window + 1) % 10 == 0 || window == win_num - 1) { + std::cout << " Window " << (window + 1) << "/" << win_num + << " (responses: " << responses_received + << ", errors: " << send_errors << ")" << std::endl; + } + + // Pacing delay + if (pacing_us > 0 && window + 1 < win_num) { + std::this_thread::sleep_for(std::chrono::microseconds(pacing_us)); + } + } + + auto end_time = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast( + end_time - start_time); + + // Mark ILA as done + if (ila_armed) { + regs.write(ILA_STATUS, regs.read(ILA_STATUS) | 0x02); // done bit + } + + // Report results + 
std::cout << "\n=== Emulator Results ===" << std::endl; + std::cout << " Windows sent: " << win_num << std::endl; + std::cout << " Responses received: " << responses_received << std::endl; + std::cout << " Send errors: " << send_errors << std::endl; + std::cout << " Timeouts: " << recv_timeouts << std::endl; + std::cout << " Duration: " << duration.count() << " ms" << std::endl; + + // Keep running to allow playback tool to read ILA capture data + if (ila_armed) { + std::cout << "\nWaiting for ILA readback (Ctrl+C to stop)..." + << std::endl; + while (!g_shutdown) { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + } + + // Cleanup + server.stop(); + ibv_destroy_qp(qp); + ibv_destroy_cq(tx_cq); + ibv_destroy_cq(rx_cq); + + if (send_errors == 0 && recv_timeouts == 0 && + responses_received == win_num) { + std::cout << "\n*** EMULATOR: ALL WINDOWS PROCESSED ***" << std::endl; + return 0; + } else { + std::cout << "\n*** EMULATOR: ERRORS DETECTED ***" << std::endl; + return 1; + } + + } catch (const std::exception &e) { + std::cerr << "ERROR: " << e.what() << std::endl; + return 1; + } +} diff --git a/realtime/unittests/utils/hololink_fpga_playback.cpp b/realtime/unittests/utils/hololink_fpga_playback.cpp new file mode 100644 index 00000000000..96344567b2d --- /dev/null +++ b/realtime/unittests/utils/hololink_fpga_playback.cpp @@ -0,0 +1,814 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +/// @file hololink_fpga_playback.cpp +/// @brief Generic RPC playback tool for Hololink FPGA / emulator testing. 
+/// +/// Sends RPC messages to the FPGA (or emulator) via the Hololink control +/// plane, triggering RDMA transmission to the bridge. After playback, reads +/// back responses from the ILA capture RAM and verifies them. +/// +/// For the generic bridge, the payload is a sequence of ascending bytes and +/// the expected response is each byte incremented by 1. +/// +/// Usage: +/// ./hololink_fpga_playback \ +/// --hololink 192.168.0.2 \ +/// --bridge-qp=0x5 --bridge-rkey=12345 --bridge-buffer=0x7f... \ +/// --page-size=384 --num-pages=128 --num-messages=100 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h" + +namespace { + +// ============================================================================ +// Playback BRAM Constants +// ============================================================================ +constexpr std::uint32_t PLAYER_ADDR = 0x5000'0000; +constexpr std::uint32_t RAM_ADDR = 0x5010'0000; +constexpr std::uint32_t PLAYER_TIMER_OFFSET = 0x0008; +constexpr std::uint32_t PLAYER_WINDOW_SIZE_OFFSET = 0x000C; +constexpr std::uint32_t PLAYER_WINDOW_NUMBER_OFFSET = 0x0010; +constexpr std::uint32_t PLAYER_ENABLE_OFFSET = 0x0004; +constexpr std::uint32_t RAM_NUM = 16; +constexpr std::uint32_t RAM_DEPTH = 512; + +constexpr std::uint32_t PLAYER_ENABLE_SINGLEPASS = 0x0000'000D; +constexpr std::uint32_t PLAYER_DISABLE = 0x0000'0000; + +constexpr std::uint32_t SIF_TX_THRESHOLD_ADDR = 0x0120'0000; +constexpr std::uint32_t SIF_TX_THRESHOLD_IMMEDIATE = 0x0000'0005; + +constexpr std::uint32_t METADATA_PACKET_ADDR = 0x102C; + +constexpr std::uint32_t DEFAULT_TIMER_SPACING_US = 120; +constexpr std::uint32_t RF_SOC_TIMER_SCALE = 322; + +// ============================================================================ +// ILA Capture Constants (SIF TX at 
0x4000_0000) +// +// Each captured sample is 585 bits: +// [511:0] sif_tx_axis_tdata_0 +// [512] sif_tx_axis_tvalid_0 +// [513] sif_tx_axis_tlast_0 +// [520:514] sif_ila_wr_tcnt_0 +// [584:521] current_ptp_timestamp {sec[31:0], nsec[31:0]} +// ============================================================================ +constexpr std::uint32_t ILA_BASE_ADDR = 0x4000'0000; +constexpr std::uint32_t ILA_CTRL_OFFSET = 0x0000; +constexpr std::uint32_t ILA_SAMPLE_ADDR_OFFSET = 0x0084; +constexpr std::uint32_t ILA_W_DATA = 585; +constexpr std::uint32_t ILA_NUM_RAM = (ILA_W_DATA + 31) / 32; // 19 +constexpr std::uint32_t ILA_W_ADDR = 13; // log2(8192) +constexpr std::uint32_t ILA_W_RAM = 5; // ceil(log2(19)) + +constexpr std::uint32_t ILA_CTRL_ENABLE = 0x0000'0001; +constexpr std::uint32_t ILA_CTRL_RESET = 0x0000'0002; +constexpr std::uint32_t ILA_CTRL_DISABLE = 0x0000'0000; + +constexpr std::uint32_t ILA_TVALID_BIT = 512; + +constexpr std::uint32_t ROCEV2_UDP_PORT = 4791; + +// ============================================================================ +// Arguments +// ============================================================================ + +struct PlaybackArgs { + std::string hololink_ip = "192.168.0.2"; + uint16_t control_port = 8192; + uint32_t bridge_qp = 0; + uint32_t bridge_rkey = 0; + uint64_t bridge_buffer = 0; + size_t page_size = 384; + unsigned num_pages = 128; + uint32_t num_messages = 100; + uint32_t payload_size = 8; + uint32_t vp_address = 0x1000; + uint32_t hif_address = 0x0800; + std::string bridge_ip = "10.0.0.1"; + bool verify = true; + bool emulator = false; + bool forward = false; ///< Forward (echo) mode: accept RPC_MAGIC_REQUEST +}; + +PlaybackArgs parse_args(int argc, char *argv[]) { + PlaybackArgs args; + for (int i = 1; i < argc; i++) { + std::string a = argv[i]; + auto next = [&]() -> std::string { + if (i + 1 < argc) + return argv[++i]; + return {}; + }; + auto val_of = [&](const std::string &prefix) -> std::string { + return 
a.substr(prefix.size()); + }; + if (a.find("--hololink=") == 0) + args.hololink_ip = val_of("--hololink="); + else if (a == "--hololink") + args.hololink_ip = next(); + else if (a.find("--control-ip=") == 0) + args.hololink_ip = val_of("--control-ip="); + else if (a.find("--control-port=") == 0) + args.control_port = std::stoi(val_of("--control-port=")); + else if (a.find("--bridge-qp=") == 0) + args.bridge_qp = std::stoul(val_of("--bridge-qp="), nullptr, 0); + else if (a.find("--bridge-rkey=") == 0) + args.bridge_rkey = std::stoul(val_of("--bridge-rkey="), nullptr, 0); + else if (a.find("--bridge-buffer=") == 0) + args.bridge_buffer = std::stoull(val_of("--bridge-buffer="), nullptr, 0); + else if (a.find("--page-size=") == 0) + args.page_size = std::stoull(val_of("--page-size=")); + else if (a.find("--num-pages=") == 0) + args.num_pages = std::stoul(val_of("--num-pages=")); + else if (a == "--num-pages") + args.num_pages = std::stoul(next()); + else if (a.find("--num-messages=") == 0) + args.num_messages = std::stoul(val_of("--num-messages=")); + else if (a == "--num-messages") + args.num_messages = std::stoul(next()); + else if (a.find("--payload-size=") == 0) + args.payload_size = std::stoul(val_of("--payload-size=")); + else if (a.find("--vp-address=") == 0) + args.vp_address = std::stoul(val_of("--vp-address="), nullptr, 0); + else if (a.find("--hif-address=") == 0) + args.hif_address = std::stoul(val_of("--hif-address="), nullptr, 0); + else if (a.find("--bridge-ip=") == 0) + args.bridge_ip = val_of("--bridge-ip="); + else if (a == "--no-verify") + args.verify = false; + else if (a == "--emulator") + args.emulator = true; + else if (a == "--forward") + args.forward = true; + else if (a == "--help" || a == "-h") { + std::cout + << "Usage: hololink_fpga_playback [options]\n" + << "\nGeneric RPC playback tool for Hololink FPGA/emulator.\n" + << "\nOptions:\n" + << " --hololink ADDR FPGA/emulator IP (default: 192.168.0.2)\n" + << " --control-port=N UDP control 
port (default: 8192)\n" + << " --bridge-qp=N Bridge QP number\n" + << " --bridge-rkey=N Bridge RKey\n" + << " --bridge-buffer=ADDR Bridge buffer address\n" + << " --page-size=N Ring buffer slot size (default: 384)\n" + << " --num-pages N Ring buffer slots (default: 128)\n" + << " --num-messages N Number of RPC messages (default: 100)\n" + << " --payload-size=N Bytes per RPC payload (default: 8)\n" + << " --vp-address=ADDR VP register base (default: 0x1000)\n" + << " --hif-address=ADDR HIF register base (default: 0x0800)\n" + << " --bridge-ip=ADDR Bridge IP for FPGA (default: 10.0.0.1)\n" + << " --emulator Using emulator (skip FPGA reset)\n" + << " --no-verify Skip ILA response verification\n" + << " --forward Forward (echo) mode: accept echoed " + "requests\n"; + exit(0); + } + } + return args; +} + +// ============================================================================ +// BRAM helpers +// ============================================================================ + +std::uint32_t bram_w_sample_addr() { + std::uint32_t w = 0; + while ((1u << w) < RAM_DEPTH) + ++w; + return w; +} + +std::uint32_t load_le_u32(const std::uint8_t *p) { + return std::uint32_t(p[0]) | (std::uint32_t(p[1]) << 8) | + (std::uint32_t(p[2]) << 16) | (std::uint32_t(p[3]) << 24); +} + +/// Build one RPC request for the increment handler. +/// Layout: [RPCHeader (24 bytes, ptp_timestamp zeroed)][data bytes...] +/// The FPGA overwrites header bytes 16-23 (ptp_timestamp field) with the PTP +/// send timestamp at transmit time. 
+std::vector build_rpc_message(uint32_t msg_index, + uint32_t payload_size) { + using cudaq::realtime::fnv1a_hash; + using cudaq::realtime::RPCHeader; + + constexpr uint32_t FUNC_ID = fnv1a_hash("rpc_increment"); + + std::vector msg(sizeof(RPCHeader) + payload_size, 0); + auto *hdr = reinterpret_cast(msg.data()); + hdr->magic = cudaq::realtime::RPC_MAGIC_REQUEST; + hdr->function_id = FUNC_ID; + hdr->arg_len = payload_size; + hdr->request_id = msg_index; + hdr->ptp_timestamp = 0; + + uint8_t *payload = msg.data() + sizeof(RPCHeader); + for (uint32_t i = 0; i < payload_size; i++) + payload[i] = static_cast((msg_index + i) & 0xFF); + + return msg; +} + +/// Pad message to 64-byte aligned window and write to BRAM. +void write_bram(hololink::Hololink &hl, + const std::vector> &windows, + std::size_t bytes_per_window) { + if (bytes_per_window % 64 != 0) + throw std::runtime_error("bytes_per_window must be a multiple of 64"); + + std::size_t cycles = bytes_per_window / 64; + if (cycles == 0) + throw std::runtime_error("bytes_per_window is too small"); + + if (windows.size() * cycles > RAM_DEPTH) { + std::ostringstream msg; + msg << "Requested " << windows.size() << " windows with " << cycles + << " cycles each exceeds RAM depth " << RAM_DEPTH; + throw std::runtime_error(msg.str()); + } + + const std::uint32_t w_sample_addr = bram_w_sample_addr(); + + constexpr std::size_t kBatchWrites = 180; + hololink::Hololink::WriteData write_data; + + for (std::size_t w = 0; w < windows.size(); ++w) { + const auto &window = windows[w]; + for (std::size_t s = 0; s < cycles; ++s) { + for (std::size_t i = 0; i < RAM_NUM; ++i) { + std::size_t word_index = s * RAM_NUM + i; + std::size_t byte_offset = word_index * sizeof(std::uint32_t); + std::uint32_t value = 0; + if (byte_offset + sizeof(std::uint32_t) <= window.size()) + value = load_le_u32(window.data() + byte_offset); + + auto ram_addr = static_cast(i << (w_sample_addr + 2)); + auto sample_addr = static_cast((s + (w * cycles)) * 0x4); 
+ std::uint32_t address = RAM_ADDR + ram_addr + sample_addr; + + write_data.queue_write_uint32(address, value); + if (write_data.size() >= kBatchWrites) { + if (!hl.write_uint32(write_data)) + throw std::runtime_error("Failed to write BRAM batch"); + write_data = hololink::Hololink::WriteData(); + } + } + } + } + + if (write_data.size() > 0) { + if (!hl.write_uint32(write_data)) + throw std::runtime_error("Failed to write BRAM batch"); + } +} + +/// Read back BRAM and verify contents. +bool verify_bram(hololink::Hololink &hl, + const std::vector> &windows, + std::size_t bytes_per_window) { + const std::size_t cycles = bytes_per_window / 64; + const auto total_cycles = static_cast(windows.size() * cycles); + const std::uint32_t w_sample_addr = bram_w_sample_addr(); + + bool all_ok = true; + std::size_t mismatches = 0; + + for (std::uint32_t i = 0; i < RAM_NUM; ++i) { + std::uint32_t bank_base = RAM_ADDR + (i << (w_sample_addr + 2)); + auto [ok, readback] = hl.read_uint32(bank_base, total_cycles, + hololink::Timeout::default_timeout()); + if (!ok) { + std::cerr << "BRAM readback: failed to read bank " << i << "\n"; + return false; + } + + for (std::size_t w = 0; w < windows.size(); ++w) { + const auto &window = windows[w]; + for (std::size_t s = 0; s < cycles; ++s) { + std::size_t word_index = s * RAM_NUM + i; + std::size_t byte_offset = word_index * sizeof(std::uint32_t); + std::uint32_t expected = 0; + if (byte_offset + sizeof(std::uint32_t) <= window.size()) + expected = load_le_u32(window.data() + byte_offset); + + std::size_t sample_idx = w * cycles + s; + std::uint32_t actual = readback[sample_idx]; + + if (actual != expected) { + if (mismatches < 10) { + std::cerr << " BRAM mismatch: bank=" << i + << " sample=" << sample_idx << " expected=0x" << std::hex + << expected << " got=0x" << actual << std::dec << "\n"; + } + all_ok = false; + ++mismatches; + } + } + } + } + + if (mismatches > 10) + std::cerr << " ... 
and " << (mismatches - 10) << " more mismatches\n"; + + return all_ok; +} + +// ============================================================================ +// ILA functions +// ============================================================================ + +void ila_reset(hololink::Hololink &hl) { + if (!hl.write_uint32(ILA_BASE_ADDR + ILA_CTRL_OFFSET, ILA_CTRL_RESET)) + throw std::runtime_error("ILA reset write failed"); + std::this_thread::sleep_for(std::chrono::seconds(1)); + if (!hl.write_uint32(ILA_BASE_ADDR + ILA_CTRL_OFFSET, ILA_CTRL_DISABLE)) + throw std::runtime_error("ILA disable-after-reset write failed"); +} + +void ila_enable(hololink::Hololink &hl) { + if (!hl.write_uint32(ILA_BASE_ADDR + ILA_CTRL_OFFSET, ILA_CTRL_ENABLE)) + throw std::runtime_error("ILA enable write failed"); +} + +void ila_disable(hololink::Hololink &hl) { + if (!hl.write_uint32(ILA_BASE_ADDR + ILA_CTRL_OFFSET, ILA_CTRL_DISABLE)) + throw std::runtime_error("ILA disable write failed"); +} + +std::uint32_t ila_sample_count(hololink::Hololink &hl) { + return hl.read_uint32(ILA_BASE_ADDR + ILA_SAMPLE_ADDR_OFFSET); +} + +/// Read captured ILA samples from RAM banks. Returns +/// vector of samples; each sample is ILA_NUM_RAM uint32 words (LSW-first). +std::vector> ila_dump(hololink::Hololink &hl, + std::uint32_t num_samples) { + constexpr std::uint32_t ctrl_switch = 1u << (ILA_W_ADDR + 2 + ILA_W_RAM); + // Max entries per block read: (1472 - 6 byte header) / 8 bytes per entry. + // Use 128 for comfortable margin on both request and reply packets. 
+ constexpr std::uint32_t kChunkSize = 128; + auto timeout = hololink::Timeout::default_timeout(); + + std::vector> bank_data(ILA_NUM_RAM); + for (std::uint32_t y = 0; y < ILA_NUM_RAM; ++y) { + std::uint32_t bank_base = + ILA_BASE_ADDR + ctrl_switch + (y << (ILA_W_ADDR + 2)); + bank_data[y].reserve(num_samples); + for (std::uint32_t off = 0; off < num_samples; off += kChunkSize) { + std::uint32_t n = std::min(kChunkSize, num_samples - off); + auto [ok, data] = hl.read_uint32(bank_base + off * 4, n, timeout); + if (!ok) + throw std::runtime_error("Failed to read ILA bank " + + std::to_string(y)); + bank_data[y].insert(bank_data[y].end(), data.begin(), data.end()); + } + } + + std::vector> samples( + num_samples, std::vector(ILA_NUM_RAM)); + for (std::uint32_t i = 0; i < num_samples; ++i) + for (std::uint32_t y = 0; y < ILA_NUM_RAM; ++y) + samples[i][y] = bank_data[y][i]; + + return samples; +} + +/// Extract a single bit from a 585-bit ILA sample stored as 19 uint32 words. +bool get_bit(const std::vector &sample, uint32_t bit_pos) { + uint32_t word = bit_pos / 32; + uint32_t offset = bit_pos % 32; + return (sample[word] >> offset) & 1; +} + +/// Extract the first 64 bytes (512 bits) of payload from an ILA sample. +std::vector +extract_payload_bytes(const std::vector &sample, + std::size_t num_bytes) { + std::vector bytes(num_bytes, 0); + for (std::size_t i = 0; i < num_bytes && i < 64; ++i) { + uint32_t word = sample[i / 4]; + bytes[i] = static_cast((word >> ((i % 4) * 8)) & 0xFF); + } + return bytes; +} + +/// Extract the 64-bit current_ptp_timestamp from ILA bits [584:521]. +/// Returns {sec[31:0], nsec[31:0]} packed as a uint64. +std::uint64_t +extract_ila_ptp_timestamp(const std::vector &sample) { + // Bits 521..584 span words 16 and 17 (and partially 18). + // bit 521 = word 16, offset 9 + // We need 64 bits starting at bit 521. 
+ uint64_t raw = 0; + for (int b = 0; b < 64; ++b) { + uint32_t bit_pos = 521 + b; + uint32_t w = bit_pos / 32; + uint32_t off = bit_pos % 32; + if ((sample[w] >> off) & 1) + raw |= (uint64_t(1) << b); + } + return raw; +} + +/// Extract the echoed PTP send timestamp from RPCResponse.ptp_timestamp. +/// The dispatch kernel echoes this field from RPCHeader.ptp_timestamp. +std::uint64_t +extract_echoed_ptp_timestamp(const cudaq::realtime::RPCResponse *resp) { + return resp->ptp_timestamp; +} + +struct PtpTimestamp { + uint32_t sec; + uint32_t nsec; +}; + +PtpTimestamp decode_ptp(uint64_t raw) { + // {sec[31:0], nsec[31:0]} -- sec in upper 32 bits, nsec in lower 32 + return {static_cast(raw >> 32), + static_cast(raw & 0xFFFF'FFFF)}; +} + +/// Compute signed nanosecond difference (recv - send). +int64_t ptp_delta_ns(PtpTimestamp send, PtpTimestamp recv) { + int64_t d_sec = static_cast(recv.sec) - send.sec; + int64_t d_nsec = static_cast(recv.nsec) - send.nsec; + return d_sec * 1'000'000'000LL + d_nsec; +} + +} // namespace + +// ============================================================================ +// Main +// ============================================================================ + +int main(int argc, char *argv[]) { + auto args = parse_args(argc, argv); + + std::cout << "=== Hololink Generic RPC Playback ===" << std::endl; + std::cout << "Hololink: " << args.hololink_ip << std::endl; + std::cout << "Messages: " << args.num_messages << std::endl; + std::cout << "Payload size: " << args.payload_size << " bytes" << std::endl; + + // ------------------------------------------------------------------ + // Build Hololink DataChannel + // ------------------------------------------------------------------ + hololink::Metadata channel_metadata; + + if (args.emulator) { + channel_metadata["channel_ip"] = args.hololink_ip; + channel_metadata["cpnx_ip"] = args.hololink_ip; + channel_metadata["control_port"] = + static_cast(args.control_port); + 
channel_metadata["hsb_ip_version"] = static_cast(0x2501); + channel_metadata["fpga_uuid"] = std::string("emulator"); + channel_metadata["serial_number"] = std::string("emulator-0"); + channel_metadata["peer_ip"] = args.hololink_ip; + channel_metadata["vp_mask"] = static_cast(0x1); + channel_metadata["data_plane"] = static_cast(0); + channel_metadata["sensor"] = static_cast(0); + channel_metadata["sif_address"] = static_cast(0); + channel_metadata["vp_address"] = static_cast(args.vp_address); + channel_metadata["hif_address"] = + static_cast(args.hif_address); + } else { + channel_metadata = hololink::Enumerator::find_channel(args.hololink_ip); + hololink::DataChannel::use_sensor(channel_metadata, 0); + } + + hololink::DataChannel hololink_channel(channel_metadata); + auto hololink = hololink_channel.hololink(); + + hololink->start(); + if (!args.emulator) { + hololink->reset(); + } + + // ------------------------------------------------------------------ + // Configure FPGA SIF for RDMA target + // ------------------------------------------------------------------ + size_t frame_size = sizeof(cudaq::realtime::RPCHeader) + args.payload_size; + size_t bytes_per_window = ((frame_size + 63) / 64) * 64; + + std::cout << "\n[1/4] Configuring RDMA target..." 
<< std::endl; + std::cout << " Bridge QP: 0x" << std::hex << args.bridge_qp << std::dec + << std::endl; + std::cout << " Bridge RKey: " << args.bridge_rkey << std::endl; + std::cout << " Bridge Buffer: 0x" << std::hex << args.bridge_buffer + << std::dec << std::endl; + std::cout << " Page size: " << args.page_size << " bytes" << std::endl; + std::cout << " Num pages: " << args.num_pages << std::endl; + std::cout << " Frame size: " << bytes_per_window << " bytes" << std::endl; + + hololink_channel.authenticate(args.bridge_qp, args.bridge_rkey); + hololink_channel.configure_roce( + args.bridge_buffer, static_cast(bytes_per_window), + static_cast(args.page_size), args.num_pages, ROCEV2_UDP_PORT); + + std::cout << " RDMA target configured" << std::endl; + + // ------------------------------------------------------------------ + // Disable player, configure registers, load BRAM + // ------------------------------------------------------------------ + std::cout << "\n[2/4] Loading RPC messages into BRAM..." 
<< std::endl; + + if (!hololink->write_uint32(PLAYER_ADDR + PLAYER_ENABLE_OFFSET, + PLAYER_DISABLE)) + throw std::runtime_error("Failed to disable player"); + + hololink::Hololink::WriteData config_write; + config_write.queue_write_uint32(PLAYER_ADDR + PLAYER_WINDOW_SIZE_OFFSET, + static_cast(bytes_per_window)); + config_write.queue_write_uint32( + PLAYER_ADDR + PLAYER_WINDOW_NUMBER_OFFSET, + static_cast(args.num_messages)); + config_write.queue_write_uint32(PLAYER_ADDR + PLAYER_TIMER_OFFSET, + RF_SOC_TIMER_SCALE * + DEFAULT_TIMER_SPACING_US); + if (!hololink->write_uint32(config_write)) + throw std::runtime_error("Failed to configure player"); + + // Build and load RPC messages + std::vector> windows; + windows.reserve(args.num_messages); + for (uint32_t i = 0; i < args.num_messages; i++) { + auto msg = build_rpc_message(i, args.payload_size); + msg.resize(bytes_per_window, 0); // pad to window boundary + windows.push_back(std::move(msg)); + } + + write_bram(*hololink, windows, bytes_per_window); + std::cout << " BRAM write completed (" << args.num_messages << " messages)" + << std::endl; + + // Verify BRAM contents + std::cout << " Verifying BRAM..." << std::endl; + if (!verify_bram(*hololink, windows, bytes_per_window)) { + std::cerr << " BRAM readback verification FAILED" << std::endl; + } else { + std::cout << " BRAM readback verification PASSED" << std::endl; + } + + // ------------------------------------------------------------------ + // Arm ILA and trigger playback + // ------------------------------------------------------------------ + std::cout << "\n[3/4] Triggering playback..." << std::endl; + + if (args.verify) { + ila_disable(*hololink); + ila_reset(*hololink); + ila_enable(*hololink); + std::cout << " ILA: armed for capture" << std::endl; + } + + // Disable metadata packet (set bit 16 of METADATA_PACKET_ADDR via RMW) + // Needed for FPGA bitfile 0x0227+; comment out for older bitfiles (e.g. + // 0x2601). 
+ { + std::uint32_t val = hololink->read_uint32(METADATA_PACKET_ADDR); + if (!hololink->write_uint32(METADATA_PACKET_ADDR, val | (1u << 16))) + throw std::runtime_error("Failed to disable metadata packet"); + } + + // Set SIF TX streaming threshold to zero for immediate streaming. + if (!hololink->write_uint32(SIF_TX_THRESHOLD_ADDR, + SIF_TX_THRESHOLD_IMMEDIATE)) + throw std::runtime_error("Failed to set SIF TX streaming threshold"); + + // Enable player in single-pass mode + if (!hololink->write_uint32(PLAYER_ADDR + PLAYER_ENABLE_OFFSET, + PLAYER_ENABLE_SINGLEPASS)) + throw std::runtime_error("Failed to enable player"); + + std::cout << " Playback triggered: " << args.num_messages << " messages" + << std::endl; + + // ------------------------------------------------------------------ + // Wait and verify ILA capture + // ------------------------------------------------------------------ + if (args.verify) { + std::cout << "\n[4/4] Verifying responses..." << std::endl; + + constexpr int kStableChecks = 2; + constexpr int kPollIntervalMs = 500; + constexpr int kVerifyTimeoutMs = 30000; + std::cout << " Waiting for ILA capture to stabilize (timeout " + << kVerifyTimeoutMs << " ms)..." 
<< std::endl; + + std::uint32_t prev_count = 0; + int stable = 0; + int elapsed = 0; + while (elapsed < kVerifyTimeoutMs) { + std::this_thread::sleep_for(std::chrono::milliseconds(kPollIntervalMs)); + elapsed += kPollIntervalMs; + std::uint32_t count = ila_sample_count(*hololink); + if (count > 0 && count == prev_count) + ++stable; + else + stable = 0; + prev_count = count; + if (stable >= kStableChecks) + break; + } + + std::uint32_t actual_samples = ila_sample_count(*hololink); + ila_disable(*hololink); + + if (actual_samples == 0) { + std::cerr << " ILA: captured 0 samples (timeout " << kVerifyTimeoutMs + << " ms)" << std::endl; + return 1; + } + std::cout << " ILA: captured " << actual_samples << " samples" + << std::endl; + + // Read all captured samples + auto samples = ila_dump(*hololink, actual_samples); + std::cout << " Read " << samples.size() << " samples from ILA" + << std::endl; + + uint32_t matched = 0; + uint32_t header_errors = 0; + uint32_t payload_errors = 0; + uint32_t tvalid_zero = 0; + uint32_t rpc_responses = 0; + uint32_t non_rpc_frames = 0; + std::set seen_request_ids; + + int64_t lat_min = std::numeric_limits::max(); + int64_t lat_max = std::numeric_limits::min(); + int64_t lat_sum = 0; + uint32_t lat_count = 0; + + struct LatencySample { + uint32_t shot; + uint32_t send_sec, send_nsec; + uint32_t recv_sec, recv_nsec; + int64_t delta_ns; + }; + std::vector lat_samples; + + for (auto &sample : samples) { + if (!get_bit(sample, ILA_TVALID_BIT)) { + ++tvalid_zero; + continue; + } + + auto payload = extract_payload_bytes(sample, 64); + + auto *resp = reinterpret_cast( + payload.data()); + + uint32_t expected_magic = args.forward + ? 
cudaq::realtime::RPC_MAGIC_REQUEST + : cudaq::realtime::RPC_MAGIC_RESPONSE; + + if (resp->magic != expected_magic) { + ++non_rpc_frames; + continue; + } + + ++rpc_responses; + + if (!args.forward && resp->status != 0) { + std::cerr << " Response request_id=" << resp->request_id + << ": error status " << resp->status << std::endl; + ++header_errors; + continue; + } + + uint32_t rid = resp->request_id; + seen_request_ids.insert(rid); + + if (!args.forward) { + const uint8_t *result_data = + payload.data() + sizeof(cudaq::realtime::RPCResponse); + bool ok = true; + uint32_t check_len = std::min(resp->result_len, args.payload_size); + + for (uint32_t j = 0; j < check_len && ok; j++) { + uint8_t expected = static_cast(((rid + j) & 0xFF) + 1); + if (result_data[j] != expected) { + if (payload_errors < 5) { + std::cerr << " Shot " << rid << " byte " << j << ": expected " + << (int)expected << " got " << (int)result_data[j] + << std::endl; + } + ok = false; + } + } + + if (ok) + ++matched; + else + ++payload_errors; + } else { + ++matched; + } + + // PTP round-trip latency: send timestamp from response header, + // receive timestamp from ILA bits [584:521]. 
+ { + uint64_t send_raw = extract_echoed_ptp_timestamp(resp); + uint64_t recv_raw = extract_ila_ptp_timestamp(sample); + if (send_raw != 0 && recv_raw != 0) { + auto send_ts = decode_ptp(send_raw); + auto recv_ts = decode_ptp(recv_raw); + int64_t delta = ptp_delta_ns(send_ts, recv_ts); + lat_sum += delta; + ++lat_count; + if (delta < lat_min) + lat_min = delta; + if (delta > lat_max) + lat_max = delta; + + lat_samples.push_back({rid, send_ts.sec, send_ts.nsec, recv_ts.sec, + recv_ts.nsec, delta}); + + if (lat_count <= 5) { + std::cout << " Msg " << std::setw(3) << rid + << ": send={sec=" << send_ts.sec + << ", nsec=" << send_ts.nsec + << "} recv={sec=" << recv_ts.sec + << ", nsec=" << recv_ts.nsec << "} delta=" << delta + << " ns" << std::endl; + } + } + } + } + + std::cout << "\n=== Verification Summary ===" << std::endl; + std::cout << " ILA samples captured: " << actual_samples << std::endl; + std::cout << " tvalid=0 (idle): " << tvalid_zero << std::endl; + std::cout << " RPC responses: " << rpc_responses << std::endl; + std::cout << " Non-RPC frames: " << non_rpc_frames << std::endl; + std::cout << " Unique messages verified: " << seen_request_ids.size() + << " of " << args.num_messages << std::endl; + std::cout << " Responses matched: " << matched << std::endl; + std::cout << " Header errors: " << header_errors << std::endl; + std::cout << " Payload errors: " << payload_errors << std::endl; + + if (lat_count > 0) { + double lat_avg = static_cast(lat_sum) / lat_count; + std::cout << "\n=== PTP Round-Trip Latency ===" << std::endl; + std::cout << " Samples: " << lat_count << std::endl; + std::cout << " Min: " << lat_min << " ns" << std::endl; + std::cout << " Max: " << lat_max << " ns" << std::endl; + std::cout << " Avg: " << std::fixed << std::setprecision(1) + << lat_avg << " ns" << std::endl; + const std::string csv_path = "ptp_latency.csv"; + std::ofstream csv(csv_path); + if (csv.is_open()) { + csv << "shot,send_sec,send_nsec,recv_sec,recv_nsec,delta_ns\n"; 
+ for (auto &s : lat_samples) + csv << s.shot << "," << s.send_sec << "," << s.send_nsec << "," + << s.recv_sec << "," << s.recv_nsec << "," << s.delta_ns << "\n"; + csv.close(); + std::cout << " CSV written: " << csv_path << std::endl; + } + } else { + std::cout << "\n PTP latency: no valid timestamps found" << std::endl; + } + + if (payload_errors == 0 && header_errors == 0 && + seen_request_ids.size() > 0) { + std::cout << " RESULT: PASS" << std::endl; + return 0; + } else { + std::cout << " RESULT: FAIL" << std::endl; + return 1; + } + } else { + std::cout << "\n[4/4] Verification skipped (--no-verify)" << std::endl; + std::this_thread::sleep_for(std::chrono::seconds(10)); + std::cout << "\n*** PLAYBACK COMPLETE ***" << std::endl; + return 0; + } +} diff --git a/realtime/unittests/utils/hololink_test.sh b/realtime/unittests/utils/hololink_test.sh new file mode 100755 index 00000000000..0894418799d --- /dev/null +++ b/realtime/unittests/utils/hololink_test.sh @@ -0,0 +1,556 @@ +#!/bin/bash +# ============================================================================ # +# Copyright (c) 2026 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. # +# ============================================================================ # +# +# hololink_test.sh +# +# Orchestration script for end-to-end Hololink RPC dispatch testing. +# Tests libcudaq-realtime dispatch kernel over Hololink RDMA with a +# simple increment RPC handler (no QEC or decoder dependency). 
+# +# Modes: +# Default (FPGA): bridge + playback (requires real FPGA) +# --emulate: emulator + bridge + playback (no FPGA needed) +# +# Actions (can be combined): +# --build Build all required tools +# --setup-network Configure ConnectX interfaces +# (run is implicit unless only --build / --setup-network are given) +# +# Examples: +# # Full emulated test: build, configure network, run +# ./hololink_test.sh --emulate --build --setup-network +# +# # Just run with real FPGA (tools already built, network already set up) +# ./hololink_test.sh --fpga-ip 192.168.0.2 +# +# # Build only +# ./hololink_test.sh --build --no-run +# +set -euo pipefail + +# ============================================================================ +# Defaults +# ============================================================================ + +EMULATE=false +DO_BUILD=false +DO_SETUP_NETWORK=false +DO_RUN=true +VERIFY=true + +# Directory defaults +HOLOLINK_DIR="/workspaces/hololink" +CUDA_QUANTUM_DIR="/workspaces/cuda-quantum" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +BIN_DIR="" + +# Network defaults +IB_DEVICE="" # auto-detect +BRIDGE_IP="10.0.0.1" +EMULATOR_IP="10.0.0.2" +FPGA_IP="192.168.0.2" +MTU=4096 + +# Run defaults +GPU_ID=0 +TIMEOUT=60 +NUM_MESSAGES=100 +PAYLOAD_SIZE=8 +PAGE_SIZE=384 +NUM_PAGES=128 +CONTROL_PORT=8193 +FORWARD=false +UNIFIED=false + +# Build parallelism +JOBS=$(nproc 2>/dev/null || echo 8) + +# ============================================================================ +# Argument Parsing +# ============================================================================ + +print_usage() { + cat <<'EOF' +Usage: hololink_test.sh [options] + +Modes: + --emulate Use FPGA emulator (3-tool mode, no FPGA needed) + Default: FPGA mode (2-tool, requires real FPGA) + +Actions: + --build Build all required tools before running + --setup-network Configure ConnectX network interfaces + --no-run Skip running the test (useful with --build) + +Build options: + 
--hololink-dir DIR      Hololink source directory
+                          (default: /workspaces/hololink)
+  --cuda-quantum-dir DIR  cuda-quantum source directory
+                          (default: /workspaces/cuda-quantum)
+  --jobs N                Parallel build jobs (default: nproc)
+
+Network options:
+  --device DEV            ConnectX IB device name (default: auto-detect)
+  --bridge-ip ADDR        Bridge tool IP (default: 10.0.0.1)
+  --emulator-ip ADDR      Emulator IP (default: 10.0.0.2)
+  --fpga-ip ADDR          FPGA IP for non-emulate mode (default: 192.168.0.2)
+  --mtu N                 MTU size (default: 4096)
+
+Run options:
+  --gpu N                 GPU device ID (default: 0)
+  --timeout N             Timeout in seconds (default: 60)
+  --no-verify             Skip ILA response verification
+  --forward               Pass --forward to the bridge and playback tools
+  --unified               Pass --unified to the bridge tool
+  --num-messages N        Number of RPC messages (default: 100)
+  --payload-size N        Bytes per RPC payload (default: 8)
+  --page-size N           Ring buffer slot size in bytes (default: 384)
+  --num-pages N           Number of ring buffer slots (default: 128)
+  --control-port N        UDP control port for emulator (default: 8193)
+  --bin-dir DIR           Binary directory containing executables (default: None)
+  --help, -h              Show this help
+EOF
+}
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --emulate) EMULATE=true ;;
+    --build) DO_BUILD=true ;;
+    --setup-network) DO_SETUP_NETWORK=true ;;
+    --no-run) DO_RUN=false ;;
+    --no-verify) VERIFY=false ;;
+    --hololink-dir) HOLOLINK_DIR="$2"; shift ;;
+    --cuda-quantum-dir) CUDA_QUANTUM_DIR="$2"; shift ;;
+    --bin-dir) BIN_DIR="$2"; shift ;;
+    --jobs) JOBS="$2"; shift ;;
+    --device) IB_DEVICE="$2"; shift ;;
+    --bridge-ip) BRIDGE_IP="$2"; shift ;;
+    --emulator-ip) EMULATOR_IP="$2"; shift ;;
+    --fpga-ip) FPGA_IP="$2"; shift ;;
+    --mtu) MTU="$2"; shift ;;
+    --gpu) GPU_ID="$2"; shift ;;
+    --forward) FORWARD=true ;;
+    --unified) UNIFIED=true ;;
+    --timeout) TIMEOUT="$2"; shift ;;
+    --num-messages) NUM_MESSAGES="$2"; shift ;;
+    --payload-size) PAYLOAD_SIZE="$2"; shift ;;
+    --page-size) PAGE_SIZE="$2"; shift ;;
+    --num-pages) NUM_PAGES="$2"; shift ;;
+    --control-port) CONTROL_PORT="$2"; shift ;;
+    --help|-h) print_usage; exit 0 ;;
+
*) + echo "ERROR: Unknown option: $1" >&2 + print_usage >&2 + exit 1 + ;; + esac + shift +done + +# ============================================================================ +# Auto-detect IB device +# ============================================================================ + +detect_ib_device() { + if [[ -n "$IB_DEVICE" ]]; then + echo "$IB_DEVICE" + return + fi + local dev + dev=$(ibstat -l 2>/dev/null | head -1 || true) + if [[ -z "$dev" ]]; then + dev=$(ls /sys/class/infiniband/ 2>/dev/null | head -1 || true) + fi + if [[ -z "$dev" ]]; then + echo "ERROR: Could not auto-detect IB device. Use --device." >&2 + exit 1 + fi + echo "$dev" +} + +# ============================================================================ +# Network interface name from IB device +# ============================================================================ + +get_netdev() { + local ib_dev=$1 + local netdev + netdev=$(ls "/sys/class/infiniband/$ib_dev/device/net/" 2>/dev/null | head -1 || true) + echo "$netdev" +} + +# ============================================================================ +# Build +# ============================================================================ + +detect_cuda_arch() { + local max_arch + max_arch=$(nvcc --list-gpu-arch 2>/dev/null \ + | grep -oP 'compute_\K[0-9]+' | sort -n | tail -1) + if [ -n "$max_arch" ]; then + echo "$max_arch" + fi +} + +do_build() { + echo "=== Building tools ===" + + local realtime_dir="$CUDA_QUANTUM_DIR/realtime" + local realtime_build="$realtime_dir/build" + local hololink_build="$HOLOLINK_DIR/build" + + # Detect target arch + local arch + arch=$(uname -m) + local target_arch="amd64" + if [[ "$arch" == "aarch64" ]]; then + target_arch="arm64" + fi + + # Detect highest CUDA arch supported by nvcc + local cuda_arch + cuda_arch=$(detect_cuda_arch) + local cuda_arch_flag="" + if [ -n "$cuda_arch" ]; then + cuda_arch_flag="-DCMAKE_CUDA_ARCHITECTURES=$cuda_arch" + echo " CUDA arch: $cuda_arch" + fi + + # Build hololink 
(only the two libraries we need) + echo "--- Building hololink ($target_arch) ---" + cmake -G Ninja -S "$HOLOLINK_DIR" -B "$hololink_build" \ + -DCMAKE_BUILD_TYPE=Release \ + $cuda_arch_flag \ + -DTARGET_ARCH="$target_arch" \ + -DHOLOLINK_BUILD_ONLY_NATIVE=OFF \ + -DHOLOLINK_BUILD_PYTHON=OFF \ + -DHOLOLINK_BUILD_TESTS=OFF \ + -DHOLOLINK_BUILD_TOOLS=OFF \ + -DHOLOLINK_BUILD_EXAMPLES=OFF \ + -DHOLOLINK_BUILD_EMULATOR=OFF + cmake --build "$hololink_build" -j"$JOBS" \ + --target roce_receiver gpu_roce_transceiver hololink_core + + # Build cuda-quantum/realtime with hololink tools enabled + echo "--- Building cuda-quantum/realtime ---" + cmake -G Ninja -S "$realtime_dir" -B "$realtime_build" \ + -DCMAKE_BUILD_TYPE=Release \ + $cuda_arch_flag \ + -DCUDAQ_REALTIME_ENABLE_HOLOLINK_TOOLS=ON \ + -DHOLOSCAN_SENSOR_BRIDGE_SOURCE_DIR="$HOLOLINK_DIR" \ + -DHOLOSCAN_SENSOR_BRIDGE_BUILD_DIR="$hololink_build" + cmake --build "$realtime_build" -j"$JOBS" \ + --target hololink_bridge hololink_fpga_emulator hololink_fpga_playback + + echo "=== Build complete ===" +} + +# ============================================================================ +# Network setup +# ============================================================================ + +setup_port() { + local iface="$1" + local ip="$2" + local mtu="$3" + + echo " Configuring $iface: ip=$ip mtu=$mtu" + + # Remove this IP from any other interface to prevent routing ambiguity + local other + for other in $(ip -o addr show to "${ip}/24" 2>/dev/null | awk '{print $2}' | sort -u); do + if [[ "$other" != "$iface" ]]; then + echo " Removing stale ${ip}/24 from $other" + sudo ip addr del "${ip}/24" dev "$other" 2>/dev/null || true + fi + done + + sudo ip link set "$iface" up + sudo ip link set "$iface" mtu "$mtu" + sudo ip addr flush dev "$iface" + sudo ip addr add "${ip}/24" dev "$iface" + + # Configure RoCEv2 mode + local ib_dev + if command -v ibdev2netdev &>/dev/null; then + ib_dev=$(ibdev2netdev | awk -v iface="$iface" '$5 == iface 
{ print $1 }') + fi + if [[ -z "$ib_dev" ]]; then + ib_dev=$(basename "$(ls -d /sys/class/net/$iface/device/infiniband/* 2>/dev/null | head -1)" 2>/dev/null || true) + fi + if [[ -n "$ib_dev" ]]; then + local has_rocev2=false + for f in /sys/class/infiniband/${ib_dev}/ports/*/gid_attrs/types/*; do + if [[ -f "$f" ]] && grep -q "RoCE v2" "$f" 2>/dev/null; then + has_rocev2=true; break + fi + done + if $has_rocev2; then + echo " RoCEv2 GID available for $ib_dev" + elif command -v rdma &>/dev/null && rdma link set --help &>/dev/null; then + local port_count + port_count=$(ls -d "/sys/class/infiniband/${ib_dev}/ports/"* 2>/dev/null | wc -l) + for p in $(seq 1 "$port_count"); do + sudo rdma link set "${ib_dev}/${p}" type eth || true + done + echo " RoCEv2 mode configured for $ib_dev" + else + echo " WARNING: Could not verify RoCEv2 mode for $ib_dev" + fi + fi + + # DSCP trust mode for lossless RoCE + if command -v mlnx_qos &>/dev/null; then + sudo mlnx_qos -i "$iface" --trust=dscp 2>/dev/null || true + echo " DSCP trust mode set" + fi + + # Disable adaptive RX coalescing for low latency + if command -v ethtool &>/dev/null; then + sudo ethtool -C "$iface" adaptive-rx off rx-usecs 0 2>/dev/null || true + fi + + echo " Done: $iface is up at $ip" +} + +do_setup_network() { + IB_DEVICE=$(detect_ib_device) + local netdev + netdev=$(get_netdev "$IB_DEVICE") + + echo "=== Setting up network ===" + echo " IB device: $IB_DEVICE" + echo " Net device: $netdev" + + if [[ -z "$netdev" ]]; then + echo "ERROR: No network device found for $IB_DEVICE" >&2 + exit 1 + fi + + if $EMULATE; then + setup_port "$netdev" "$BRIDGE_IP" "$MTU" + sudo ip addr add "$EMULATOR_IP/24" dev "$netdev" 2>/dev/null || true + # Add static ARP entries for loopback + local mac + mac=$(cat /sys/class/net/$netdev/address) + sudo ip neigh replace "$BRIDGE_IP" lladdr "$mac" dev "$netdev" nud permanent 2>/dev/null || true + sudo ip neigh replace "$EMULATOR_IP" lladdr "$mac" dev "$netdev" nud permanent 2>/dev/null || 
true + else + setup_port "$netdev" "$BRIDGE_IP" "$MTU" + fi + + echo "=== Network setup complete ===" +} + +# ============================================================================ +# Run +# ============================================================================ + +cleanup_pids() { + for pid in "${PIDS[@]}"; do + if kill -0 "$pid" 2>/dev/null; then + kill -INT "$pid" 2>/dev/null || true + fi + done + # Give processes a moment to shut down gracefully + sleep 1 + for pid in "${PIDS[@]}"; do + if kill -0 "$pid" 2>/dev/null; then + kill -9 "$pid" 2>/dev/null || true + fi + done + for pid in "${PIDS[@]}"; do + wait "$pid" 2>/dev/null || true + done +} + +do_run() { + IB_DEVICE=$(detect_ib_device) + local build_dir="$CUDA_QUANTUM_DIR/realtime/build" + local utils_dir="$build_dir/unittests/utils" + + if [ -n "$BIN_DIR" ]; then + local bridge_bin="$BIN_DIR/hololink_bridge" + local emulator_bin="$BIN_DIR/hololink_fpga_emulator" + local playback_bin="$BIN_DIR/hololink_fpga_playback" + else + local bridge_bin="$utils_dir/hololink_bridge" + local emulator_bin="$utils_dir/hololink_fpga_emulator" + local playback_bin="$utils_dir/hololink_fpga_playback" + fi + + # Verify binaries exist + for bin in "$bridge_bin"; do + if [[ ! -x "$bin" ]]; then + echo "ERROR: $bin not found. Run with --build first." >&2 + exit 1 + fi + done + + PIDS=() + trap cleanup_pids EXIT + + local FPGA_QP + local FPGA_TARGET_IP + + if $EMULATE; then + echo "=== Emulated mode ===" + + # Start emulator + echo "--- Starting emulator ---" + > /tmp/emulator.log + "$emulator_bin" \ + --device="$IB_DEVICE" \ + --port="$CONTROL_PORT" \ + --bridge-ip="$BRIDGE_IP" \ + --page-size="$PAGE_SIZE" \ + > /tmp/emulator.log 2>&1 & + PIDS+=($!) + tail -f /tmp/emulator.log & + PIDS+=($!) 
+ + # Wait for emulator to print QP number + sleep 2 + FPGA_QP=$(grep -oP 'QP Number: 0x\K[0-9a-fA-F]+' /tmp/emulator.log | head -1) + if [[ -z "$FPGA_QP" ]]; then + echo "ERROR: Could not parse emulator QP from log" >&2 + exit 1 + fi + FPGA_QP="0x$FPGA_QP" + FPGA_TARGET_IP="$EMULATOR_IP" + + echo " Emulator QP: $FPGA_QP" + else + echo "=== FPGA mode ===" + FPGA_QP="0x2" + FPGA_TARGET_IP="$FPGA_IP" + fi + + # Start bridge + echo "--- Starting bridge ---" + > /tmp/bridge.log + local bridge_args=( + --device="$IB_DEVICE" + --peer-ip="$FPGA_TARGET_IP" + --remote-qp="$FPGA_QP" + --gpu="$GPU_ID" + --timeout="$TIMEOUT" + --payload-size="$PAYLOAD_SIZE" + --page-size="$PAGE_SIZE" + --num-pages="$NUM_PAGES" + ) + if $FORWARD; then + bridge_args+=(--forward) + fi + if $UNIFIED; then + bridge_args+=(--unified) + fi + CUDA_MODULE_LOADING=EAGER "$bridge_bin" "${bridge_args[@]}" > /tmp/bridge.log 2>&1 & + BRIDGE_PID=$! + PIDS+=($BRIDGE_PID) + tail -f /tmp/bridge.log & + PIDS+=($!) + + # Wait for bridge to print "Bridge Ready" (CUDA/DOCA init can take 5-15s) + local wait_elapsed=0 + while ! grep -q "Bridge Ready" /tmp/bridge.log 2>/dev/null; do + if ! 
kill -0 "$BRIDGE_PID" 2>/dev/null; then
+      echo "ERROR: Bridge process died during startup" >&2
+      cat /tmp/bridge.log >&2
+      exit 1
+    fi
+    if (( wait_elapsed >= 30 )); then
+      echo "ERROR: Bridge did not become ready within 30s" >&2
+      cat /tmp/bridge.log >&2
+      exit 1
+    fi
+    sleep 1
+    (( wait_elapsed++ )) || true
+  done
+
+  local BRIDGE_QP BRIDGE_RKEY BRIDGE_BUFFER
+  BRIDGE_QP=$(grep -oP 'QP Number: 0x\K[0-9a-fA-F]+' /tmp/bridge.log | tail -1)
+  BRIDGE_RKEY=$(grep -oP 'RKey: \K[0-9]+' /tmp/bridge.log | tail -1)
+  BRIDGE_BUFFER=$(grep -oP 'Buffer Addr: 0x\K[0-9a-fA-F]+' /tmp/bridge.log | tail -1)
+
+  if [[ -z "$BRIDGE_QP" || -z "$BRIDGE_RKEY" || -z "$BRIDGE_BUFFER" ]]; then
+    echo "ERROR: Could not parse bridge QP info from log" >&2
+    echo "  QP=$BRIDGE_QP RKEY=$BRIDGE_RKEY BUFFER=$BRIDGE_BUFFER" >&2
+    exit 1
+  fi
+
+  echo "  Bridge QP:     0x$BRIDGE_QP"
+  echo "  Bridge RKey:   $BRIDGE_RKEY"
+  echo "  Bridge Buffer: 0x$BRIDGE_BUFFER"
+
+  # Start playback
+  echo "--- Starting playback ---"
+  local playback_args=(
+    --hololink="$FPGA_TARGET_IP"
+    --bridge-qp="0x$BRIDGE_QP"
+    --bridge-rkey="$BRIDGE_RKEY"
+    --bridge-buffer="0x$BRIDGE_BUFFER"
+    --page-size="$PAGE_SIZE"
+    --num-pages="$NUM_PAGES"
+    --num-messages="$NUM_MESSAGES"
+    --payload-size="$PAYLOAD_SIZE"
+    --bridge-ip="$BRIDGE_IP"
+  )
+  if $EMULATE; then
+    playback_args+=(--emulator --control-port="$CONTROL_PORT")
+  fi
+  if ! $VERIFY; then
+    playback_args+=(--no-verify)
+  fi
+  if $FORWARD; then
+    playback_args+=(--forward)
+  fi
+
+  # Run playback and capture its exit code explicitly: under `set -e` a bare
+  # `cmd; VAR=$?` would abort the script on a non-zero status before the
+  # PASS/FAIL reporting below could run.
+  PLAYBACK_EXIT=0
+  "$playback_bin" "${playback_args[@]}" || PLAYBACK_EXIT=$?
+ + # Wait for bridge to finish + sleep 2 + + # Cleanup + cleanup_pids + + echo "" + if [[ $PLAYBACK_EXIT -eq 0 ]]; then + echo "*** TEST PASSED ***" + else + echo "*** TEST FAILED ***" + fi + exit $PLAYBACK_EXIT +} + +# ============================================================================ +# Main +# ============================================================================ + +echo "=== Hololink Generic RPC Test ===" +echo "Mode: $(if $EMULATE; then echo "emulated"; else echo "FPGA"; fi)" + +if [ -n "$BIN_DIR" ]; then + if $DO_BUILD; then + echo "Cannot request a build when the binary directory is provided." + fi +fi + +if $DO_BUILD; then + do_build +fi + +if $DO_SETUP_NETWORK; then + do_setup_network +fi + +if $DO_RUN; then + do_run +fi + +echo "Done." diff --git a/realtime/unittests/utils/init_rpc_increment_function_table.cu b/realtime/unittests/utils/init_rpc_increment_function_table.cu new file mode 100644 index 00000000000..260ff96061e --- /dev/null +++ b/realtime/unittests/utils/init_rpc_increment_function_table.cu @@ -0,0 +1,92 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +/// @file init_rpc_increment_function_table.cu +/// @brief Device-side increment RPC handler and function table initialisation. +/// +/// This file is compiled by nvcc so that the __device__ function pointer +/// can be taken. The host-callable setup_rpc_increment_function_table() +/// wrapper is extern "C" so that the bridge .cpp (compiled by g++) can +/// call it without needing CUDA kernel launch syntax. 

#include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h"
#include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h"
#include "cudaq/realtime/daemon/dispatcher/dispatch_modes.h"

// Reconstructed: the system include targets were stripped during extraction.
// <cstdint> supplies std::uint8_t/std::uint32_t; <cuda_runtime.h> supplies
// cudaDeviceSynchronize. TODO(review): confirm against the original file.
#include <cstdint>
#include <cuda_runtime.h>

namespace {

//==============================================================================
// Increment RPC Handler
//==============================================================================

/// @brief Simple RPC handler that increments each byte of the payload by 1.
///
/// Matches the DeviceRPCFunction signature. Reads from @p input, writes to
/// @p output (no in-place overlap).
///
/// @param input          Request payload bytes.
/// @param output         Result buffer provided by the dispatcher.
/// @param arg_len        Number of valid bytes in @p input.
/// @param max_result_len Capacity of @p output in bytes.
/// @param result_len     Out: number of bytes written to @p output.
/// @return 0 on success.
__device__ int rpc_increment_handler(const void *input, void *output,
                                     std::uint32_t arg_len,
                                     std::uint32_t max_result_len,
                                     std::uint32_t *result_len) {
  const std::uint8_t *in_data = static_cast<const std::uint8_t *>(input);
  std::uint8_t *out_data = static_cast<std::uint8_t *>(output);
  // Clamp to the output capacity so the result never overruns the buffer;
  // excess input bytes are silently dropped.
  std::uint32_t len = (arg_len < max_result_len) ? arg_len : max_result_len;
  for (std::uint32_t i = 0; i < len; ++i)
    out_data[i] = static_cast<std::uint8_t>(in_data[i] + 1);
  *result_len = len;
  return 0;
}

// Function id is the FNV-1a hash of the handler name; the client dispatches
// "rpc_increment" requests using the same hash.
constexpr std::uint32_t RPC_INCREMENT_FUNCTION_ID =
    cudaq::realtime::fnv1a_hash("rpc_increment");

/// @brief Kernel to populate a cudaq_function_entry_t with the increment
/// handler.
///
/// Runs on-device because the address of a __device__ function may only be
/// taken in device code. Only thread (0,0) writes the entry.
__global__ void init_function_table_kernel(cudaq_function_entry_t *entries) {
  if (threadIdx.x == 0 && blockIdx.x == 0) {
    // NOTE(review): the cast's target type was stripped during extraction;
    // void* assumes handler.device_fn_ptr is an untyped pointer slot —
    // confirm against cudaq_function_entry_t's declaration.
    entries[0].handler.device_fn_ptr =
        reinterpret_cast<void *>(&rpc_increment_handler);
    entries[0].function_id = RPC_INCREMENT_FUNCTION_ID;
    entries[0].dispatch_mode = CUDAQ_DISPATCH_DEVICE_CALL;
    entries[0].reserved[0] = 0;
    entries[0].reserved[1] = 0;
    entries[0].reserved[2] = 0;

    // Schema: 1 array argument (uint8), 1 array result (uint8); sizes are
    // left 0 (variable-length arrays).
    entries[0].schema.num_args = 1;
    entries[0].schema.num_results = 1;
    entries[0].schema.reserved = 0;
    entries[0].schema.args[0].type_id = CUDAQ_TYPE_ARRAY_UINT8;
    entries[0].schema.args[0].reserved[0] = 0;
    entries[0].schema.args[0].reserved[1] = 0;
    entries[0].schema.args[0].reserved[2] = 0;
    entries[0].schema.args[0].size_bytes = 0;
    entries[0].schema.args[0].num_elements = 0;
    entries[0].schema.results[0].type_id = CUDAQ_TYPE_ARRAY_UINT8;
    entries[0].schema.results[0].reserved[0] = 0;
    entries[0].schema.results[0].reserved[1] = 0;
    entries[0].schema.results[0].reserved[2] = 0;
    entries[0].schema.results[0].size_bytes = 0;
    entries[0].schema.results[0].num_elements = 0;
  }
}

} // anonymous namespace

//==============================================================================
// Host-Callable Wrapper
//==============================================================================

/// @brief Host entry point: launches a single-thread kernel to fill
/// d_entries[0] with the increment handler, then blocks until it completes.
/// extern "C" so the g++-compiled bridge can call it without CUDA syntax.
extern "C" void
setup_rpc_increment_function_table(cudaq_function_entry_t *d_entries) {
  init_function_table_kernel<<<1, 1>>>(d_entries);
  cudaDeviceSynchronize();
}