Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 10 additions & 4 deletions .github/workflows/check.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ jobs:

- name: Lint yaml files
run: |
yamllint .yamllint snap/snapcraft.yaml
yamllint .yamllint snap/snapcraft.yaml.in

unit:
name: Unit
Expand Down Expand Up @@ -106,11 +106,16 @@ jobs:
fail-fast: false
matrix:
runs-on: [[ubuntu-24.04], [self-hosted, jammy, ARM64]]
cuda-version: [11, 12, 13]
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0 # Complete git history is required to generate the version from git tags.

- name: Build snapcraft.yaml files
run: |
CUDA_VERSION=${{ matrix.cuda-version }} envsubst '$CUDA_VERSION' < snap/snapcraft.yaml.in > snap/snapcraft.yaml

- name: Verify snap builds successfully
id: build
uses: canonical/action-build@v1
Expand All @@ -121,7 +126,7 @@ jobs:
- name: Upload the built snap
uses: actions/upload-artifact@v4
with:
name: snap_${{ env.SYSTEM_ARCH }}
name: snap_${{ matrix.cuda-version }}_${{ env.SYSTEM_ARCH }}
path: ${{ steps.build.outputs.snap }}

func:
Expand All @@ -132,6 +137,7 @@ jobs:
fail-fast: false
matrix:
runs-on: [[ubuntu-24.04], [self-hosted, jammy, ARM64]]
cuda-version: [11, 12, 13]
steps:
- uses: actions/checkout@v4
with:
Expand All @@ -143,7 +149,7 @@ jobs:
- name: Download snap file artifact
uses: actions/download-artifact@v4
with:
name: snap_${{ env.SYSTEM_ARCH }}
name: snap_${{ matrix.cuda-version }}_${{ env.SYSTEM_ARCH }}

- name: Set up Python 3.10
uses: actions/setup-python@v5
Expand All @@ -165,7 +171,7 @@ jobs:
run: |
BASE_VERSION_SANITIZED=${{ matrix.runs-on }}
BASE_VERSION_SANITIZED=${BASE_VERSION_SANITIZED//./-}
echo "ARTIFACT_SUFFIX=$BASE_VERSION_SANITIZED-${{ env.SYSTEM_ARCH }}" >> $GITHUB_ENV
echo "ARTIFACT_SUFFIX=$BASE_VERSION_SANITIZED-${{ matrix.cuda-version }}-${{ env.SYSTEM_ARCH }}" >> $GITHUB_ENV

- name: Rename Functional Test Coverage Artifact
run: |
Expand Down
10 changes: 9 additions & 1 deletion .github/workflows/promote.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,15 @@ on:
description: 'Channel Promotion, e.g. latest/edge -> latest/candidate'
required: true
type: choice
options: ['latest/edge -> latest/candidate', 'latest/candidate -> latest/stable', 'core24/edge -> core24/candidate', 'core24/candidate -> core24/stable']
options:
[
'v4-cuda11/edge -> v4-cuda11/candidate',
'v4-cuda11/candidate -> v4-cuda11/stable',
'v4-cuda12/edge -> v4-cuda12/candidate',
'v4-cuda12/candidate -> v4-cuda12/stable',
'v4-cuda13/edge -> v4-cuda13/candidate',
'v4-cuda13/candidate -> v4-cuda13/stable'
]

jobs:
promote-snap:
Expand Down
5 changes: 3 additions & 2 deletions .github/workflows/release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ jobs:
fail-fast: false
matrix:
runs-on: [[ubuntu-24.04], [self-hosted, jammy, ARM64]]
cuda-version: [11, 12, 13]
steps:
- uses: actions/checkout@v4
with:
Expand All @@ -34,7 +35,7 @@ jobs:
- name: Download the built snap from check workflow
uses: actions/download-artifact@v4
with:
name: snap_${{ env.SYSTEM_ARCH }}
name: snap_${{ matrix.cuda-version }}_${{ env.SYSTEM_ARCH }}

- name: Find the downloaded snap file
run: echo "SNAP_FILE=$(find . -name "*.snap")" >> $GITHUB_ENV
Expand All @@ -45,7 +46,7 @@ jobs:
with:
snap: ${{ env.SNAP_FILE }}
# Comma-separated list of channels to release the snap to.
release: core24/edge,latest/edge
release: v4-cuda${{ matrix.cuda-version }}/edge

notify-on-release-failure:
runs-on: ubuntu-latest
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -42,5 +42,8 @@ dist/
*~
*.bak

# snapcraft file is dynamically generated
snap/snapcraft.yaml

# Note: for editor-specific files, please don't add them here, as they are specific to your environment, not the project.
# Instead, consider using a global gitignore on your workstation.
16 changes: 15 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,24 @@ The snap consists of [dcgm](https://developer.nvidia.com/dcgm) and [dcgm-exporte

## Build the snap

From v4 onwards, the `snapcraft.yaml` file is dynamically generated using the `snapcraft.yaml.in` file
as a template. To create the file, run:

```shell
# generate v4 using cuda11 packages
CUDA_VERSION=11 envsubst '$CUDA_VERSION' < snap/snapcraft.yaml.in > snap/snapcraft.yaml

# generate v4 using cuda12 packages
CUDA_VERSION=12 envsubst '$CUDA_VERSION' < snap/snapcraft.yaml.in > snap/snapcraft.yaml

# generate v4 using cuda13 packages
CUDA_VERSION=13 envsubst '$CUDA_VERSION' < snap/snapcraft.yaml.in > snap/snapcraft.yaml
```

You can build the snap locally by using the command:

```shell
snapcraft --use-lxd
snapcraft pack -v
```

## Test
Expand Down
6 changes: 4 additions & 2 deletions snap/local/files/run_dcgm_exporter.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
set -euo pipefail

# Build the argument list for the dcgm-exporter command
args=()
# Startup validation dynamically generates and checks libs, which is not possible under snap confinement.
# See https://github.com/NVIDIA/dcgm-exporter/issues/553
args=("--disable-startup-validate")

nv_hostengine_port="$(snapctl get nv-hostengine-port)"
dcgm_exporter_address="$(snapctl get dcgm-exporter-address)"
Expand All @@ -25,4 +27,4 @@ else
echo "Error: DCGM exporter metrics file not found or empty: $dcgm_exporter_metrics_file_path, using default"
fi

exec "$SNAP/bin/dcgm-exporter" "${args[@]}"
exec "dcgm-exporter" "${args[@]}"
14 changes: 0 additions & 14 deletions snap/local/files/run_dcgmproftester10.sh

This file was deleted.

76 changes: 45 additions & 31 deletions snap/snapcraft.yaml → snap/snapcraft.yaml.in
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,27 @@ description: |
**How-To**
---

**How to select a channel:**

You need to have NVIDIA drivers installed on your system to use this snap.\
You can check the version of your NVIDIA driver with the command:

cat /proc/driver/nvidia/version

With that information, you can check the compatible CUDA version here:\
https://docs.nvidia.com/deploy/cuda-compatibility/minor-version-compatibility.html

Keep in mind that:
DCGM v4 has different releases for different CUDA versions. E.g. if your system has driver\
version >= 580, it uses CUDA 13, so you should install the snap from the `v4-cuda13` channel:

sudo snap install dcgm --channel=v4-cuda13/stable

DCGM v3 ships a single package with binaries that work from CUDA 10 up to CUDA 12.

**How to install the snap:**

sudo snap install dcgm
sudo snap install dcgm --channel=<CHANNEL>

**How to enable metrics collection:**

Expand Down Expand Up @@ -129,20 +147,8 @@ apps:
restart-condition: on-abort
environment:
DCGM_HOME_DIR: "${SNAP_COMMON}"
dcgmproftester10:
command: bin/run_dcgmproftester10.sh
plugs:
- network-bind
- opengl
- hardware-observe
dcgmproftester11:
command: usr/bin/dcgmproftester11
plugs:
- network-bind
- opengl
- hardware-observe
dcgmproftester12:
command: usr/bin/dcgmproftester12
dcgmproftester:
command: usr/bin/dcgmproftester${CUDA_VERSION}
plugs:
- network-bind
- opengl
Expand All @@ -165,33 +171,38 @@ parts:
after:
- cuda-sources
plugin: nil
stage-packages: [datacenter-gpu-manager=1:3.3.8]

build-packages:
- git

override-pull: |
craftctl default
apt download datacenter-gpu-manager-4-cuda${CUDA_VERSION}=1:4.4.1-1
apt download $(apt-cache depends datacenter-gpu-manager-4-cuda${CUDA_VERSION} | awk '/Depends:/ {print $2}')

override-build: |
craftctl default
dpkg-deb -x datacenter-gpu-manager-4-cuda${CUDA_VERSION}_*.deb $SNAPCRAFT_PART_INSTALL/
dpkg-deb -x datacenter-gpu-manager-4-core*.deb $SNAPCRAFT_PART_INSTALL/

# override prime to set version
override-prime: |
craftctl default
# Locate dcgm .deb file
DEB_FILE=$(ls $HOME/parts/dcgm/stage_packages/datacenter-gpu-manager_*.deb)
DEB_FILE=$(ls $HOME/parts/dcgm/src/datacenter-gpu-manager-4-cuda*.deb)

# Extract the version from the .deb file
DCGM_VERSION=$(dpkg-deb -f "$DEB_FILE" Version)
GIT_VERSION=$(git -C $CRAFT_PROJECT_DIR describe --always --dirty --abbrev=10)
GIT_VERSION=$(git -C $CRAFT_PROJECT_DIR describe --always)

# Set the Snap version to the same as dcgm deb file
craftctl set version="${DCGM_VERSION#1:}+snap-${GIT_VERSION}"
craftctl set version="${CUDA_VERSION}-${DCGM_VERSION#1:}-${GIT_VERSION}"

# This is the DCGM exporter
dcgm-exporter:
plugin: go
build-snaps:
- go
source: https://github.com/NVIDIA/dcgm-exporter.git
source-type: git
source-tag: 3.3.8-3.6.0
# override build to get the default csv files from the upstream
override-build: |
craftctl default
mkdir -p $SNAPCRAFT_PART_INSTALL/etc/dcgm-exporter
cp etc/default-counters.csv etc/dcp-metrics-included.csv $SNAPCRAFT_PART_INSTALL/etc/dcgm-exporter/
plugin: nil
stage-packages:
- datacenter-gpu-manager-exporter=4.5.2-1

# wrappers supporting snap options
wrapper:
Expand All @@ -200,8 +211,11 @@ parts:
organize:
run_nv_hostengine.sh: bin/
run_dcgm_exporter.sh: bin/
run_dcgmproftester10.sh: bin/

layout:
/etc/dcgm-exporter:
symlink: $SNAP/etc/dcgm-exporter
/usr/lib/x86_64-linux-gnu/libdcgm.so.4:
bind-file: $SNAP/usr/lib/x86_64-linux-gnu/libdcgm.so.4.4.1
/usr/libexec/datacenter-gpu-manager-4:
bind: $SNAP/usr/libexec/datacenter-gpu-manager-4
1 change: 1 addition & 0 deletions tests/functional/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ def install_dcgm_snap():

# Manually connect the 'hardware-observe' interface as auto-connect is not allowed
subprocess.check_call("sudo snap connect dcgm:hardware-observe".split())
subprocess.check_call("sudo snap connect dcgm:opengl".split())

subprocess.check_call("sudo snap start dcgm.dcgm-exporter".split())

Expand Down