Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 10 additions & 4 deletions .github/workflows/check.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ jobs:

- name: Lint yaml files
run: |
yamllint .yamllint snap/snapcraft.yaml
yamllint .yamllint snap/snapcraft.yaml.in

unit:
name: Unit
Expand Down Expand Up @@ -106,11 +106,16 @@ jobs:
fail-fast: false
matrix:
runs-on: [[ubuntu-24.04], [self-hosted, jammy, ARM64]]
cuda-version: [11, 12, 13]
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0 # Complete git history is required to generate the version from git tags.

- name: Build snapcraft.yaml files
run: |
CUDA_VERSION=${{ matrix.cuda-version }} envsubst '$CUDA_VERSION' < snap/snapcraft.yaml.in > snap/snapcraft.yaml

- name: Verify snap builds successfully
id: build
uses: canonical/action-build@v1
Expand All @@ -121,7 +126,7 @@ jobs:
- name: Upload the built snap
uses: actions/upload-artifact@v4
with:
name: snap_${{ env.SYSTEM_ARCH }}
name: snap_${{ matrix.cuda-version }}_${{ env.SYSTEM_ARCH }}
path: ${{ steps.build.outputs.snap }}

func:
Expand All @@ -132,6 +137,7 @@ jobs:
fail-fast: false
matrix:
runs-on: [[ubuntu-24.04], [self-hosted, jammy, ARM64]]
cuda-version: [11, 12, 13]
steps:
- uses: actions/checkout@v4
with:
Expand All @@ -143,7 +149,7 @@ jobs:
- name: Download snap file artifact
uses: actions/download-artifact@v4
with:
name: snap_${{ env.SYSTEM_ARCH }}
name: snap_${{ matrix.cuda-version }}_${{ env.SYSTEM_ARCH }}

- name: Set up Python 3.10
uses: actions/setup-python@v5
Expand All @@ -165,7 +171,7 @@ jobs:
run: |
BASE_VERSION_SANITIZED=${{ matrix.runs-on }}
BASE_VERSION_SANITIZED=${BASE_VERSION_SANITIZED//./-}
echo "ARTIFACT_SUFFIX=$BASE_VERSION_SANITIZED-${{ env.SYSTEM_ARCH }}" >> $GITHUB_ENV
echo "ARTIFACT_SUFFIX=$BASE_VERSION_SANITIZED-${{ matrix.cuda-version }}-${{ env.SYSTEM_ARCH }}" >> $GITHUB_ENV

- name: Rename Functional Test Coverage Artifact
run: |
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -42,5 +42,8 @@ dist/
*~
*.bak

# snap/snapcraft.yaml is generated from snap/snapcraft.yaml.in (via envsubst), so it is not tracked
snap/snapcraft.yaml

# Note: for editor-specific files, please don't add them here, as they are specific to your environment, not the project.
# Instead, consider using a global gitignore on your workstation.
2 changes: 1 addition & 1 deletion snap/local/files/run_dcgm_exporter.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
set -euo pipefail

# Build the argument list for the dcgm-exporter command
args=()
args=("--disable-startup-validate")

nv_hostengine_port="$(snapctl get nv-hostengine-port)"
dcgm_exporter_address="$(snapctl get dcgm-exporter-address)"
Expand Down
14 changes: 0 additions & 14 deletions snap/local/files/run_dcgmproftester10.sh

This file was deleted.

63 changes: 32 additions & 31 deletions snap/snapcraft.yaml → snap/snapcraft.yaml.in
Original file line number Diff line number Diff line change
Expand Up @@ -21,29 +21,29 @@ description: |

**How to install the snap:**

sudo snap install dcgm
sudo snap install dcgm

**How to enable metrics collection:**

# Start the DCGM-Exporter service (disabled by default)
sudo snap start dcgm.dcgm-exporter
# Start the DCGM-Exporter service (disabled by default)
sudo snap start dcgm.dcgm-exporter

# Get the metrics
curl -s localhost:9400/metrics
# Get the metrics
curl -s localhost:9400/metrics

**How to configure the snap services:**

The NV-Hostengine and DCGM-Exporter services can be configured via the `snap` CLI.\
For example:

# Get all the configuration options
sudo snap get dcgm
# Get all the configuration options
sudo snap get dcgm

# Set the NV-Hostengine port
sudo snap set dcgm nv-hostengine-port=5577
# Set the NV-Hostengine port
sudo snap set dcgm nv-hostengine-port=5577

# Restart the NV-Hostengine service to apply the changes
sudo snap restart dcgm.nv-hostengine
# Restart the NV-Hostengine service to apply the changes
sudo snap restart dcgm.nv-hostengine

**Reference**
---
Expand Down Expand Up @@ -129,20 +129,8 @@ apps:
restart-condition: on-abort
environment:
DCGM_HOME_DIR: "${SNAP_COMMON}"
dcgmproftester10:
command: bin/run_dcgmproftester10.sh
plugs:
- network-bind
- opengl
- hardware-observe
dcgmproftester11:
command: usr/bin/dcgmproftester11
plugs:
- network-bind
- opengl
- hardware-observe
dcgmproftester12:
command: usr/bin/dcgmproftester12
dcgmproftester:
command: usr/bin/dcgmproftester${CUDA_VERSION}
plugs:
- network-bind
- opengl
Expand All @@ -165,19 +153,29 @@ parts:
after:
- cuda-sources
plugin: nil
stage-packages: [datacenter-gpu-manager=1:3.3.8]

override-pull: |
craftctl default
apt download datacenter-gpu-manager-4-cuda${CUDA_VERSION}=1:4.4.1-1
apt download $(apt-cache depends datacenter-gpu-manager-4-cuda${CUDA_VERSION} | awk '/Depends:/ {print $2}')

override-build: |
craftctl default
dpkg-deb -x datacenter-gpu-manager-4-cuda${CUDA_VERSION}_*.deb $SNAPCRAFT_PART_INSTALL/
dpkg-deb -x datacenter-gpu-manager-4-core*.deb $SNAPCRAFT_PART_INSTALL/

# override prime to set version
override-prime: |
craftctl default
# Locate dcgm .deb file
DEB_FILE=$(ls $HOME/parts/dcgm/stage_packages/datacenter-gpu-manager_*.deb)
DEB_FILE=$(ls $HOME/parts/dcgm/src/datacenter-gpu-manager-4-cuda*.deb)

# Extract the version from the .deb file
DCGM_VERSION=$(dpkg-deb -f "$DEB_FILE" Version)
GIT_VERSION=$(git -C $CRAFT_PROJECT_DIR describe --always --dirty --abbrev=10)
GIT_VERSION=$(git -C $CRAFT_PROJECT_DIR describe --always)

# Derive the snap version from the dcgm deb version (epoch prefix "1:" stripped) combined with the git revision
craftctl set version="${DCGM_VERSION#1:}+snap-${GIT_VERSION}"
craftctl set version="${CUDA_VERSION}-${DCGM_VERSION#1:}-${GIT_VERSION}"

# This is the DCGM exporter
dcgm-exporter:
Expand All @@ -186,10 +184,10 @@ parts:
- go
source: https://github.com/NVIDIA/dcgm-exporter.git
source-type: git
source-tag: 3.3.8-3.6.0
# override build to get the default csv files from the upstream
override-build: |
craftctl default

mkdir -p $SNAPCRAFT_PART_INSTALL/etc/dcgm-exporter
cp etc/default-counters.csv etc/dcp-metrics-included.csv $SNAPCRAFT_PART_INSTALL/etc/dcgm-exporter/

Expand All @@ -200,8 +198,11 @@ parts:
organize:
run_nv_hostengine.sh: bin/
run_dcgm_exporter.sh: bin/
run_dcgmproftester10.sh: bin/

layout:
/etc/dcgm-exporter:
symlink: $SNAP/etc/dcgm-exporter
/usr/lib/x86_64-linux-gnu/libdcgm.so.4:
bind-file: $SNAP/usr/lib/x86_64-linux-gnu/libdcgm.so.4.4.1
/usr/libexec/datacenter-gpu-manager-4:
bind: $SNAP/usr/libexec/datacenter-gpu-manager-4
1 change: 1 addition & 0 deletions tests/functional/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ def install_dcgm_snap():

# Manually connect the 'hardware-observe' interface as auto-connect is not allowed
subprocess.check_call("sudo snap connect dcgm:hardware-observe".split())
subprocess.check_call("sudo snap connect dcgm:opengl".split())

subprocess.check_call("sudo snap start dcgm.dcgm-exporter".split())

Expand Down