Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion snap/local/files/run_dcgm_exporter.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
set -euo pipefail

# Build the argument list for the dcgm-exporter command
args=()
args=("--disable-startup-validate")

nv_hostengine_port="$(snapctl get nv-hostengine-port)"
dcgm_exporter_address="$(snapctl get dcgm-exporter-address)"
Expand Down
14 changes: 0 additions & 14 deletions snap/local/files/run_dcgmproftester10.sh

This file was deleted.

26 changes: 15 additions & 11 deletions snap/snapcraft.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -129,12 +129,6 @@ apps:
restart-condition: on-abort
environment:
DCGM_HOME_DIR: "${SNAP_COMMON}"
dcgmproftester10:
command: bin/run_dcgmproftester10.sh
plugs:
- network-bind
- opengl
- hardware-observe
dcgmproftester11:
command: usr/bin/dcgmproftester11
plugs:
Expand All @@ -147,6 +141,12 @@ apps:
- network-bind
- opengl
- hardware-observe
dcgmproftester13:
command: usr/bin/dcgmproftester13
plugs:
- network-bind
- opengl
- hardware-observe

parts:
# This is a workaround for package-repositories not supporting multiple architectures
Expand All @@ -165,16 +165,17 @@ parts:
after:
- cuda-sources
plugin: nil
stage-packages: [datacenter-gpu-manager=1:3.3.8]
stage-packages:
- datacenter-gpu-manager-4-cuda-all=1:4.4.1-1
# Override the prime step to set the snap version from the staged dcgm .deb file
override-prime: |
craftctl default
# Locate dcgm .deb file
DEB_FILE=$(ls $HOME/parts/dcgm/stage_packages/datacenter-gpu-manager_*.deb)
DEB_FILE=$(ls $HOME/parts/dcgm/stage_packages/datacenter-gpu-manager-4-core_*.deb)

# Extract the version from the .deb file
DCGM_VERSION=$(dpkg-deb -f "$DEB_FILE" Version)
GIT_VERSION=$(git -C $CRAFT_PROJECT_DIR describe --always --dirty --abbrev=10)
GIT_VERSION=$(git -C $CRAFT_PROJECT_DIR describe --always)

# Set the snap version to match the version of the dcgm .deb file
craftctl set version="${DCGM_VERSION#1:}+snap-${GIT_VERSION}"
Expand All @@ -186,10 +187,10 @@ parts:
- go
source: https://github.com/NVIDIA/dcgm-exporter.git
source-type: git
source-tag: 3.3.8-3.6.0
# Override the build step to copy the default CSV files from upstream
override-build: |
craftctl default

mkdir -p $SNAPCRAFT_PART_INSTALL/etc/dcgm-exporter
cp etc/default-counters.csv etc/dcp-metrics-included.csv $SNAPCRAFT_PART_INSTALL/etc/dcgm-exporter/

Expand All @@ -200,8 +201,11 @@ parts:
organize:
run_nv_hostengine.sh: bin/
run_dcgm_exporter.sh: bin/
run_dcgmproftester10.sh: bin/

layout:
/etc/dcgm-exporter:
symlink: $SNAP/etc/dcgm-exporter
/usr/lib/x86_64-linux-gnu/libdcgm.so.4:
bind-file: $SNAP/usr/lib/x86_64-linux-gnu/libdcgm.so.4.4.1
/usr/libexec/datacenter-gpu-manager-4:
bind: $SNAP/usr/libexec/datacenter-gpu-manager-4
1 change: 1 addition & 0 deletions tests/functional/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ def install_dcgm_snap():

# Manually connect the 'hardware-observe' interface as auto-connect is not allowed
subprocess.check_call("sudo snap connect dcgm:hardware-observe".split())
subprocess.check_call("sudo snap connect dcgm:opengl".split())

subprocess.check_call("sudo snap start dcgm.dcgm-exporter".split())

Expand Down