-
Notifications
You must be signed in to change notification settings - Fork 239
Expand file tree
/
Copy pathtest-brev-tutorial-docker-images.yml
More file actions
188 lines (159 loc) · 7.58 KB
/
test-brev-tutorial-docker-images.yml
File metadata and controls
188 lines (159 loc) · 7.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
# Tests the Docker images of a single Brev tutorial and reports the result as a
# commit status. Triggered only through `workflow_dispatch`; all three inputs
# are required strings.
name: Test Brev Tutorial Docker Images

on:
  workflow_dispatch:
    inputs:
      # Used to build the artifact name/path and the commit status context.
      tutorial:
        description: 'Tutorial name to test'
        required: true
        type: string
      # Commit whose status is updated; its first 7 characters also form part
      # of the artifact name.
      git_sha:
        description: 'Git commit SHA to update status for'
        required: true
        type: string
      # Run of build-brev-tutorial-docker-images.yml that produced the
      # Docker Compose artifact.
      workflow_run_id:
        description: 'Workflow run ID to download artifacts from'
        required: true
        type: string

jobs:
  # Downloads the tutorial's Docker Compose artifact, pre-pulls the images it
  # references, runs ./brev/test-docker-compose.bash against it, and posts a
  # success/failure commit status on the SHA given by `inputs.git_sha`.
  test-tutorial:
    name: test-tutorial (${{ inputs.tutorial }})
    runs-on: linux-amd64-gpu-l4-latest-1
    defaults:
      run:
        working-directory: ${{ github.workspace }}
    permissions:
      # Declaring any permission sets every unlisted scope to `none`, so the
      # read scopes used by the steps below are listed explicitly.
      actions: read    # download the artifact of another workflow run
      contents: read   # actions/checkout
      packages: read   # pull images from ghcr.io with GITHUB_TOKEN
      statuses: write  # post the commit status
    env:
      BUILDKIT_PROGRESS: plain
      DOCKER_CLI_HINTS: 'false'
      # Defined at job level so the status-update steps still have the SHA
      # when an earlier step (e.g. checkout) fails.
      GIT_SHA: ${{ inputs.git_sha }}
    steps:
      - name: Show runner info
        run: |
          echo "Runner name: ${{ runner.name }}"
          echo "Runner OS: ${{ runner.os }}"
          echo "Runner arch: ${{ runner.arch }}"
          echo "Runner uname: $(uname -a)"

      - name: Checkout repository
        uses: actions/checkout@v4

      # Exports GIT_BRANCH_NAME, DOCKER_TAG_BRANCH and GIT_SHORT_SHA to later
      # steps through $GITHUB_ENV. GIT_SHA itself comes from the job-level env.
      - name: Set Git branch variables
        run: |
          GIT_BRANCH_NAME=${GITHUB_REF#refs/heads/}
          # Sanitize branch name for Docker tags (replace invalid characters with hyphens and convert to lowercase)
          DOCKER_TAG_BRANCH=$(echo "${GIT_BRANCH_NAME}" | sed 's/[^a-zA-Z0-9._-]/-/g' | tr '[:upper:]' '[:lower:]')
          GIT_SHORT_SHA=${GIT_SHA:0:7}
          {
            echo "GIT_BRANCH_NAME=${GIT_BRANCH_NAME}"
            echo "DOCKER_TAG_BRANCH=${DOCKER_TAG_BRANCH}"
            echo "GIT_SHORT_SHA=${GIT_SHORT_SHA}"
          } >> "$GITHUB_ENV"

      - name: Download commit-specific Docker Compose artifact
        uses: dawidd6/action-download-artifact@v6
        with:
          workflow: build-brev-tutorial-docker-images.yml
          run_id: ${{ inputs.workflow_run_id }}
          name: docker-compose-${{ inputs.tutorial }}-${{ env.DOCKER_TAG_BRANCH }}-git-${{ env.GIT_SHORT_SHA }}
          path: artifacts/commit-specific/${{ inputs.tutorial }}/

      - name: Log in to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # Every command here is best-effort (`|| echo ...`): a missing service or
      # process must not fail the job.
      - name: Stop DCGM to allow NCU profiling
        run: |
          # DCGM (Data Center GPU Manager) locks the GPU and prevents NCU from profiling.
          # Stop it before running the container tests.
          echo "Stopping DCGM services..."
          # Stop the dcgm-exporter Docker container
          echo "Stopping dcgm-exporter Docker container..."
          docker stop dcgm-exporter 2>/dev/null && echo "Stopped dcgm-exporter container" || echo "dcgm-exporter container not running"
          docker rm -f dcgm-exporter 2>/dev/null || echo "dcgm-exporter container already removed"
          # Stop systemd services
          sudo systemctl stop dcgm-exporter || echo "dcgm-exporter service not found or already stopped"
          sudo systemctl stop nvidia-dcgm || echo "nvidia-dcgm service not found or already stopped"
          sudo systemctl stop dcgm || echo "dcgm service not found or already stopped"
          sudo systemctl stop nv-hostengine || echo "nv-hostengine service not found or already stopped"
          # Kill any remaining dcgm processes
          sudo pkill -9 nv-hostengine || echo "No nv-hostengine processes found"
          sudo pkill -9 dcgm || echo "No dcgm processes found"
          # Relax profiling permissions (perf_event_paranoid=4 is very restrictive)
          echo "Relaxing profiling permissions..."
          sudo sysctl -w kernel.perf_event_paranoid=2 || echo "Could not set perf_event_paranoid"
          sudo sysctl -w kernel.kptr_restrict=0 || echo "Could not set kptr_restrict"
          echo "DCGM services stopped and profiling permissions relaxed."

      # Diagnostics only: each probe falls back to an echo so this step never
      # fails the job.
      - name: Debug GPU and NCU configuration
        run: |
          echo "=== GPU Information ==="
          nvidia-smi || echo "nvidia-smi failed"
          echo ""
          echo "=== NVIDIA Driver Version ==="
          cat /proc/driver/nvidia/version 2>/dev/null || echo "Could not read driver version"
          echo ""
          echo "=== GPU Processes ==="
          nvidia-smi pmon -c 1 2>/dev/null || echo "nvidia-smi pmon failed"
          echo ""
          echo "=== DCGM/NCU Blocking Processes ==="
          ps aux | grep -E "(dcgm|nv-hostengine|ncu)" | grep -v grep || echo "No DCGM/NCU processes found"
          echo ""
          echo "=== Systemd Services (nvidia/dcgm related) ==="
          systemctl list-units --type=service | grep -iE "(nvidia|dcgm|gpu)" || echo "No matching services"
          echo ""
          echo "=== Profiling Permissions ==="
          cat /proc/sys/kernel/perf_event_paranoid 2>/dev/null || echo "Could not read perf_event_paranoid"
          cat /proc/sys/kernel/kptr_restrict 2>/dev/null || echo "Could not read kptr_restrict"
          echo ""
          echo "=== NVIDIA Kernel Modules ==="
          lsmod | grep nvidia || echo "No nvidia modules loaded"
          echo ""
          echo "=== /dev/nvidia* devices ==="
          ls -la /dev/nvidia* 2>/dev/null || echo "No /dev/nvidia* devices found"
          echo ""

      - name: Pre-pull Docker images
        env:
          # Passed through the environment instead of being expanded into the
          # script text, so the input value is never parsed as shell code.
          TUTORIAL: ${{ inputs.tutorial }}
        run: |
          COMPOSE_FILE="artifacts/commit-specific/${TUTORIAL}/brev/docker-compose.yml"
          if [ ! -f "$COMPOSE_FILE" ]; then
            echo "::error::Compose file not found: $COMPOSE_FILE"
            exit 1
          fi
          # Extract all unique images from the compose file
          # The main image is defined with a YAML anchor like: image: &image ghcr.io/...
          MAIN_IMAGE=$(grep "image: &image" "$COMPOSE_FILE" | sed 's/.*image: &image //')
          if [ -z "$MAIN_IMAGE" ]; then
            # Fallback: try to find any ghcr.io image reference
            MAIN_IMAGE=$(grep -o 'ghcr.io/[^"]*' "$COMPOSE_FILE" | head -1)
          fi
          # Fail with a clear message rather than letting `docker pull ""` error out.
          if [ -z "$MAIN_IMAGE" ]; then
            echo "::error::No main image reference found in $COMPOSE_FILE"
            exit 1
          fi
          # Extract the nsight image (nvcr.io); it is optional.
          NSIGHT_IMAGE=$(grep -o 'nvcr.io/[^"]*' "$COMPOSE_FILE" | head -1)
          echo "Pre-pulling main image: $MAIN_IMAGE"
          docker pull "$MAIN_IMAGE"
          if [ -n "$NSIGHT_IMAGE" ]; then
            echo "Pre-pulling nsight image: $NSIGHT_IMAGE"
            docker pull "$NSIGHT_IMAGE"
          fi
          echo "All images pulled successfully"

      - name: Test Docker Compose
        id: test
        env:
          TUTORIAL: ${{ inputs.tutorial }}
        run: |
          ./brev/test-docker-compose.bash "artifacts/commit-specific/${TUTORIAL}/brev/docker-compose.yml"

      # The status context deliberately uses "tutorials/<name>" while the job
      # name uses "<name>"; both are kept exactly as they were.
      - name: Update commit status to success
        if: success()
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          TUTORIAL: ${{ inputs.tutorial }}
        run: |
          gh api \
            --method POST \
            -H "Accept: application/vnd.github+json" \
            "/repos/${{ github.repository }}/statuses/${GIT_SHA}" \
            -f state='success' \
            -f target_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" \
            -f description='Tests passed' \
            -f context="Test Brev Tutorial Docker Images / test-tutorial (tutorials/${TUTORIAL})"

      - name: Update commit status to failure
        if: failure()
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          TUTORIAL: ${{ inputs.tutorial }}
        run: |
          gh api \
            --method POST \
            -H "Accept: application/vnd.github+json" \
            "/repos/${{ github.repository }}/statuses/${GIT_SHA}" \
            -f state='failure' \
            -f target_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" \
            -f description='Tests failed' \
            -f context="Test Brev Tutorial Docker Images / test-tutorial (tutorials/${TUTORIAL})"