Skip to content

Commit f4690ce

Browse files
committed
ci: add basic gpu ci tests
1 parent 98b12de commit f4690ce

1 file changed

Lines changed: 150 additions & 0 deletions

File tree

.github/workflows/gpu-tests.yml

Lines changed: 150 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,150 @@
1+
# Workflow identity, trigger filter and concurrency policy for the GPU smoke tests.
name: Basic_GPU_Tests

# Run only for pull requests targeting main that touch the library sources,
# the examples, the packaging metadata, or this workflow file itself.
on:
  pull_request:
    branches:
      - main
    paths:
      - 'cache-dit/src/**'
      - 'cache-dit/examples/**'
      - 'pyproject.toml'
      - '.github/workflows/gpu-tests.yml'  # Updated workflow file path

# One run per ref: a newer push cancels the run still in progress for the same ref.
concurrency:
  group: '${{ github.ref }}-gpu-tests'
  cancel-in-progress: true
15+
16+
jobs:
  flux-model-test:
    # A runner must carry all three labels to pick up this job.
    runs-on:
      - self-hosted
      - gpu
      - private-server
    # Token scopes: read the repository contents, write pull-request comments.
    permissions:
      contents: read
      pull-requests: write
22+
23+
steps:
24+
- name: 🔍 Environment Precheck (Container/Model/GPU)
  # Fails fast when the GPU, the cache_dit_ci_test container, or the FLUX.1-dev
  # model directory inside that container is unavailable.
  run: |
    echo "=== Server GPU Information ==="
    nvidia-smi

    echo "=== Running Container Check ==="
    # `docker inspect` exits non-zero for an unknown container; map that to a sentinel status.
    CONTAINER_STATUS=$(docker inspect -f '{{.State.Status}}' cache_dit_ci_test 2>/dev/null || echo "not_exists")
    if [ "${CONTAINER_STATUS}" != "running" ]; then
      echo "❌ Container cache_dit_ci_test is not running (Status: ${CONTAINER_STATUS}), please start the container first!"
      exit 1
    fi
    echo "✅ Container cache_dit_ci_test is running"

    echo "=== HF_MODELS Env Var Check in Container ==="
    # Check HF_MODELS (required by generate.py).
    # `printenv NAME` returns the whole value (even if it contains '=') and exits
    # non-zero when the variable is unset, hence the `|| true`.
    DEFAULT_HF_MODELS='/workspace/dev/vipdev/hf_models'
    HF_MODELS=$(docker exec cache_dit_ci_test printenv HF_MODELS 2>/dev/null || true)
    if [ -z "${HF_MODELS}" ]; then
      # An `export` run through `docker exec ... bash -c` only lives as long as
      # that one shell, so it can neither change the container's environment nor
      # this script's variable. Resolve the default on the host side instead, so
      # the log line and the directory check below use the real path.
      echo "⚠️ HF_MODELS is not configured in container, using default path ${DEFAULT_HF_MODELS} for this check"
      HF_MODELS="${DEFAULT_HF_MODELS}"
    fi
    echo "✅ HF_MODELS in container: ${HF_MODELS}"

    # Verify model path exists, e.g., FLUX.1-dev.
    # The path is passed as a positional argument rather than spliced into the
    # command string, so quotes or spaces in HF_MODELS cannot break the check.
    MODEL_DIR="${HF_MODELS}/FLUX.1-dev"
    if docker exec cache_dit_ci_test bash -c '[ -d "$1" ]' _ "${MODEL_DIR}"; then
      echo '✅ Model directory exists'
    else
      echo "❌ Model directory does not exist: ${MODEL_DIR}"
      exit 1
    fi
47+
48+
- name: 📥 Pull PR Code
  # Check out exactly the PR head commit, with a shallow (single-commit) history.
  uses: actions/checkout@v4
  with:
    fetch-depth: 1
    ref: ${{ github.event.pull_request.head.sha }}
53+
54+
- name: 📝 Write Test Execution Script (Reuse Existing Container)
  # Generates run_gpu_tests.sh in the workspace. The heredoc delimiter is quoted
  # ('EOF'), so nothing below is expanded while the file is written; every
  # variable is resolved only when the script itself runs.
  run: |
    cat > run_gpu_tests.sh << 'EOF'
    #!/bin/bash
    # Abort on the first failing command (a Python exception fails the job),
    # on use of an unset variable, and on a failure anywhere in a pipeline.
    set -euo pipefail

    # Define key paths
    LOCAL_CODE_DIR="${PWD}"                           # Local PR code directory
    CONTAINER_CODE_DIR="/workspace/cache-dit-ci"      # Code directory in container
    CACHE_DIT_DIR="${CONTAINER_CODE_DIR}/cache-dit"   # cache-dit root directory in container
    EXAMPLES_DIR="${CACHE_DIT_DIR}/examples"          # examples directory in container
    # Model root used when the container does not define HF_MODELS itself.
    DEFAULT_HF_MODELS="/workspace/dev/vipdev/hf_models"

    # 1. Recreate the code directory in the container.
    #    `docker cp` merges into an existing directory, so files deleted by the
    #    PR would otherwise survive from an earlier run and could be tested.
    echo "📁 Create code directory in container: ${CONTAINER_CODE_DIR}"
    docker exec cache_dit_ci_test rm -rf "${CONTAINER_CODE_DIR}"
    docker exec cache_dit_ci_test mkdir -p "${CONTAINER_CODE_DIR}"

    # 2. Copy local PR code to container
    echo "📤 Copy PR code to container..."
    docker cp "${LOCAL_CODE_DIR}/." cache_dit_ci_test:"${CONTAINER_CODE_DIR}/"

    # 3. Check cache-dit directory and examples directory existence in container.
    #    The ${...} paths are expanded by this script before the string reaches
    #    the container shell; only single quotes are used inside the string so
    #    the surrounding double quotes are never closed early.
    echo "🔍 Check code directories and scripts..."
    docker exec cache_dit_ci_test bash -c "
      if [ ! -d '${CACHE_DIT_DIR}' ]; then
        echo '❌ cache-dit directory does not exist: ${CACHE_DIT_DIR}'
        exit 1
      fi
      if [ ! -d '${EXAMPLES_DIR}' ]; then
        echo '❌ examples directory does not exist: ${EXAMPLES_DIR}'
        exit 1
      fi
      echo '✅ Code directory check passed'
      echo '=== Contents of code root directory in container ==='
      ls -l '${CONTAINER_CODE_DIR}'
    "

    # 4. Install cache-dit (cd to cache-dit directory and execute installation;
    #    add --no-cache-dir to `pip install .` if compilation is needed)
    echo "🔧 Install cache-dit..."
    docker exec cache_dit_ci_test bash -c "
      cd '${CACHE_DIT_DIR}' &&
      echo '=== Contents of current directory (cache-dit) ===' &&
      ls -l &&
      echo '=== Start installing cache-dit ===' &&
      pip install -U pip &&
      pip install .
    "

    # 5. Execute generate.py script under examples directory.
    #    Escaped \$ and \" are left for the container shell; it exports HF_MODELS
    #    with the default above when the container has no value of its own, so
    #    --model-path never degrades to '/FLUX.1-dev'.
    echo "🚀 Execute generate.py in examples directory..."
    # 5.1 Baseline: FLUX.1-dev w/o any acceleration
    docker exec cache_dit_ci_test bash -c "
      export HF_MODELS=\"\${HF_MODELS:-${DEFAULT_HF_MODELS}}\" &&
      cd '${EXAMPLES_DIR}' &&
      echo '=== Contents of current directory (examples) ===' &&
      ls -l &&
      echo '=== Execute python3 generate.py list ===' &&
      python3 generate.py list &&
      echo '=== Execute python3 generate.py flux ===' &&
      python3 generate.py flux --model-path \"\$HF_MODELS/FLUX.1-dev\" --track-memory --summary &&
      echo '=== Contents of examples directory after execution ===' &&
      ls -l
    "

    # 5.2 FLUX.1-dev w/ cache acceleration, use --cache option
    docker exec cache_dit_ci_test bash -c "
      export HF_MODELS=\"\${HF_MODELS:-${DEFAULT_HF_MODELS}}\" &&
      cd '${EXAMPLES_DIR}' &&
      echo '=== Execute python3 generate.py flux with cache acceleration ===' &&
      python3 generate.py flux --model-path \"\$HF_MODELS/FLUX.1-dev\" --cache --track-memory --summary &&
      echo '=== Contents of examples directory after cache acceleration execution ===' &&
      ls -l
    "

    # 6. Completion message
    echo "✅ All test steps completed successfully!"
    EOF
    chmod +x run_gpu_tests.sh
130+
131+
- name: 🚀 Execute Model Test
  # Runs the generated script; a non-zero exit status fails the step.
  # 1200 minutes = 20 hours. Adjust according to actual test duration.
  timeout-minutes: 1200
  run: ./run_gpu_tests.sh
135+
136+
- name: 📤 Test Result Feedback (On Failure)
  # Posts a PR comment linking to this run whenever an earlier step failed.
  if: failure()
  # Commenting is best-effort: the job is already failed, and a comment error
  # should not add a second, misleading failure on top of the real one.
  continue-on-error: true
  env:
    GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    # Expression values are handed over as environment variables instead of
    # being pasted into the script text.
    PR_NUMBER: ${{ github.event.pull_request.number }}
    RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  run: |
    echo "❌ GPU Model Test failed!"
    # --repo is explicit because this step also runs when a step before the
    # checkout failed, in which case the workspace may not contain a git
    # repository for gh to infer the target from.
    gh pr comment "${PR_NUMBER}" --repo "${GITHUB_REPOSITORY}" --body "❌ GPU Model Test failed, check CI logs: ${RUN_URL}"
143+
144+
- name: 📤 Test Result Feedback (On Success)
  # Posts a confirmation comment on the PR once every earlier step succeeded.
  if: success()
  # Commenting is best-effort: if the comment cannot be posted (for example the
  # token is read-only, as GitHub documents for pull requests from forks, or gh
  # is unavailable on the runner), a passing GPU run must not be turned red.
  continue-on-error: true
  env:
    GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    # Expression values are handed over as environment variables instead of
    # being pasted into the script text.
    PR_NUMBER: ${{ github.event.pull_request.number }}
  run: |
    echo "✅ GPU Model Test Succeeded!"
    # --repo makes the target repository explicit rather than inferred from the
    # local git checkout.
    gh pr comment "${PR_NUMBER}" --repo "${GITHUB_REPOSITORY}" --body "✅ GPU Model Test Passed!"

0 commit comments

Comments
 (0)