|
# Basic GPU smoke test for pull requests.
#
# Runs on a self-hosted GPU runner and reuses an already-running Docker
# container named `cache_dit_ci_test`. The job:
#   1. verifies the GPU, the container and the FLUX.1-dev model directory,
#   2. checks out the PR head commit,
#   3. copies the checkout into the container, installs it with pip and runs
#      examples/generate.py (baseline and with --cache),
#   4. comments the result on the pull request.
#
# NOTE(review): PR code is executed on a self-hosted runner inside a
# long-lived container; confirm that untrusted (fork) PRs cannot trigger it.
name: Basic_GPU_Tests

on:
  pull_request:
    branches: [ main ]
    # NOTE(review): the test script below copies the checkout into the
    # container and then expects a `cache-dit/` sub-directory inside it;
    # confirm that the repository layout matches these path filters.
    paths:
      - 'cache-dit/src/**'
      - 'cache-dit/examples/**'
      - pyproject.toml
      - '.github/workflows/gpu-tests.yml' # Updated workflow file path

# Only one run per ref; a newer push cancels the run still in progress.
concurrency:
  group: ${{ github.ref }}-gpu-tests
  cancel-in-progress: true

jobs:
  flux-model-test:
    runs-on: [self-hosted, gpu, private-server]
    permissions:
      contents: read
      pull-requests: write # needed by the `gh pr comment` feedback steps

    steps:
      - name: 🔍 Environment Precheck (Container/Model/GPU)
        run: |
          echo "=== Server GPU Information ==="
          nvidia-smi

          echo "=== Running Container Check ==="
          # `docker inspect` fails when the container does not exist; map that
          # case to a sentinel value instead of aborting without a message.
          CONTAINER_STATUS=$(docker inspect -f '{{.State.Status}}' cache_dit_ci_test 2>/dev/null || echo "not_exists")
          if [[ "${CONTAINER_STATUS}" != "running" ]]; then
            echo "❌ Container cache_dit_ci_test is not running (Status: ${CONTAINER_STATUS}), please start the container first!"
            exit 1
          fi
          echo "✅ Container cache_dit_ci_test is running"

          echo "=== HF_MODELS Env Var Check in Container ==="
          # Check HF_MODELS (required by generate.py).
          # printenv exits non-zero when the variable is unset, hence `|| true`.
          HF_MODELS=$(docker exec cache_dit_ci_test printenv HF_MODELS || true)
          if [[ -z "${HF_MODELS}" ]]; then
            echo "⚠️ HF_MODELS is not configured in container, setting to default path /workspace/dev/vipdev/hf_models"
            # An `export` inside `docker exec ... bash -c` dies with that shell,
            # so the default is resolved here on the host instead.
            HF_MODELS='/workspace/dev/vipdev/hf_models'
          fi
          echo "✅ HF_MODELS in container: ${HF_MODELS}"
          # Publish the resolved path to later steps; the test script forwards
          # it into the container with `docker exec -e`.
          echo "HF_MODELS=${HF_MODELS}" >> "${GITHUB_ENV}"

          # Verify model path exists, e.g., FLUX.1-dev
          if docker exec cache_dit_ci_test test -d "${HF_MODELS}/FLUX.1-dev"; then
            echo '✅ Model directory exists'
          else
            echo '❌ Model directory does not exist'
            exit 1
          fi

      - name: 📥 Pull PR Code
        uses: actions/checkout@v4
        with:
          ref: ${{ github.event.pull_request.head.sha }}
          fetch-depth: 1

      - name: 📝 Write Test Execution Script (Reuse Existing Container)
        run: |
          # Quoted delimiter: the script is written verbatim, nothing below is
          # expanded at write time.
          cat > run_gpu_tests.sh << 'EOF'
          #!/bin/bash
          # Copies the PR checkout into the running container, installs
          # cache-dit there and runs the FLUX.1-dev example with and without
          # cache acceleration. Any failing command fails the script.
          set -euo pipefail

          # Define key paths
          LOCAL_CODE_DIR="${PWD}"                          # Local PR code directory
          CONTAINER_CODE_DIR="/workspace/cache-dit-ci"     # Code directory in container
          CACHE_DIT_DIR="${CONTAINER_CODE_DIR}/cache-dit"  # cache-dit root directory in container
          EXAMPLES_DIR="${CACHE_DIT_DIR}/examples"         # examples directory in container
          # Model root forwarded into the container; normally exported by the
          # precheck step, with the same default as a fallback.
          HF_MODELS="${HF_MODELS:-/workspace/dev/vipdev/hf_models}"

          # 1. (Re)create code directory in container. The container is reused
          #    between runs, so remove the previous copy first; otherwise files
          #    deleted by the PR would survive the `docker cp` below.
          echo "📁 Create code directory in container: ${CONTAINER_CODE_DIR}"
          docker exec cache_dit_ci_test rm -rf "${CONTAINER_CODE_DIR:?}"
          docker exec cache_dit_ci_test mkdir -p "${CONTAINER_CODE_DIR}"

          # 2. Copy local PR code to container
          echo "📤 Copy PR code to container..."
          docker cp "${LOCAL_CODE_DIR}/." cache_dit_ci_test:"${CONTAINER_CODE_DIR}/"

          # 3. Check cache-dit directory and examples directory in container.
          #    The host shell expands the ${...} paths before the command string
          #    is handed to the container's bash.
          echo "🔍 Check code directories and scripts..."
          docker exec cache_dit_ci_test bash -c "
            if [ ! -d '${CACHE_DIT_DIR}' ]; then
              echo '❌ cache-dit directory does not exist: ${CACHE_DIT_DIR}'
              exit 1
            fi
            if [ ! -d '${EXAMPLES_DIR}' ]; then
              echo '❌ examples directory does not exist: ${EXAMPLES_DIR}'
              exit 1
            fi
            echo '✅ Code directory check passed'
            echo '=== Contents of code root directory in container ==='
            ls -l '${CONTAINER_CODE_DIR}'
          "

          # 4. Install cache-dit from its source directory
          #    (add --no-cache-dir to `pip install .` if compilation is needed).
          echo "🔧 Install cache-dit..."
          docker exec cache_dit_ci_test bash -c "
            cd '${CACHE_DIT_DIR}' &&
            echo '=== Contents of current directory (cache-dit) ===' &&
            ls -l &&
            echo '=== Start installing cache-dit ===' &&
            pip install -U pip &&
            pip install .
          "

          # 5. Execute generate.py script under examples directory.
          #    \$HF_MODELS is escaped so that it is expanded inside the
          #    container, where `-e` has set it.
          echo "🚀 Execute generate.py in examples directory..."
          # 5.1 Baseline: FLUX.1-dev w/o any acceleration
          docker exec -e HF_MODELS="${HF_MODELS}" cache_dit_ci_test bash -c "
            cd '${EXAMPLES_DIR}' &&
            echo '=== Contents of current directory (examples) ===' &&
            ls -l &&
            echo '=== Execute python3 generate.py list ===' &&
            python3 generate.py list &&
            echo '=== Execute python3 generate.py flux ===' &&
            python3 generate.py flux --model-path \"\$HF_MODELS/FLUX.1-dev\" --track-memory --summary &&
            echo '=== Contents of examples directory after execution ===' &&
            ls -l
          "

          # 5.2 FLUX.1-dev w/ cache acceleration, use --cache option
          docker exec -e HF_MODELS="${HF_MODELS}" cache_dit_ci_test bash -c "
            cd '${EXAMPLES_DIR}' &&
            echo '=== Execute python3 generate.py flux with cache acceleration ===' &&
            python3 generate.py flux --model-path \"\$HF_MODELS/FLUX.1-dev\" --cache --track-memory --summary &&
            echo '=== Contents of examples directory after cache acceleration execution ===' &&
            ls -l
          "

          # 6. Completion message
          echo "✅ All test steps completed successfully!"
          EOF
          chmod +x run_gpu_tests.sh

      - name: 🚀 Execute Model Test
        run: |
          ./run_gpu_tests.sh
        # NOTE(review): 1200 minutes is 20 hours; adjust according to actual test duration.
        timeout-minutes: 1200

      - name: 📤 Test Result Feedback (On Failure)
        if: failure()
        run: |
          echo "❌ GPU Model Test failed!"
          gh pr comment ${{ github.event.pull_request.number }} --body "❌ GPU Model Test failed, check CI logs: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: 📤 Test Result Feedback (On Success)
        if: success()
        run: |
          echo "✅ GPU Model Test Succeeded!"
          gh pr comment ${{ github.event.pull_request.number }} --body "✅ GPU Model Test Passed!"
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
0 commit comments