Patch notes (free text ahead of the first "diff --git" header; not part of any hunk).

What this patch does to .github/workflows/H-Coverage.yml:
  * Points the three docker_image values at the ubuntu24-cuda129-dev tag
    instead of cuda129-coverage-test.
  * Exports LD_LIBRARY_PATH with the CUDA 12.9 library directories at the top
    of each in-container script (fleet install, paddle whl install, and the
    single-card / multi-card test steps).
  * Adds pytest, matplotlib and parameterized to the pip install line and
    installs the yq binary into /usr/local/bin/yq.
  * Moves the "paddlefleet commit" print from the fleet install step to the end
    of the paddle whl install step.
  * Raises the single-card test timeout from 15m to 40m.

Review adjustments relative to the submitted patch (added lines only, so every
hunk header keeps its original line counts):
  * LD_LIBRARY_PATH now keeps any value already present in the container by
    appending it after the CUDA directories; the submitted form replaced it.
  * yq is fetched from a pinned release tag (v4.44.3) rather than
    releases/latest, so CI does not change when upstream publishes a release.
    The tag is a reviewer choice; bump it deliberately when needed.

NOTE(review): the patch text reached review with its line breaks collapsed, so
the leading whitespace of the workflow lines below is reconstructed from the
usual GitHub Actions layout. Confirm it against the target file before applying.

diff --git a/.github/workflows/H-Coverage.yml b/.github/workflows/H-Coverage.yml
index a9346ee097882..6d8018261840f 100644
--- a/.github/workflows/H-Coverage.yml
+++ b/.github/workflows/H-Coverage.yml
@@ -386,7 +386,7 @@ jobs:
     timeout-minutes: 60
     env:
       TASK: fleet-ci-paddle-build-whl-${{ github.event.pull_request.number }}
-      docker_image: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:cuda129-coverage-test"
+      docker_image: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:ubuntu24-cuda129-dev"
     steps:
       - name: Check docker image and run container
         env:
@@ -501,7 +501,7 @@ jobs:
       - name: Check docker image and run container
         env:
           GPU_DEVICES: ${{ env.GPU_DEVICES }}
-          docker_image: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:cuda129-coverage-test"
+          docker_image: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:ubuntu24-cuda129-dev"
         run: |
           container_name=${TASK}-$(date +%Y%m%d-%H%M%S)
           echo "container_name=${container_name}" >> ${{ github.env }}
@@ -536,13 +536,14 @@ jobs:
           docker exec -t ${{ env.container_name }} /bin/bash -ce '
           rm -rf * .[^.]*
           source /root/proxy
-          pip install uv coverage==7.6.1 bce-python-sdk==0.8.74 wrapt
+          export LD_LIBRARY_PATH=/usr/local/cuda-12.9/targets/x86_64-linux/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
+          pip install uv coverage==7.6.1 bce-python-sdk==0.8.74 wrapt pytest matplotlib parameterized
           wget -q --tries=5 --no-proxy --no-check-certificate https://paddle-github-action.cdn.bcebos.com/PR/paddlefleet/${PR_ID}/${COMMIT_ID}/paddldfleet.tar.gz
           tar -xf paddldfleet.tar.gz --strip-components=1
           git config --global --add safe.directory /paddle
           pip install dist/paddlefleet-0.0.0-cp310-cp310-linux_x86_64.whl --extra-index-url=https://www.paddlepaddle.org.cn/packages/stable/cu129/ --extra-index-url=https://www.paddlepaddle.org.cn/packages/nightly/cu129/
-          echo "paddlefleet commit:"
-          python -c "import paddlefleet; print(paddlefleet.version.commit)"
+          wget https://github.com/mikefarah/yq/releases/download/v4.44.3/yq_linux_amd64 -O /usr/local/bin/yq
+          chmod +x /usr/local/bin/yq
           '
 
       - name: Download paddle.tar.gz and install paddle whl
@@ -550,6 +551,7 @@ jobs:
         run: |
           docker exec -t ${{ env.container_name }} /bin/bash -c '
           set -e
+          export LD_LIBRARY_PATH=/usr/local/cuda-12.9/targets/x86_64-linux/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
           mkdir -p /PaddlePaddle
           cd /PaddlePaddle
           echo "Downloading Paddle.tar.gz from cfs"
@@ -559,12 +561,15 @@ jobs:
           export UV_HTTP_TIMEOUT=300
           pip uninstall paddlepaddle-gpu -y
           pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --force-reinstall --extra-index-url=https://www.paddlepaddle.org.cn/packages/nightly/cu129/
+          echo "paddlefleet commit:"
+          python -c "import paddlefleet; print(paddlefleet.version.commit)"
           '
 
       - name: Single card test
         run: |
           docker exec -t ${{ env.container_name }} /bin/bash -xce '
           pwd
+          export LD_LIBRARY_PATH=/usr/local/cuda-12.9/targets/x86_64-linux/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
           if [ "${BRANCH}" != "develop" ]; then
             git checkout $fleet_branch
             echo "Checked out fleet branch: $fleet_branch"
@@ -576,7 +581,7 @@ jobs:
           export UV_NO_SYNC=1 # This environment variable prevents uv sync from being executed when running un run.
           export UV_HTTP_TIMEOUT=300
           python -c "import paddle; print(paddle.version.commit)"
-          timeout 15m bash ci/single_card_test.sh
+          timeout 40m bash ci/single_card_test.sh
           single_card_exit_code=$?
           if [[ "$single_card_exit_code" != "0" ]]; then
             echo -e "::error:: \033[31mSingle card test failed.\033[0m"
@@ -603,7 +608,7 @@ jobs:
     env:
       PIP_CACHE_DIR: /root/.cache/pip
       TASK: paddle-fleet-CI-${{ github.event.pull_request.number }}-multi-card_test
-      docker_image: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:cuda129-coverage-test"
+      docker_image: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:ubuntu24-cuda129-dev"
     steps:
       - name: Check docker image and run container
         run: |
@@ -642,13 +647,14 @@ jobs:
           docker exec -t ${{ env.container_name }} /bin/bash -ce '
           rm -rf * .[^.]*
           source /root/proxy
-          pip install uv coverage==7.6.1 bce-python-sdk==0.8.74 wrapt
+          export LD_LIBRARY_PATH=/usr/local/cuda-12.9/targets/x86_64-linux/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
+          pip install uv coverage==7.6.1 bce-python-sdk==0.8.74 wrapt pytest matplotlib parameterized
           wget -q --tries=5 --no-proxy --no-check-certificate https://paddle-github-action.cdn.bcebos.com/PR/paddlefleet/${PR_ID}/${COMMIT_ID}/paddldfleet.tar.gz
           tar -xf paddldfleet.tar.gz --strip-components=1
           git config --global --add safe.directory /paddle
           pip install dist/paddlefleet-0.0.0-cp310-cp310-linux_x86_64.whl --extra-index-url=https://www.paddlepaddle.org.cn/packages/stable/cu129/ --extra-index-url=https://www.paddlepaddle.org.cn/packages/nightly/cu129/
-          echo "paddlefleet commit:"
-          python -c "import paddlefleet; print(paddlefleet.version.commit)"
+          wget https://github.com/mikefarah/yq/releases/download/v4.44.3/yq_linux_amd64 -O /usr/local/bin/yq
+          chmod +x /usr/local/bin/yq
           '
 
       - name: Download paddle.tar.gz and install paddle whl
@@ -656,6 +662,7 @@ jobs:
         run: |
           docker exec -t ${{ env.container_name }} /bin/bash -c '
           set -e
+          export LD_LIBRARY_PATH=/usr/local/cuda-12.9/targets/x86_64-linux/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
           mkdir -p /PaddlePaddle
           cd /PaddlePaddle
           echo "Downloading Paddle.tar.gz from cfs"
@@ -665,12 +672,15 @@ jobs:
           export UV_HTTP_TIMEOUT=300
           pip uninstall paddlepaddle-gpu -y
           pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --force-reinstall --extra-index-url=https://www.paddlepaddle.org.cn/packages/nightly/cu129/
+          echo "paddlefleet commit:"
+          python -c "import paddlefleet; print(paddlefleet.version.commit)"
           '
 
       - name: Multi-card test
         run: |
           docker exec -t ${{ env.container_name }} /bin/bash -ce '
           export PYTHONPATH=$(pwd)
+          export LD_LIBRARY_PATH=/usr/local/cuda-12.9/targets/x86_64-linux/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
           if [ "${BRANCH}" != "develop" ]; then
             git checkout $fleet_branch
             echo "Checked out fleet branch: $fleet_branch"