diff --git a/.github/workflows/_selected_tests.yaml b/.github/workflows/_selected_tests.yaml
index dc1c59d4dc2..5d2e1c0aa79 100644
--- a/.github/workflows/_selected_tests.yaml
+++ b/.github/workflows/_selected_tests.yaml
@@ -42,6 +42,17 @@ on:
         required: false
         default: false
         description: 'Continue running the job even if tests fail'
+      enable-coverage:
+        type: boolean
+        required: false
+        default: false
+        description: 'Whether to run tests with coverage enabled.'
+    secrets:
+      OBS_ACCESS_KEY_PRECISION:
+        required: false
+      OBS_SECRET_ACCESS_KEY_PRECISION:
+        required: false
+
 
 # Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
 # declared as "shell: bash -el {0}" on steps that need to be properly activated.
@@ -223,8 +234,9 @@ jobs:
         continue-on-error: ${{ inputs.continue_on_error }}
         env:
           VLLM_WORKER_MULTIPROC_METHOD: spawn
+          ENABLE_COVERAGE: ${{ inputs.enable-coverage }}
         run: |
           . /usr/local/Ascend/ascend-toolkit/set_env.sh
           TIMING_FLAG=""
           if [ "${{ inputs.upload_timing }}" = "true" ]; then
             TIMING_FLAG="--timing"
@@ -242,7 +254,8 @@ jobs:
         env:
           VLLM_WORKER_MULTIPROC_METHOD: spawn
           TORCH_DEVICE_BACKEND_AUTOLOAD: 0
+          ENABLE_COVERAGE: ${{ inputs.enable-coverage }}  # read by run_selected_tests.sh
         run: |
           .github/workflows/scripts/run_selected_tests.sh \
             "${{ matrix.group.npu_type }}" \
             "${{ matrix.group.num_npus }}" \
@@ -259,6 +272,17 @@ jobs:
           if-no-files-found: ignore
           retention-days: 7
 
+      - name: Upload coverage data
+        if: ${{ always() && inputs.enable-coverage }}
+        continue-on-error: true
+        uses: actions/upload-artifact@v7
+        with:
+          name: selected-test-coverage-vllm-${{ inputs.vllm }}-${{ matrix.group.npu_type }}-${{ matrix.group.num_npus }}card-${{ matrix.group.partition }}
+          path: tests/outputs/**/covdata/**
+          if-no-files-found: ignore
+          retention-days: 14
+          compression-level: 0
+
       - name: Upload selected test logs
         if: always()
         continue-on-error: true
@@ -269,3 +293,113 @@ jobs:
           if-no-files-found: ignore
           retention-days: 14
           compression-level: 0
+
+  # Gathers the per-group coverage artifacts, packs them into coverage.tar and uploads the archive to OBS.
+  upload-coverage-to-obs:
+    if: ${{ always() && inputs.enable-coverage }}
+    needs: selected-tests
+    runs-on: ubuntu-latest
+    continue-on-error: true
+    env:
+      OBS_ACCESS_KEY: ${{ secrets.OBS_ACCESS_KEY_PRECISION }}
+      OBS_SECRET_KEY: ${{ secrets.OBS_SECRET_ACCESS_KEY_PRECISION }}
+    steps:
+      - name: Checkout vllm-ascend source code
+        uses: actions/checkout@v6
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Download all coverage artifacts
+        uses: actions/download-artifact@v5
+        with:
+          path: all-coverage
+          pattern: selected-test-coverage-*
+          merge-multiple: true
+
+      - name: Debug - List downloaded coverage files
+        run: |
+          echo "=== all-coverage top level ==="
+          find all-coverage -maxdepth 1 2>/dev/null | head -20 || echo "all-coverage directory not exist"
+          echo ""
+          echo "=== all-coverage total file count ==="
+          find all-coverage -type f 2>/dev/null | wc -l
+          echo "=== sample covdata dirs ==="
+          find all-coverage -type d -name covdata 2>/dev/null | head -10
+          echo "=== sample files ==="
+          find all-coverage -path '*/covdata/*' -type f 2>/dev/null | head -10
+
+      - name: Assemble and compress coverage files
+        run: |
+          set -euo pipefail
+          TASK_DATE=$(date +%Y%m%d%H)
+          TASK_NAME="VLLM-ASCEND@task_${TASK_DATE}"
+          COVERAGE_PKG_DIR="coverage/vllm-ascend"
+          TASK_DIR="${COVERAGE_PKG_DIR}/${TASK_NAME}"
+          mkdir -p "${TASK_DIR}" "${COVERAGE_PKG_DIR}/covstub/vllm_ascend"
+
+          COVERAGE_FILE_COUNT=0
+          while IFS= read -r f; do
+            rel="${f#all-coverage/}"
+            case "${rel}" in
+              selected-test-coverage-*/*) rel="${rel#*/}" ;;
+            esac
+            dest="${TASK_DIR}/${rel}"
+            mkdir -p "$(dirname "${dest}")"
+            cp "${f}" "${dest}"
+            COVERAGE_FILE_COUNT=$((COVERAGE_FILE_COUNT + 1))
+          done < <(find all-coverage -path '*/covdata/*' -type f 2>/dev/null)
+
+          echo "Copied ${COVERAGE_FILE_COUNT} coverage files to ${TASK_DIR}"
+          if [ "${COVERAGE_FILE_COUNT}" -eq 0 ]; then
+            echo "::error::No coverage files found under all-coverage/*/covdata/"
+            exit 1
+          fi
+
+          cp -r vllm_ascend/. "${COVERAGE_PKG_DIR}/covstub/vllm_ascend/"
+          cp tests/coverage_settingInfo.xml "${COVERAGE_PKG_DIR}/settingInfo.xml"
+          echo "${TASK_NAME}" > coverage_version.txt
+
+          echo "=== package top level ==="
+          find "${COVERAGE_PKG_DIR}" -maxdepth 1 2>/dev/null | head -20 || true
+          echo "=== task dir file count ==="
+          find "${TASK_DIR}" -type f | wc -l
+
+          tar cf coverage.tar -C coverage vllm-ascend
+          echo "Packed coverage/vllm-ascend into coverage.tar"
+          du -h coverage.tar
+          echo "=== tar contents (sample) ==="
+          tar tf coverage.tar | head -20 || true
+          echo "=== tar task file count ==="
+          tar tf coverage.tar | grep -c "vllm-ascend/${TASK_NAME}/" || true
+
+          {
+            echo "TASK_DATE=${TASK_DATE}"
+            echo "TASK_NAME=${TASK_NAME}"
+          } >> "${GITHUB_ENV}"
+
+      - name: Upload coverage to OBS
+        run: |
+          pip install esdk-obs-python --quiet
+          python3 - <<'EOF'
+          import os
+          from obs import ObsClient
+
+          OBS_BUCKET = 'vllm-ascend'
+          OBS_PREFIX = 'ci/precision-test'
+          client = ObsClient(
+              access_key_id=os.environ['OBS_ACCESS_KEY'],
+              secret_access_key=os.environ['OBS_SECRET_KEY'],
+              server='https://obs.cn-north-4.myhuaweicloud.com'
+          )
+
+          uploads = [
+              ('coverage.tar', f'{OBS_PREFIX}/coverage.tar'),
+              ('coverage_version.txt', f'{OBS_PREFIX}/coverage_version.txt'),
+          ]
+          for local_path, obs_path in uploads:
+              resp = client.putFile(OBS_BUCKET, obs_path, local_path)
+              if resp.status < 300:
+                  print(f'Uploaded: {local_path} -> {obs_path}')
+              else:
+                  raise Exception(f'Failed to upload {local_path}: {resp.errorMessage}')
+          EOF
diff --git a/.github/workflows/pr_test.yaml b/.github/workflows/pr_test.yaml
index 9575af95c72..4b4b32523d9 100644
--- a/.github/workflows/pr_test.yaml
+++ b/.github/workflows/pr_test.yaml
@@ -28,6 +28,20 @@ on:
       - 'main'
       - '*-dev'
       - 'releases/v*'
+  schedule:
+    - cron: "0 18 * * *"
+  workflow_dispatch:
+    inputs:
+      vllm_ascend_ref:
+        description: "vllm-ascend ref (branch, tag, or SHA)"
+        required: false
+        default: "main"
+        type: string
+      vllm_version:
+        description: "vllm version (commit hash or tag)"
+        required: false
+        default: "v0.20.2"
+        type: string
 
 # Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
 # declared as "shell: bash -el {0}" on steps that need to be properly activated.
@@ -42,7 +56,7 @@ concurrency:
 
 jobs:
   lint-and-select-tests:
-    if: ${{ github.event.action != 'labeled' || github.event.label.name == 'ready' }}
+    if: ${{ github.event_name == 'pull_request' && (github.event.action != 'labeled' || github.event.label.name == 'ready') }}
     runs-on: linux-amd64-cpu-8-hk
     container:
       image: quay.io/ascend-ci/vllm-ascend:lint
@@ -213,3 +227,76 @@ jobs:
       vllm: ${{ matrix.vllm_version }}
       ref: ${{ github.event.pull_request.head.sha }}
       test_groups: ${{ needs.lint-and-select-tests.outputs.test_groups }}
+
+# ─── Schedule/Dispatch path: full tests ────────────────────────────────────────
+
+  select-full-tests:
+    if: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
+    runs-on: linux-amd64-cpu-8-hk
+    container:
+      image: quay.io/ascend-ci/vllm-ascend:lint
+    outputs:
+      has_tests: ${{ steps.full-scope.outputs.has_tests }}
+      test_groups: ${{ steps.full-scope.outputs.test_groups }}
+      matched_modules: ${{ steps.full-scope.outputs.matched_modules }}
+      vllm_ascend_ref: ${{ steps.resolve-refs.outputs.vllm_ascend_ref }}
+      vllm_version: ${{ steps.resolve-refs.outputs.vllm_version }}
+      main_commit: ${{ steps.resolve-refs.outputs.main_commit }}
+      release_tag: ${{ steps.resolve-refs.outputs.release_tag }}
+    steps:
+      - name: Checkout vllm-project/vllm-ascend repo
+        uses: actions/checkout@v6
+        with:
+          ref: ${{ inputs.vllm_ascend_ref || 'main' }}
+          fetch-depth: 0
+
+      - name: Resolve refs
+        id: resolve-refs
+        env:
+          # Inputs are passed via env (not interpolated into the script) to avoid shell injection; fallbacks mirror the dispatch defaults.
+          VLLM_ASCEND_REF: ${{ inputs.vllm_ascend_ref || 'main' }}
+          VLLM_VERSION: ${{ inputs.vllm_version || 'v0.20.2' }}
+        run: |
+          echo "vllm_ascend_ref=${VLLM_ASCEND_REF}" >> "$GITHUB_OUTPUT"
+          echo "vllm_version=${VLLM_VERSION}" >> "$GITHUB_OUTPUT"
+          main_commit="$(tr -d '[:space:]' < .github/vllm-main-verified.commit)"
+          release_tag="$(tr -d '[:space:]' < .github/vllm-release-tag.commit)"
+          [[ "${main_commit}" =~ ^[0-9a-f]{7,40}$ ]] || {
+            echo "::error file=.github/vllm-main-verified.commit::invalid vLLM main commit: ${main_commit}"
+            exit 1
+          }
+          [[ "${release_tag}" =~ ^v[0-9]+\.[0-9]+\.[0-9]+([.-].*)?$ ]] || {
+            echo "::error file=.github/vllm-release-tag.commit::invalid vLLM release tag: ${release_tag}"
+            exit 1
+          }
+          {
+            echo "main_commit=${main_commit}"
+            echo "release_tag=${release_tag}"
+          } >> "$GITHUB_OUTPUT"
+
+      - name: Select all tests
+        id: full-scope
+        run: |
+          pip install regex pyyaml
+          git config --global --add safe.directory /__w/vllm-ascend/vllm-ascend
+          python3 .github/workflows/scripts/select_tests.py \
+            --changed-files vllm_ascend/dummy.py \
+            --run-all-modules
+
+  run-full-tests:
+    needs: select-full-tests
+    if: ${{ needs.select-full-tests.outputs.has_tests == 'true' }}
+    strategy:
+      fail-fast: false
+      matrix:
+        vllm_version:
+          - ${{ needs.select-full-tests.outputs.release_tag }}
+    uses: ./.github/workflows/_selected_tests.yaml
+    with:
+      vllm: ${{ matrix.vllm_version }}
+      ref: ${{ needs.select-full-tests.outputs.vllm_ascend_ref }}
+      test_groups: ${{ needs.select-full-tests.outputs.test_groups }}
+      enable-coverage: true
+    secrets:
+      OBS_ACCESS_KEY_PRECISION: ${{ secrets.OBS_ACCESS_KEY_PRECISION }}
+      OBS_SECRET_ACCESS_KEY_PRECISION: ${{ secrets.OBS_SECRET_ACCESS_KEY_PRECISION }}
diff --git a/.github/workflows/scripts/run_selected_tests.sh b/.github/workflows/scripts/run_selected_tests.sh
index 55c30f1bd6c..8ae3a73ac7f 100755
--- a/.github/workflows/scripts/run_selected_tests.sh
+++ b/.github/workflows/scripts/run_selected_tests.sh
@@ -1,8 +1,25 @@
 #!/usr/bin/env bash
 set -euo pipefail
 
+enable_coverage=false
+if [ "${ENABLE_COVERAGE:-}" = "true" ]; then
+  enable_coverage=true
+fi
+
+while [ "$#" -gt 0 ]; do
+  case "$1" in
+    --enable-coverage)
+      enable_coverage=true
+      shift
+      ;;
+    *)
+      break
+      ;;
+  esac
+done
+
 if [ "$#" -lt 4 ]; then
-  echo "Usage: $0 [--timing] [test ...]"
+  echo "Usage: $0 [--enable-coverage] [--timing] [test ...]"
   exit 1
 fi
 
@@ -28,16 +45,31 @@ test_results=()
 failed_logs=()
 timing_entries=()
 test_index=0
+overall_status=0
 
 pytest_log_dir="${RUNNER_TEMP:-/tmp}/selected-tests-${npu_type}-${num_npus}card"
+project_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd)"
 mkdir -p "${pytest_log_dir}"
 
+# Export COVERAGE_FILE so the given target writes its data under tests/outputs/<sanitized target>/covdata.
+setup_coverage() {
+  local target="$1"
+  local test_basename="${target%.py}"
+  test_basename="${test_basename//\//__}"
+  test_basename="${test_basename//::/--}"
+  local covdata_dir="${project_root}/tests/outputs/${test_basename}/covdata"
+  mkdir -p "${covdata_dir}"
+  export COVERAGE_FILE="${covdata_dir}/coverage"
+  echo -e " \033[33mCOVERAGE_FILE:\033[0m ${COVERAGE_FILE}"
+}
+
 print_test_info() {
   echo -e "\033[1;34m=== TEST INFO ===\033[0m"
   echo -e " \033[33mDevice:\033[0m ${npu_type}"
   if [ "${npu_type}" != "cpu" ]; then
     echo -e " \033[33mNPU count:\033[0m ${num_npus}"
   fi
+  echo -e " \033[33mCoverage:\033[0m ${enable_coverage}"
   echo -e " \033[33mTargets:\033[0m"
   for target in "${targets[@]}"; do
     echo -e " \033[32m-\033[0m ${target}"
@@ -77,8 +109,14 @@ run_pytest_target() {
   if [ "${record_timing}" = true ]; then
     start_time=$(date +%s%N)
   fi
-  set +e
-  pytest -sv --color=yes "${target}" 2>&1 | tee "${log_file}"
+  # Build the runner once so a single pipeline feeds PIPESTATUS below.
+  local -a pytest_cmd=(pytest)
+  if [ "${enable_coverage}" = "true" ]; then
+    setup_coverage "${target}"
+    pytest_cmd=(python -m coverage run --rcfile="${project_root}/tests/coveragerc" -m pytest)
+  fi
+  set +e
+  "${pytest_cmd[@]}" -sv --color=yes "${target}" 2>&1 | tee "${log_file}"
   local status=${PIPESTATUS[0]}
   set -e
   if [ "${record_timing}" = true ]; then
@@ -112,8 +150,14 @@ run_pytest_batch() {
   if [ "${record_timing}" = true ]; then
     start_time=$(date +%s%N)
   fi
-  set +e
-  pytest -sv --color=yes "${batch_targets[@]}" 2>&1 | tee "${log_file}"
+  # Build the runner once so a single pipeline feeds PIPESTATUS below.
+  local -a pytest_cmd=(pytest)
+  if [ "${enable_coverage}" = "true" ]; then
+    setup_coverage "cpu-ut"
+    pytest_cmd=(python -m coverage run --rcfile="${project_root}/tests/coveragerc" -m pytest)
+  fi
+  set +e
+  "${pytest_cmd[@]}" -sv --color=yes "${batch_targets[@]}" 2>&1 | tee "${log_file}"
   local status=${PIPESTATUS[0]}
   set -e
   if [ "${record_timing}" = true ]; then
@@ -178,3 +222,4 @@ fi
 
 print_timing_json
 print_summary
+exit "${overall_status}"
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 972b006abd4..ea5b2a10007 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -24,3 +24,4 @@ mindstudio-probe>=8.3.0
 xlite==0.1.0rc11.dev210
 uc-manager
 ninja
+coverage
diff --git a/tests/coverage_settingInfo.xml b/tests/coverage_settingInfo.xml
new file mode 100644
index 00000000000..f220b0f1733
--- /dev/null
+++ b/tests/coverage_settingInfo.xml
@@ -0,0 +1,6 @@
+
+
+ 3
+ 1
+ admin
+
diff --git a/tests/coveragerc b/tests/coveragerc
new file mode 100644
index 00000000000..2d3848005a0
--- /dev/null
+++ b/tests/coveragerc
@@ -0,0 +1,39 @@
+[run]
+branch = True
+# Record absolute file paths in the coverage data so later analysis can match paths directly.
+relative_files = False
+# Base name of the generated data file. run_selected_tests.sh overrides it through the COVERAGE_FILE environment variable.
+data_file = .coverage
+# Append identifiers such as the process ID to the data file name.
+parallel = True
+# Concurrency libraries used by the project (default: thread). Code using multiprocessing, gevent, greenlet or eventlet must list them explicitly; e.g. celery in single-process mode uses multiprocessing, so multiprocessing has to be appended.
+concurrency = thread,multiprocessing
+# Register a handler for signal 15 so coverage data is dumped before "kill pid" ends the process. Keep it off when the application installs its own signal-15 handler. Off by default.
+sigterm = False
+# Debug settings that help track down problems. Enable as needed.
+# debug=pid,dataio,dataop
+# debug_file=/opt/coveragepy_debug.out
+# Suppress these coverage.py warnings.
+disable_warnings=no-data-collected,module-not-python
+# Source files for which coverage is collected. Supports the * wildcard and multiple comma-separated values. Adjust as needed.
+include =
+    */vllm_ascend/*
+    # Anything under a top-level /vllm_ascend directory.
+    /vllm_ascend/*
+# Files excluded from collection; applied after include. Same syntax as include.
+omit = */.local/*
+    /usr/*
+    utils/tirefire.py
+# Agent settings for collecting coverage from long-running processes. Options added in the Zhulong 7.5.4a0.dev2 release (March 2025).
+# Whether to enable the coverage-data collection service on the executor. Default: False.
+#agent_enable=False
+# Port used by the collection service. Default: 8088; set to 0 to use a random port.
+#agent_port=8088
+# Whether to print collection log messages. Default: False.
+#agent_debug=False
+# Absolute path of the collection service log file when log printing is enabled.
+#agent_debug_file=/path/to/coverage_agent.log
+# Timeout for dumping data. Default: 3 seconds.
+#agent_dump_timeout=3
+# Whether data is dumped through the API. Default: True. True: call the dump method; False: use kill -34.
+#agent_dump_in_api=True