daily_ete_test #3
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# End-to-end regression workflow. It runs on a weekly cron schedule and can
# also be started manually (workflow_dispatch) with the inputs below.
name: daily_ete_test

on:
  workflow_dispatch:
    inputs:
      repo_org:
        required: false
        description: 'Tested repository organization name. Default is open-compass/opencompass'
        type: string
        default: 'open-compass/opencompass'
      repo_ref:
        required: false
        description: 'Set branch or tag or commit id. Default is "main"'
        type: string
        default: 'main'
      regression_type:
        required: true
        description: 'regression types'
        type: string
        # This value is parsed with fromJSON() in the job-level `if:`
        # conditions, so the default is written as strict JSON (double-quoted
        # strings) instead of relying on a lenient parser accepting single
        # quotes.
        default: '["model", "eval", "cmd", "cluster"]'
      baseline_result:
        required: true
        description: 'baseline result'
        type: string
        default: '0.5.0-baseline'
  schedule:
    # Minute 15, hour 14, every Tuesday (GitHub evaluates cron in UTC).
    - cron: '15 14 * * 2'
# Workflow-level environment shared by every job and step.
env:
  # Offline / hub-selection switches (values are passed to steps as strings).
  HF_DATASETS_OFFLINE: '1'
  HF_EVALUATE_OFFLINE: '1'
  TRANSFORMERS_OFFLINE: '1'
  HF_HUB_OFFLINE: '1'
  HF_DATASETS_DISABLE_LOCKFILES: '1'
  VLLM_USE_MODELSCOPE: 'false'
  LMDEPLOY_USE_MODELSCOPE: 'false'
  HF_ENDPOINT: https://hf-mirror.com
  VLLM_WORKER_MULTIPROC_METHOD: spawn

  # Locations on the shared storage mounted on the self-hosted runners.
  CONDA_PATH: /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/miniconda3
  REPORT_ROOT: /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/eval_report/ete_regression
  COMPASS_DATA_CACHE: /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/compass_data_cache
  HF_DATASETS_CACHE: /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/hf_cache
  HF_HUB_CACHE: /mnt/shared-storage-gpfs2/gpfs2-shared-public/huggingface/hub
  PIP_CACHE_DIR: /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/pip_cache_new

  # Conda environment that prepare_env recreates and the test jobs activate.
  CONDA_ENV: ete_regression

  # Cluster settings.
  KUBEBRAIN_CLUSTER_ENTRY: https://h.pjlab.org.cn
  KUBEBRAIN_NAMESPACE: ailab-opencompass

  # Per-run identifiers. Steps that submit an rjob assign their own JOB_NAME
  # in the shell, which shadows the value defined here.
  OUTPUT_FOLDER: cuda12.1_dist_${{ github.run_id }}
  JOB_NAME: daily-test-${{ github.run_id }}-${{ github.run_attempt }}

  # Baseline directory under REPORT_ROOT; falls back to the literal default
  # when no workflow_dispatch input is present (e.g. scheduled runs).
  BASELINE_DIR: ${{ github.event.inputs.baseline_result || '0.5.0-baseline' }}

  # Model served by the api test steps.
  TEST_MODEL: Qwen/Qwen3-8B
| jobs: | |
# Job (belongs under the top-level `jobs:` key).
# Builds the sdist and wheel of the checked-out repository on a GitHub-hosted
# runner and uploads dist/* as a one-day artifact; prepare_env downloads that
# artifact and installs the opencompass wheel from it.
build-pypi:
  runs-on: ubuntu-latest
  env:
    # Proxy variables are explicitly blanked for this job.
    http_proxy: ''
    https_proxy: ''
  steps:
    - uses: actions/checkout@v5
      with:
        repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}
        ref: ${{ github.event.inputs.repo_ref || 'main' }}
    - name: Set up Python 3.10
      uses: actions/setup-python@v5
      with:
        # Quoted so YAML does not read the version as the float 3.1.
        python-version: '3.10'
    # The built package is opencompass (the step was previously mislabelled
    # "Build lagent").
    - name: Build opencompass
      run: |
        pip install wheel setuptools
        python setup.py sdist bdist_wheel
    - name: Upload Artifacts
      uses: actions/upload-artifact@v4
      with:
        # Fail the job instead of uploading an empty artifact.
        if-no-files-found: error
        path: dist/*
        retention-days: 1
        name: my-artifact-${{ github.run_id }}
# Job (belongs under the top-level `jobs:` key).
# Recreates the shared conda environment on the self-hosted runner and
# installs the wheel built by build-pypi plus the pinned inference stack.
prepare_env:
  if: ${{ !cancelled() }}
  needs: ['build-pypi']
  runs-on: yidian_cu12_ete
  timeout-minutes: 180  # 3 hours
  steps:
    - name: Clean workdir
      run: sudo git clean -ffdx
    - name: Clone repository
      uses: actions/checkout@v5
      with:
        repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}
        ref: ${{ github.event.inputs.repo_ref || 'main' }}
    - name: Download Artifacts
      uses: actions/download-artifact@v4
      with:
        name: my-artifact-${{ github.run_id }}
    - name: Remove Conda Env
      run: |
        . ${{ env.CONDA_PATH }}/bin/activate
        conda env remove -y --name ${{ env.CONDA_ENV }}
        conda info --envs
    - name: Prepare - create conda env and install torch - cu12
      uses: nick-fields/retry@v3
      with:
        max_attempts: 3
        timeout_minutes: 120
        command: |
          . ${{ env.CONDA_PATH }}/bin/activate
          conda create -y --name ${{ env.CONDA_ENV }} python=3.10
          conda activate ${{ env.CONDA_ENV }}
          export PIP_CACHE_DIR=${{ env.PIP_CACHE_DIR }}
          pip install -r /mnt/shared-storage-user/qa-llm-cicd/oc_test_resource/requirements.txt
          # Wheel downloaded from the build-pypi artifact.
          pip install opencompass*.whl
          # Extras are quoted so the shell never treats [...] as a glob.
          pip install 'opencompass[lmdeploy]'
          pip install 'opencompass[vllm]'
          pip install 'opencompass[full]'
          pip install 'opencompass[api]'
          pip install xformers
          # Version pins are installed after the extras.
          pip install torch==2.8.0 torchvision==0.23.0 torchaudio==2.8.0
          pip install transformers==4.56.0 vllm==0.11.0 lmdeploy==0.11.0
          pip install fire pyyaml pytest
          FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/packages/flash_attn-2.8.3+cu12torch2.8cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
          # Destination uses CONDA_PATH (same location as before) instead of
          # repeating the hard-coded miniconda path.
          cp -r /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/packages/nltk_data ${{ env.CONDA_PATH }}/envs/${{ env.CONDA_ENV }}/nltk_data
    - name: conda env
      run: |
        . ${{ env.CONDA_PATH }}/bin/activate
        conda activate ${{ env.CONDA_ENV }}
        conda info --envs
        pip list
# Job (belongs under the top-level `jobs:` key).
# For every model_func matrix entry: submits the inference run as a remote
# rjob, polls it until it reaches a terminal state, then compares the
# predictions against the baseline directory.
# NOTE(review): `--private-machine=group` is passed twice to `rjob submit`;
# kept as in the original - confirm whether the duplicate is needed.
daily_model_test:
  if: ${{!cancelled() && contains(needs.prepare_env.result, 'success') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_type), 'model'))}}
  needs: prepare_env
  strategy:
    fail-fast: false
    matrix:
      model_func:
        - api
        - api_rollout
        - lmdeploy_base
        - lmdeploy_chat
        - vllm_chat
        - vllm_base
        - transformers_base
        - transformers_chat
  runs-on: yidian_cu12_ete
  timeout-minutes: 240  # 4 hours
  steps:
    - name: Clean workdir
      run: sudo git clean -ffdx
    - name: Clone repository
      uses: actions/checkout@v5
      with:
        repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}
        ref: ${{ github.event.inputs.repo_ref || 'main' }}
    - name: conda env
      run: |
        . ${{env.CONDA_PATH}}/bin/activate
        conda activate ${{env.CONDA_ENV}}
        conda info --envs
        pip list
    # Substring match: selects both 'api' and 'api_rollout'.
    - name: Run test - api
      if: contains(matrix.model_func, 'api')
      run: |
        . ${{env.CONDA_PATH}}/bin/activate
        conda activate ${{env.CONDA_ENV}}
        conda info --envs
        echo ${{github.workspace}}
        JOB_NAME=model-${{ github.run_id }}-${{ matrix.model_func }}-${{ github.run_attempt }}
        # Underscores are replaced with dashes in the job name.
        JOB_NAME=${JOB_NAME//_/-}
        rjob submit \
          --metadata-name=$JOB_NAME \
          --charged-group=opencompass_gpu \
          --private-machine=group \
          --group=opencompass_gpu \
          --gpu=1 \
          --cpu=32 \
          --memory=32568 \
          --private-machine=group \
          --image=registry.h.pjlab.org.cn/ailab-puyu-puyu_gpu/lmdeploy:v0.12.0-cu12.8 \
          --env=COMPASS_DATA_CACHE=/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/compass_data_cache \
          --env=TIKTOKEN_CACHE_DIR=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/share_tiktoken \
          --env=HF_ENDPOINT=https://hf-mirror.com \
          --env=HF_DATASETS_CACHE=/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/hf_cache \
          --env=HF_HUB_CACHE=/mnt/shared-storage-gpfs2/gpfs2-shared-public/huggingface/hub \
          --env=CUDA_MODULE_LOADING=EAGER \
          --env=HF_DATASETS_OFFLINE=1 \
          --env=TRANSFORMERS_OFFLINE=1 \
          --env=HF_EVALUATE_OFFLINE=1 \
          --env=HF_HUB_OFFLINE=1 \
          --env=VLLM_USE_MODELSCOPE=false \
          --env=VLLM_WORKER_MULTIPROC_METHOD=spawn \
          --mount=gpfs://gpfs1/qa-llm-cicd:/mnt/shared-storage-user/qa-llm-cicd \
          --mount=gpfs://gpfs1/opencompass-shared:/mnt/shared-storage-user/opencompass-shared \
          --mount=gpfs://gpfs1/auto-eval-pipeline:/mnt/shared-storage-user/auto-eval-pipeline \
          --mount=gpfs://gpfs2/gpfs2-shared-public:/mnt/shared-storage-gpfs2/gpfs2-shared-public \
          --host-network=True \
          -- bash -exc 'source ${{env.CONDA_PATH}}/bin/activate; conda activate ${{env.CONDA_ENV}}; conda env list; cd ${{github.workspace}}; lmdeploy serve api_server ${{env.TEST_MODEL}} --session-len 146000 --max-batch-size 1 & python autotest/utils/health_check.py; opencompass autotest/model/infer_${{matrix.model_func}}.py -m infer --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.model_func}} --reuse --dump-res-length'
        # Poll the job status: 600 attempts with a 10 s sleep between them.
        for i in {1..600}; do
          current_status=$(rjob get $JOB_NAME | grep -oP 'rjob [^:]+: \K[^ ]+')
          if [[ $current_status == "Succeeded" ]]; then
            echo "Task succeeded"
            exit 0
          elif [[ $current_status == "Failed" || $current_status == "Stopped" ]]; then
            echo "Task failed or stopped, fetching logs"
            rjob logs job $JOB_NAME
            exit 1
          fi
          sleep 10
        done
        # Reaching this point means the job never hit a terminal state; fail
        # explicitly instead of letting the step end with exit status 0.
        echo "Timed out waiting for rjob $JOB_NAME to finish"
        exit 1
    - name: Run test - other
      if: matrix.model_func != 'api' && matrix.model_func != 'api_rollout'
      run: |
        . ${{env.CONDA_PATH}}/bin/activate
        conda activate ${{env.CONDA_ENV}}
        conda info --envs
        echo ${{github.workspace}}
        JOB_NAME=model-${{ github.run_id }}-${{ matrix.model_func }}-${{ github.run_attempt }}
        JOB_NAME=${JOB_NAME//_/-}
        rjob submit \
          --metadata-name=$JOB_NAME \
          --charged-group=opencompass_gpu \
          --private-machine=group \
          --group=opencompass_gpu \
          --gpu=2 \
          --cpu=32 \
          --memory=32568 \
          --private-machine=group \
          --image=registry.h.pjlab.org.cn/ailab-puyu-puyu_gpu/lmdeploy:v0.12.0-cu12.8 \
          --env=COMPASS_DATA_CACHE=/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/compass_data_cache \
          --env=TIKTOKEN_CACHE_DIR=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/share_tiktoken \
          --env=HF_ENDPOINT=https://hf-mirror.com \
          --env=HF_DATASETS_CACHE=/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/hf_cache \
          --env=HF_HUB_CACHE=/mnt/shared-storage-gpfs2/gpfs2-shared-public/huggingface/hub \
          --env=CUDA_MODULE_LOADING=EAGER \
          --env=HF_DATASETS_OFFLINE=1 \
          --env=TRANSFORMERS_OFFLINE=1 \
          --env=HF_EVALUATE_OFFLINE=1 \
          --env=HF_HUB_OFFLINE=1 \
          --env=VLLM_USE_MODELSCOPE=false \
          --env=VLLM_WORKER_MULTIPROC_METHOD=spawn \
          --mount=gpfs://gpfs1/qa-llm-cicd:/mnt/shared-storage-user/qa-llm-cicd \
          --mount=gpfs://gpfs1/opencompass-shared:/mnt/shared-storage-user/opencompass-shared \
          --mount=gpfs://gpfs1/auto-eval-pipeline:/mnt/shared-storage-user/auto-eval-pipeline \
          --mount=gpfs://gpfs2/gpfs2-shared-public:/mnt/shared-storage-gpfs2/gpfs2-shared-public \
          --host-network=True \
          -- bash -exc 'source ${{env.CONDA_PATH}}/bin/activate; conda activate ${{env.CONDA_ENV}}; conda env list; cd ${{github.workspace}}; opencompass autotest/model/infer_${{matrix.model_func}}.py -m infer --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.model_func}} --max-num-workers 2 --reuse --dump-res-length'
        # Poll the job status: 600 attempts with a 10 s sleep between them.
        for i in {1..600}; do
          current_status=$(rjob get $JOB_NAME | grep -oP 'rjob [^:]+: \K[^ ]+')
          if [[ $current_status == "Succeeded" ]]; then
            echo "Task succeeded"
            exit 0
          elif [[ $current_status == "Failed" || $current_status == "Stopped" ]]; then
            echo "Task failed or stopped, fetching logs"
            rjob logs job $JOB_NAME
            exit 1
          fi
          sleep 10
        done
        # Polling exhausted without a terminal state: report failure rather
        # than finishing the step successfully.
        echo "Timed out waiting for rjob $JOB_NAME to finish"
        exit 1
    - name: Assert result
      run: |
        . ${{env.CONDA_PATH}}/bin/activate
        conda activate ${{env.CONDA_ENV}}
        conda info --envs
        python autotest/utils/compare_results.py compare_results ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.model_func}} ${{env.REPORT_ROOT}}/${{env.BASELINE_DIR}}/${{matrix.model_func}} predictions
    # Runs regardless of earlier step outcomes so the remote job is stopped.
    - name: stop job
      if: always()
      run: |
        JOB_NAME=model-${{ github.run_id }}-${{ matrix.model_func }}-${{ github.run_attempt }}
        JOB_NAME=${JOB_NAME//_/-}
        rjob stop job $JOB_NAME
# Job (belongs under the top-level `jobs:` key).
# For every eval_func matrix entry: copies the baseline predictions into this
# run's report directory, runs `opencompass -m eval` on them as a remote rjob
# (exactly one of the four "Run test" steps matches each entry), then compares
# the results against the baseline.
# NOTE(review): `--private-machine=group` is passed twice to `rjob submit`;
# kept as in the original - confirm whether the duplicate is needed.
daily_eval_test:
  if: ${{!cancelled() && contains(needs.prepare_env.result, 'success') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_type), 'eval'))}}
  needs: prepare_env
  strategy:
    fail-fast: false
    matrix:
      eval_func:
        - chat_obj_fullbench_v5
        - chat_obj_fullbench_v6
        - chat_obj_fullbench_v7
        - chat_obj_fullbench_v8
        - chat_obj_v8
        - chat_obj_fullbench_other
        - chat_sub_fullbench
        - base_fullbench
        - base_longtext_fullbench
        - chat_longtext_fullbench
  runs-on: yidian_cu12_daily
  timeout-minutes: 240  # 4 hours
  steps:
    - name: Clean workdir
      run: sudo git clean -ffdx
    - name: Clone repository
      uses: actions/checkout@v5
      with:
        repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}
        ref: ${{ github.event.inputs.repo_ref || 'main' }}
    - name: conda env
      run: |
        . ${{env.CONDA_PATH}}/bin/activate
        conda activate ${{env.CONDA_ENV}}
        conda info --envs
        pip list
    # Submitted with --gpu=0.
    - name: Run test - without judger
      if: matrix.eval_func == 'chat_obj_fullbench_v5' || matrix.eval_func == 'base_fullbench' || matrix.eval_func == 'base_longtext_fullbench' || matrix.eval_func == 'chat_longtext_fullbench'
      run: |
        . ${{env.CONDA_PATH}}/bin/activate
        conda activate ${{env.CONDA_ENV}}
        mkdir ${{env.REPORT_ROOT}}/${{ github.run_id }} -p
        cp -r ${{env.REPORT_ROOT}}/${{ env.BASELINE_DIR }}/predictions/eval_${{matrix.eval_func}} ${{env.REPORT_ROOT}}/${{ github.run_id }}
        echo ${{github.workspace}}
        JOB_NAME=eval-${{ github.run_id }}-${{ matrix.eval_func }}-${{ github.run_attempt }}
        # Underscores are replaced with dashes in the job name.
        JOB_NAME=${JOB_NAME//_/-}
        rjob submit \
          --metadata-name=$JOB_NAME \
          --charged-group=opencompass_gpu \
          --private-machine=group \
          --group=opencompass_gpu \
          --gpu=0 \
          --cpu=32 \
          --memory=32568 \
          --private-machine=group \
          --image=registry.h.pjlab.org.cn/ailab-puyu-puyu_gpu/lmdeploy:v0.12.0-cu12.8 \
          --env=COMPASS_DATA_CACHE=/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/compass_data_cache \
          --env=TIKTOKEN_CACHE_DIR=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/share_tiktoken \
          --env=HF_ENDPOINT=https://hf-mirror.com \
          --env=HF_DATASETS_CACHE=/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/hf_cache \
          --env=HF_HUB_CACHE=/mnt/shared-storage-gpfs2/gpfs2-shared-public/huggingface/hub \
          --env=CUDA_MODULE_LOADING=EAGER \
          --env=HF_DATASETS_OFFLINE=1 \
          --env=TRANSFORMERS_OFFLINE=1 \
          --env=HF_EVALUATE_OFFLINE=1 \
          --env=HF_HUB_OFFLINE=1 \
          --env=VLLM_USE_MODELSCOPE=false \
          --env=VLLM_WORKER_MULTIPROC_METHOD=spawn \
          --mount=gpfs://gpfs1/qa-llm-cicd:/mnt/shared-storage-user/qa-llm-cicd \
          --mount=gpfs://gpfs1/opencompass-shared:/mnt/shared-storage-user/opencompass-shared \
          --mount=gpfs://gpfs1/auto-eval-pipeline:/mnt/shared-storage-user/auto-eval-pipeline \
          --mount=gpfs://gpfs2/gpfs2-shared-public:/mnt/shared-storage-gpfs2/gpfs2-shared-public \
          --host-network=True \
          -- bash -exc 'source ${{env.CONDA_PATH}}/bin/activate; conda activate ${{env.CONDA_ENV}}; conda env list; cd ${{github.workspace}}; opencompass autotest/eval/eval_${{matrix.eval_func}}.py -m eval --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/eval_${{matrix.eval_func}} --reuse --dump-res-length'
        # Poll the job status: 600 attempts with a 10 s sleep between them.
        for i in {1..600}; do
          current_status=$(rjob get $JOB_NAME | grep -oP 'rjob [^:]+: \K[^ ]+')
          if [[ $current_status == "Succeeded" ]]; then
            echo "Task succeeded"
            exit 0
          elif [[ $current_status == "Failed" || $current_status == "Stopped" ]]; then
            echo "Task failed or stopped, fetching logs"
            rjob logs job $JOB_NAME
            exit 1
          fi
          sleep 10
        done
        # Reaching this point means the job never hit a terminal state; fail
        # explicitly instead of letting the step end with exit status 0.
        echo "Timed out waiting for rjob $JOB_NAME to finish"
        exit 1
    # Uses the compass_data_cache_sub data cache instead of the default one.
    - name: Run test - other subdatasets
      if: matrix.eval_func == 'chat_sub_fullbench'
      run: |
        . ${{env.CONDA_PATH}}/bin/activate
        conda activate ${{env.CONDA_ENV}}
        mkdir ${{env.REPORT_ROOT}}/${{ github.run_id }} -p
        cp -r ${{env.REPORT_ROOT}}/${{ env.BASELINE_DIR }}/predictions/eval_${{matrix.eval_func}} ${{env.REPORT_ROOT}}/${{ github.run_id }}
        echo ${{github.workspace}}
        JOB_NAME=eval-${{ github.run_id }}-${{ matrix.eval_func }}-${{ github.run_attempt }}
        JOB_NAME=${JOB_NAME//_/-}
        rjob submit \
          --metadata-name=$JOB_NAME \
          --charged-group=opencompass_gpu \
          --private-machine=group \
          --group=opencompass_gpu \
          --gpu=1 \
          --cpu=32 \
          --memory=32568 \
          --private-machine=group \
          --image=registry.h.pjlab.org.cn/ailab-puyu-puyu_gpu/lmdeploy:v0.12.0-cu12.8 \
          --env=COMPASS_DATA_CACHE=/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/compass_data_cache_sub \
          --env=TIKTOKEN_CACHE_DIR=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/share_tiktoken \
          --env=HF_ENDPOINT=https://hf-mirror.com \
          --env=HF_DATASETS_CACHE=/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/hf_cache \
          --env=HF_HUB_CACHE=/mnt/shared-storage-gpfs2/gpfs2-shared-public/huggingface/hub \
          --env=CUDA_MODULE_LOADING=EAGER \
          --env=HF_DATASETS_OFFLINE=1 \
          --env=TRANSFORMERS_OFFLINE=1 \
          --env=HF_EVALUATE_OFFLINE=1 \
          --env=HF_HUB_OFFLINE=1 \
          --env=VLLM_USE_MODELSCOPE=false \
          --env=VLLM_WORKER_MULTIPROC_METHOD=spawn \
          --mount=gpfs://gpfs1/qa-llm-cicd:/mnt/shared-storage-user/qa-llm-cicd \
          --mount=gpfs://gpfs1/opencompass-shared:/mnt/shared-storage-user/opencompass-shared \
          --mount=gpfs://gpfs1/auto-eval-pipeline:/mnt/shared-storage-user/auto-eval-pipeline \
          --mount=gpfs://gpfs2/gpfs2-shared-public:/mnt/shared-storage-gpfs2/gpfs2-shared-public \
          --host-network=True \
          -- bash -exc 'source ${{env.CONDA_PATH}}/bin/activate; conda activate ${{env.CONDA_ENV}}; conda env list; cd ${{github.workspace}}; opencompass autotest/eval/eval_${{matrix.eval_func}}.py -m eval --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/eval_${{matrix.eval_func}} --reuse --dump-res-length'
        # Poll the job status: 600 attempts with a 10 s sleep between them.
        for i in {1..600}; do
          current_status=$(rjob get $JOB_NAME | grep -oP 'rjob [^:]+: \K[^ ]+')
          if [[ $current_status == "Succeeded" ]]; then
            echo "Task succeeded"
            exit 0
          elif [[ $current_status == "Failed" || $current_status == "Stopped" ]]; then
            echo "Task failed or stopped, fetching logs"
            rjob logs job $JOB_NAME
            exit 1
          fi
          sleep 10
        done
        # Polling exhausted without a terminal state: report failure.
        echo "Timed out waiting for rjob $JOB_NAME to finish"
        exit 1
    # Matches every entry not handled by one of the other three run steps.
    - name: Run test - with judger
      if: matrix.eval_func != 'chat_obj_fullbench_v5' && matrix.eval_func != 'base_fullbench' && matrix.eval_func != 'base_longtext_fullbench' && matrix.eval_func != 'chat_longtext_fullbench' && matrix.eval_func != 'chat_sub_fullbench' && matrix.eval_func != 'chat_obj_fullbench_other'
      run: |
        . ${{env.CONDA_PATH}}/bin/activate
        conda activate ${{env.CONDA_ENV}}
        mkdir ${{env.REPORT_ROOT}}/${{ github.run_id }} -p
        cp -r ${{env.REPORT_ROOT}}/${{ env.BASELINE_DIR }}/predictions/eval_${{matrix.eval_func}} ${{env.REPORT_ROOT}}/${{ github.run_id }}
        echo ${{github.workspace}}
        JOB_NAME=eval-${{ github.run_id }}-${{ matrix.eval_func }}-${{ github.run_attempt }}
        JOB_NAME=${JOB_NAME//_/-}
        rjob submit \
          --metadata-name=$JOB_NAME \
          --charged-group=opencompass_gpu \
          --private-machine=group \
          --group=opencompass_gpu \
          --gpu=1 \
          --cpu=32 \
          --memory=32568 \
          --private-machine=group \
          --image=registry.h.pjlab.org.cn/ailab-puyu-puyu_gpu/lmdeploy:v0.12.0-cu12.8 \
          --env=COMPASS_DATA_CACHE=/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/compass_data_cache \
          --env=TIKTOKEN_CACHE_DIR=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/share_tiktoken \
          --env=HF_ENDPOINT=https://hf-mirror.com \
          --env=HF_DATASETS_CACHE=/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/hf_cache \
          --env=HF_HUB_CACHE=/mnt/shared-storage-gpfs2/gpfs2-shared-public/huggingface/hub \
          --env=CUDA_MODULE_LOADING=EAGER \
          --env=HF_DATASETS_OFFLINE=1 \
          --env=TRANSFORMERS_OFFLINE=1 \
          --env=HF_EVALUATE_OFFLINE=1 \
          --env=HF_HUB_OFFLINE=1 \
          --env=VLLM_USE_MODELSCOPE=false \
          --env=VLLM_WORKER_MULTIPROC_METHOD=spawn \
          --mount=gpfs://gpfs1/qa-llm-cicd:/mnt/shared-storage-user/qa-llm-cicd \
          --mount=gpfs://gpfs1/opencompass-shared:/mnt/shared-storage-user/opencompass-shared \
          --mount=gpfs://gpfs1/auto-eval-pipeline:/mnt/shared-storage-user/auto-eval-pipeline \
          --mount=gpfs://gpfs2/gpfs2-shared-public:/mnt/shared-storage-gpfs2/gpfs2-shared-public \
          --host-network=True \
          -- bash -exc 'source ${{env.CONDA_PATH}}/bin/activate; conda activate ${{env.CONDA_ENV}}; conda env list; cd ${{github.workspace}}; opencompass autotest/eval/eval_${{matrix.eval_func}}.py -m eval --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/eval_${{matrix.eval_func}} --reuse --dump-res-length'
        # Poll the job status: 600 attempts with a 10 s sleep between them.
        for i in {1..600}; do
          current_status=$(rjob get $JOB_NAME | grep -oP 'rjob [^:]+: \K[^ ]+')
          if [[ $current_status == "Succeeded" ]]; then
            echo "Task succeeded"
            exit 0
          elif [[ $current_status == "Failed" || $current_status == "Stopped" ]]; then
            echo "Task failed or stopped, fetching logs"
            rjob logs job $JOB_NAME
            exit 1
          fi
          sleep 10
        done
        # Polling exhausted without a terminal state: report failure.
        echo "Timed out waiting for rjob $JOB_NAME to finish"
        exit 1
    # Same as the judger step, plus --env=DATASET_SOURCE=HF for the rjob.
    - name: Run test - with judger HF source
      if: matrix.eval_func == 'chat_obj_fullbench_other'
      run: |
        . ${{env.CONDA_PATH}}/bin/activate
        conda activate ${{env.CONDA_ENV}}
        mkdir ${{env.REPORT_ROOT}}/${{ github.run_id }} -p
        cp -r ${{env.REPORT_ROOT}}/${{ env.BASELINE_DIR }}/predictions/eval_${{matrix.eval_func}} ${{env.REPORT_ROOT}}/${{ github.run_id }}
        echo ${{github.workspace}}
        JOB_NAME=eval-${{ github.run_id }}-${{ matrix.eval_func }}-${{ github.run_attempt }}
        JOB_NAME=${JOB_NAME//_/-}
        rjob submit \
          --metadata-name=$JOB_NAME \
          --charged-group=opencompass_gpu \
          --private-machine=group \
          --group=opencompass_gpu \
          --gpu=1 \
          --cpu=32 \
          --memory=32568 \
          --private-machine=group \
          --image=registry.h.pjlab.org.cn/ailab-puyu-puyu_gpu/lmdeploy:v0.12.0-cu12.8 \
          --env=COMPASS_DATA_CACHE=/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/compass_data_cache \
          --env=TIKTOKEN_CACHE_DIR=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/share_tiktoken \
          --env=HF_ENDPOINT=https://hf-mirror.com \
          --env=HF_DATASETS_CACHE=/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/hf_cache \
          --env=HF_HUB_CACHE=/mnt/shared-storage-gpfs2/gpfs2-shared-public/huggingface/hub \
          --env=CUDA_MODULE_LOADING=EAGER \
          --env=HF_DATASETS_OFFLINE=1 \
          --env=TRANSFORMERS_OFFLINE=1 \
          --env=HF_EVALUATE_OFFLINE=1 \
          --env=HF_HUB_OFFLINE=1 \
          --env=VLLM_USE_MODELSCOPE=false \
          --env=VLLM_WORKER_MULTIPROC_METHOD=spawn \
          --env=DATASET_SOURCE=HF \
          --mount=gpfs://gpfs1/qa-llm-cicd:/mnt/shared-storage-user/qa-llm-cicd \
          --mount=gpfs://gpfs1/opencompass-shared:/mnt/shared-storage-user/opencompass-shared \
          --mount=gpfs://gpfs1/auto-eval-pipeline:/mnt/shared-storage-user/auto-eval-pipeline \
          --mount=gpfs://gpfs2/gpfs2-shared-public:/mnt/shared-storage-gpfs2/gpfs2-shared-public \
          --host-network=True \
          -- bash -exc 'source ${{env.CONDA_PATH}}/bin/activate; conda activate ${{env.CONDA_ENV}}; conda env list; cd ${{github.workspace}}; opencompass autotest/eval/eval_${{matrix.eval_func}}.py -m eval --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/eval_${{matrix.eval_func}} --reuse --dump-res-length'
        # Poll the job status: 600 attempts with a 10 s sleep between them.
        for i in {1..600}; do
          current_status=$(rjob get $JOB_NAME | grep -oP 'rjob [^:]+: \K[^ ]+')
          if [[ $current_status == "Succeeded" ]]; then
            echo "Task succeeded"
            exit 0
          elif [[ $current_status == "Failed" || $current_status == "Stopped" ]]; then
            echo "Task failed or stopped, fetching logs"
            rjob logs job $JOB_NAME
            exit 1
          fi
          sleep 10
        done
        # Polling exhausted without a terminal state: report failure.
        echo "Timed out waiting for rjob $JOB_NAME to finish"
        exit 1
    - name: Assert result
      run: |
        . ${{env.CONDA_PATH}}/bin/activate
        conda activate ${{env.CONDA_ENV}}
        conda info --envs
        python autotest/utils/compare_results.py compare_results ${{env.REPORT_ROOT}}/${{ github.run_id }}/eval_${{matrix.eval_func}} ${{env.REPORT_ROOT}}/${{env.BASELINE_DIR}}/eval_${{matrix.eval_func}} results
    # Runs regardless of earlier step outcomes so the remote job is stopped.
    - name: stop job
      if: always()
      run: |
        JOB_NAME=eval-${{ github.run_id }}-${{ matrix.eval_func }}-${{ github.run_attempt }}
        JOB_NAME=${JOB_NAME//_/-}
        rjob stop job $JOB_NAME
# Job (belongs under the top-level `jobs:` key).
# Runs the shared daily_cmd_test.sh script as a remote rjob, waits for it to
# finish, then runs the pytest score assertions (markers case1..case5)
# against the cmd1..cmd5 summaries it wrote.
# NOTE(review): `--private-machine=group` is passed twice to `rjob submit`;
# kept as in the original - confirm whether the duplicate is needed.
daily_run_cmd:
  if: ${{!cancelled() && contains(needs.prepare_env.result, 'success') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_type), 'cmd'))}}
  needs: prepare_env
  runs-on: yidian_cu12_daily
  timeout-minutes: 240  # 4 hours
  steps:
    - name: Clean workdir
      run: sudo git clean -ffdx
    - name: Clone repository
      uses: actions/checkout@v5
      with:
        repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}
        ref: ${{ github.event.inputs.repo_ref || 'main' }}
    - name: conda env
      run: |
        . ${{env.CONDA_PATH}}/bin/activate
        conda activate ${{env.CONDA_ENV}}
        conda info --envs
        pip list
    - name: Run test
      run: |
        . ${{env.CONDA_PATH}}/bin/activate
        conda activate ${{env.CONDA_ENV}}
        conda info --envs
        JOB_NAME=cmd-${{ github.run_id }}-${{ github.run_attempt }}
        JOB_NAME=${JOB_NAME//_/-}
        rjob submit \
          --metadata-name=$JOB_NAME \
          --charged-group=opencompass_gpu \
          --private-machine=group \
          --group=opencompass_gpu \
          --gpu=2 \
          --cpu=32 \
          --memory=32568 \
          --private-machine=group \
          --image=registry.h.pjlab.org.cn/ailab-puyu-puyu_gpu/lmdeploy:v0.12.0-cu12.8 \
          --env=COMPASS_DATA_CACHE=/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/compass_data_cache \
          --env=TIKTOKEN_CACHE_DIR=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/share_tiktoken \
          --env=HF_ENDPOINT=https://hf-mirror.com \
          --env=HF_DATASETS_CACHE=/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/hf_cache \
          --env=HF_HUB_CACHE=/mnt/shared-storage-gpfs2/gpfs2-shared-public/huggingface/hub \
          --env=CUDA_MODULE_LOADING=EAGER \
          --env=HF_DATASETS_OFFLINE=1 \
          --env=TRANSFORMERS_OFFLINE=1 \
          --env=HF_EVALUATE_OFFLINE=1 \
          --env=HF_HUB_OFFLINE=1 \
          --env=VLLM_USE_MODELSCOPE=false \
          --env=VLLM_WORKER_MULTIPROC_METHOD=spawn \
          --mount=gpfs://gpfs1/qa-llm-cicd:/mnt/shared-storage-user/qa-llm-cicd \
          --mount=gpfs://gpfs1/opencompass-shared:/mnt/shared-storage-user/opencompass-shared \
          --mount=gpfs://gpfs1/auto-eval-pipeline:/mnt/shared-storage-user/auto-eval-pipeline \
          --mount=gpfs://gpfs2/gpfs2-shared-public:/mnt/shared-storage-gpfs2/gpfs2-shared-public \
          --host-network=True \
          -- bash -exc '/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/daily_cmd_test.sh ${{env.REPORT_ROOT}}/${{ github.run_id }}'
        # Poll the job status: 600 attempts with a 10 s sleep between them.
        for i in {1..600}; do
          current_status=$(rjob get $JOB_NAME | grep -oP 'rjob [^:]+: \K[^ ]+')
          if [[ $current_status == "Succeeded" ]]; then
            echo "Task succeeded"
            exit 0
          elif [[ $current_status == "Failed" || $current_status == "Stopped" ]]; then
            echo "Task failed or stopped, fetching logs"
            rjob logs job $JOB_NAME
            exit 1
          fi
          sleep 10
        done
        # Reaching this point means the job never hit a terminal state; fail
        # explicitly instead of letting the step end with exit status 0.
        echo "Timed out waiting for rjob $JOB_NAME to finish"
        exit 1
    - name: Assert result
      run: |
        . ${{env.CONDA_PATH}}/bin/activate
        conda activate ${{env.CONDA_ENV}}
        conda info --envs
        # For each case N: point regression_result_daily at cmdN's summary
        # directory, then run the assertions carrying the caseN marker.
        for case_no in 1 2 3 4 5; do
          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd${case_no}/*/summary regression_result_daily
          python -m pytest -m case${case_no} -s -v --color=yes autotest/utils/oc_score_assert.py
        done
    # Runs regardless of earlier step outcomes so the remote job is stopped.
    - name: stop job
      if: always()
      run: |
        JOB_NAME=cmd-${{ github.run_id }}-${{ github.run_attempt }}
        JOB_NAME=${JOB_NAME//_/-}
        rjob stop job $JOB_NAME
# Job (belongs under the top-level `jobs:` key).
# Runs autotest/cluster/chat_models.py directly on the runner with a patched
# copy of rjob.py, then compares both predictions and results with the
# baseline.
daily_run_cluster:
  needs: prepare_env
  if: ${{!cancelled() && contains(needs.prepare_env.result, 'success') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_type), 'cluster'))}}
  runs-on: yidian_cu12_daily
  timeout-minutes: 240  # 4 hours
  steps:
    - name: Clean workdir
      run: sudo git clean -ffdx

    - name: Clone repository
      uses: actions/checkout@v5
      with:
        ref: ${{ github.event.inputs.repo_ref || 'main' }}
        repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}

    - name: conda env
      run: |
        . ${{env.CONDA_PATH}}/bin/activate
        conda activate ${{env.CONDA_ENV}}
        conda info --envs
        pip list

    # Copy the shared rjob.py into the workspace and substitute this run's id
    # for the TASK_ID='none' placeholder.
    - name: change rjob.py
      run: |
        cp /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/config/rjob.py .
        sed -i "s/TASK_ID='none'/TASK_ID='${{ github.run_id }}'/g" rjob.py

    - name: Run test
      run: |
        . ${{env.CONDA_PATH}}/bin/activate
        conda activate ${{env.CONDA_ENV}}
        conda info --envs
        opencompass autotest/cluster/chat_models.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cluster --reuse --dump-res-length

    # Two comparisons against the baseline: predictions first, then results.
    - name: Assert result
      run: |
        . ${{env.CONDA_PATH}}/bin/activate
        conda activate ${{env.CONDA_ENV}}
        conda info --envs
        python autotest/utils/compare_results.py compare_results ${{env.REPORT_ROOT}}/${{ github.run_id }}/cluster ${{env.REPORT_ROOT}}/${{env.BASELINE_DIR}}/cluster predictions
        python autotest/utils/compare_results.py compare_results ${{env.REPORT_ROOT}}/${{ github.run_id }}/cluster ${{env.REPORT_ROOT}}/${{env.BASELINE_DIR}}/cluster results