Skip to content

Commit f4ba494

Browse files
[ci] add api testcase into pr_test (#2460)
* add more case into pr test * update * update * update * update * updsate * update * update * update * update * updaet * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update
1 parent 4df8be6 commit f4ba494

17 files changed

Lines changed: 1035 additions & 365 deletions

.github/workflows/daily-ete-test.yml

Lines changed: 1 addition & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ on:
1717
required: true
1818
description: 'regression types'
1919
type: string
20-
default: "['infer', 'model', 'eval', 'cmd', 'cluster', 'all']"
20+
default: "['model', 'eval', 'cmd', 'cluster', 'all']"
2121
baseline_result:
2222
required: true
2323
description: 'baseline result'
@@ -214,80 +214,6 @@ jobs:
214214
JOB_NAME=${JOB_NAME//_/-}
215215
rjob stop job $JOB_NAME
216216
217-
218-
daily_infer_test:
219-
if: ${{!cancelled() && contains(needs.prepare_env.result, 'success') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_type), 'infer'))}}
220-
needs: prepare_env
221-
strategy:
222-
fail-fast: false
223-
matrix:
224-
include:
225-
- infer_func: chat_obj_fullbench_v1
226-
memory: 10240
227-
cpu: 16
228-
- infer_func: chat_obj_fullbench_v2
229-
memory: 32568
230-
cpu: 16
231-
- infer_func: chat_sub_fullbench
232-
memory: 3072
233-
cpu: 2
234-
- infer_func: chat_longtext_fullbench
235-
memory: 65136
236-
cpu: 16
237-
runs-on: yidian_cu12_daily
238-
timeout-minutes: 240 #4hours
239-
env:
240-
COMPASS_DATA_CACHE: /mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/compass_data_cache
241-
steps:
242-
- name: Clean workdir
243-
run: sudo git clean -ffdx
244-
- name: Clone repository
245-
uses: actions/checkout@v5
246-
with:
247-
repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}
248-
ref: ${{github.event.inputs.repo_ref || 'main'}}
249-
- name: conda env
250-
run: |
251-
. ${{env.CONDA_PATH}}/bin/activate
252-
conda activate ${{env.CONDA_ENV}}
253-
conda info --envs
254-
pip list
255-
- name: Run test
256-
run: |
257-
. ${{env.CONDA_PATH}}/bin/activate
258-
conda activate ${{env.CONDA_ENV}}
259-
echo ${{github.workspace}}
260-
261-
JOB_NAME=infer-${{ github.run_id }}-${{ matrix.infer_func }}-${{ github.run_attempt }}
262-
JOB_NAME=${JOB_NAME//_/-}
263-
264-
rjob submit --metadata-name=$JOB_NAME --charged-group=opencompass_gpu --private-machine=group --group=opencompass_gpu --gpu=0 --cpu=${{ matrix.cpu }} --memory=${{ matrix.memory }} --private-machine=group --image=registry.h.pjlab.org.cn/ailab-puyu-puyu_gpu/lmdeploy:v0.12.0-cu12.8 --env=COMPASS_DATA_CACHE=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/compass_data_cache --env=TIKTOKEN_CACHE_DIR=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/share_tiktoken --env=HF_ENDPOINT=https://hf-mirror.com --env=HF_DATASETS_CACHE=/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/hf_cache --env=HF_HUB_CACHE=/mnt/shared-storage-gpfs2/gpfs2-shared-public/huggingface/hub --env=CUDA_MODULE_LOADING=EAGER --env=HF_DATASETS_OFFLINE=1 --env=TRANSFORMERS_OFFLINE=1 --env=HF_EVALUATE_OFFLINE=1 --env=HF_HUB_OFFLINE=1 --env=VLLM_USE_MODELSCOPE=false --env=VLLM_WORKER_MULTIPROC_METHOD=spawn --mount=gpfs://gpfs1/qa-llm-cicd:/mnt/shared-storage-user/qa-llm-cicd --mount=gpfs://gpfs1/opencompass-shared:/mnt/shared-storage-user/opencompass-shared --mount=gpfs://gpfs1/auto-eval-pipeline:/mnt/shared-storage-user/auto-eval-pipeline --mount=gpfs://gpfs2/gpfs2-shared-public:/mnt/shared-storage-gpfs2/gpfs2-shared-public --host-network=True -- bash -exc 'source ${{env.CONDA_PATH}}/bin/activate; conda activate ${{env.CONDA_ENV}}; conda env list; cd ${{github.workspace}}; python /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/mock_start.py & python autotest/utils/health_check.py; opencompass autotest/infer/infer_${{matrix.infer_func}}.py -m infer --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/infer_${{matrix.infer_func}} --reuse --dump-res-length'
265-
266-
for i in {1..600}; do
267-
current_status=$(rjob get $JOB_NAME | grep -oP 'rjob [^:]+: \K[^ ]+')
268-
if [[ $current_status == "Succeeded" ]]; then
269-
echo "Task succeeded"
270-
exit 0
271-
elif [[ $current_status == "Failed" || $current_status == "Stopped" ]]; then
272-
echo "Task failed or stopped, fetching logs"
273-
rjob logs job $JOB_NAME
274-
exit 1
275-
fi
276-
sleep 10
277-
done
278-
- name: Assert result
279-
run: |
280-
. ${{env.CONDA_PATH}}/bin/activate
281-
conda activate ${{env.CONDA_ENV}}
282-
conda info --envs
283-
python autotest/utils/compare_results.py compare_results ${{env.REPORT_ROOT}}/${{ github.run_id }}/infer_${{matrix.infer_func}} ${{env.REPORT_ROOT}}/${{env.BASELINE_DIR}}/infer_${{matrix.infer_func}} predictions
284-
- name: stop job
285-
if: always()
286-
run: |
287-
JOB_NAME=infer-${{ github.run_id }}-${{ matrix.infer_func }}-${{ github.run_attempt }}
288-
JOB_NAME=${JOB_NAME//_/-}
289-
rjob stop job $JOB_NAME
290-
291217
daily_eval_test:
292218
if: ${{!cancelled() && contains(needs.prepare_env.result, 'success') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_type), 'eval'))}}
293219
needs: prepare_env

.github/workflows/pr-run-test.yml

Lines changed: 104 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ concurrency:
1717
cancel-in-progress: true
1818

1919
env:
20-
CONDA_ENV: pr_test
20+
CONDA_ENV: pr_regression
2121
HF_DATASETS_OFFLINE: 1
2222
HF_EVALUATE_OFFLINE: 1
2323
TRANSFORMERS_OFFLINE: 1
@@ -26,20 +26,22 @@ env:
2626
HF_HUB_OFFLINE: 1
2727
CONDA_PATH: /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/miniconda3
2828
REPORT_ROOT: /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/eval_report/prtest
29+
BASELINE_DIR: mock-api-baseline
2930
COMPASS_DATA_CACHE: /mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/compass_data_cache
30-
HF_DATASETS_CACHE: /mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/hf_cache
31+
HF_DATASETS_CACHE: /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/hf_cache
3132
HF_HUB_CACHE: /mnt/shared-storage-gpfs2/gpfs2-shared-public/huggingface/hub
3233
KUBEBRAIN_CLUSTER_ENTRY: https://h.pjlab.org.cn
3334
KUBEBRAIN_NAMESPACE: ailab-opencompass
34-
JOB_NAME: pr-test-${{ github.run_id }}-${{ github.run_attempt }}
3535

3636
jobs:
37-
pr_run_test:
37+
cmd_test:
3838
runs-on: yidian_cu12
3939
timeout-minutes: 45
40+
env:
41+
JOB_NAME: pr-test-${{ github.run_id }}-cmd-${{ github.run_attempt }}
4042
steps:
4143
- name: Checkout repository
42-
uses: actions/checkout@v2
44+
uses: actions/checkout@v6
4345
- name: Prepare - Install opencompass
4446
run: |
4547
. ${{env.CONDA_PATH}}/bin/activate
@@ -54,13 +56,18 @@ jobs:
5456
. ${{env.CONDA_PATH}}/bin/activate
5557
conda activate ${{env.CONDA_ENV}}
5658
pip list
57-
rjob submit --metadata-name=${{ env.JOB_NAME }} --charged-group=opencompass_gpu --private-machine=group --group=opencompass_gpu --gpu=2 --cpu=32 --memory=32568 --private-machine=group --image=registry.h.pjlab.org.cn/ailab-puyu/xpuyu:torch-2.6.0-45d96d5f-0607 --env=COMPASS_DATA_CACHE=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/compass_data_cache --env=TIKTOKEN_CACHE_DIR=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/share_tiktoken --env=HF_ENDPOINT=https://hf-mirror.com --env=HF_DATASETS_CACHE=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/hf_cache --env=HF_HUB_CACHE=/mnt/shared-storage-user/large-model-center-share-weights/hf_hub --env=CUDA_MODULE_LOADING=EAGER --env=HF_DATASETS_OFFLINE=1 --env=TRANSFORMERS_OFFLINE=1 --env=HF_EVALUATE_OFFLINE=1 --env=HF_HUB_OFFLINE=1 --mount=gpfs://gpfs1/qa-llm-cicd:/mnt/shared-storage-user/qa-llm-cicd --mount=gpfs://gpfs1/opencompass-shared:/mnt/shared-storage-user/opencompass-shared --mount=gpfs://gpfs1/auto-eval-pipeline:/mnt/shared-storage-user/auto-eval-pipeline --mount=gpfs://gpfs1/large-model-center-share-weights:/mnt/shared-storage-user/large-model-center-share-weights --host-network=True -- bash -exc '/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/pr_test.sh ${{env.REPORT_ROOT}}/${{ github.run_id }}'
59+
rjob submit --metadata-name=${{ env.JOB_NAME }} --charged-group=opencompass_gpu --private-machine=group --group=opencompass_gpu --gpu=2 --cpu=32 --memory=32568 --private-machine=group --image=registry.h.pjlab.org.cn/ailab-puyu/xpuyu:torch-2.6.0-45d96d5f-0607 --env=COMPASS_DATA_CACHE=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/compass_data_cache --env=TIKTOKEN_CACHE_DIR=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/share_tiktoken --env=HF_ENDPOINT=https://hf-mirror.com --env=HF_DATASETS_CACHE=${{env.HF_DATASETS_CACHE}} --env=HF_HUB_CACHE=/mnt/shared-storage-user/large-model-center-share-weights/hf_hub --env=CUDA_MODULE_LOADING=EAGER --env=HF_DATASETS_OFFLINE=1 --env=TRANSFORMERS_OFFLINE=1 --env=HF_EVALUATE_OFFLINE=1 --env=HF_HUB_OFFLINE=1 --mount=gpfs://gpfs1/qa-llm-cicd:/mnt/shared-storage-user/qa-llm-cicd --mount=gpfs://gpfs1/opencompass-shared:/mnt/shared-storage-user/opencompass-shared --mount=gpfs://gpfs1/auto-eval-pipeline:/mnt/shared-storage-user/auto-eval-pipeline --mount=gpfs://gpfs1/large-model-center-share-weights:/mnt/shared-storage-user/large-model-center-share-weights --host-network=True -- bash -exc '/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/pr_test_new.sh ${{env.REPORT_ROOT}}/${{ github.run_id }}'
5860
5961
for i in {1..300}; do
6062
current_status=$(rjob get ${{ env.JOB_NAME }} | grep -oP 'rjob [^:]+: \K[^ ]+')
61-
if [[ $current_status == "Succeeded" || $current_status == "Failed" || $current_status == "Stopped" ]]; then
62-
echo "Current status: $current_status, stop checking"
63-
break
63+
if [[ $current_status == "Succeeded" ]]; then
64+
echo "Task succeeded"
65+
rjob logs job ${{ env.JOB_NAME }} -n 100
66+
exit 0
67+
elif [[ $current_status == "Failed" || $current_status == "Stopped" ]]; then
68+
echo "Task failed or stopped, fetching logs"
69+
rjob logs job ${{ env.JOB_NAME }} -n 100
70+
exit 1
6471
fi
6572
sleep 6
6673
done
@@ -96,12 +103,93 @@ jobs:
96103
conda info --envs
97104
rjob stop job ${{ env.JOB_NAME }}
98105
99-
notify_to_feishu:
100-
if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
101-
needs: [pr_run_test]
102-
timeout-minutes: 5
103-
runs-on: self-hosted
106+
mock_api_test:
107+
runs-on: yidian_cu12
108+
timeout-minutes: 120
109+
strategy:
110+
fail-fast: false
111+
matrix:
112+
include:
113+
- func_type: chat_obj_fullbench_v1
114+
name: chat-v1
115+
memory: 32568
116+
cpu: 16
117+
- func_type: chat_obj_fullbench_v2
118+
name: chat-v2
119+
memory: 32568
120+
cpu: 16
121+
- func_type: chat_sub_fullbench
122+
name: chat-sub
123+
# memory: 3072
124+
memory: 32568
125+
cpu: 2
126+
- func_type: chat_longtext_fullbench
127+
name: chat-longtext
128+
memory: 65136
129+
cpu: 16
130+
env:
131+
JOB_NAME: pr-test-${{ github.run_id }}-api-${{ matrix.name }}-${{ github.run_attempt }}
104132
steps:
105-
- name: notify
133+
- name: Checkout repository
134+
uses: actions/checkout@v6
135+
- name: Prepare - Install opencompass
136+
run: |
137+
. ${{env.CONDA_PATH}}/bin/activate
138+
conda activate ${{env.CONDA_ENV}}
139+
python3 -m pip uninstall opencompass -y
140+
python3 -m pip install .[full]
141+
conda info --envs
142+
pip list
143+
lmdeploy check_env
144+
- name: Run test
106145
run: |
107-
curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"Opencompass- pr test failed","content":[[{"tag":"text","text":"branch: ${{github.ref_name}}, run action: ${{github.workflow}} failed. "},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'${{ secrets.USER_ID }}'"}]]}}}}' ${{ secrets.WEBHOOK_URL }}
146+
. ${{env.CONDA_PATH}}/bin/activate
147+
conda activate ${{env.CONDA_ENV}}
148+
pip list
149+
150+
rjob submit --metadata-name=${{ env.JOB_NAME }} --charged-group=opencompass_gpu --private-machine=group --group=opencompass_gpu --gpu=0 --cpu=${{ matrix.cpu }} --memory=${{ matrix.memory }} --private-machine=group --image=registry.h.pjlab.org.cn/ailab-puyu-puyu_gpu/lmdeploy:v0.12.0-cu12.8 --env=COMPASS_DATA_CACHE=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/compass_data_cache --env=TIKTOKEN_CACHE_DIR=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/share_tiktoken --env=HF_ENDPOINT=https://hf-mirror.com --env=HF_DATASETS_CACHE=${{env.HF_DATASETS_CACHE}} --env=HF_HUB_CACHE=/mnt/shared-storage-gpfs2/gpfs2-shared-public/huggingface/hub --env=CUDA_MODULE_LOADING=EAGER --env=HF_DATASETS_OFFLINE=1 --env=TRANSFORMERS_OFFLINE=1 --env=HF_EVALUATE_OFFLINE=1 --env=HF_HUB_OFFLINE=1 --env=REPORT_DIR=${{env.REPORT_ROOT}} --env=CHAT_TYPE=${{matrix.func_type}} --mount=gpfs://gpfs1/qa-llm-cicd:/mnt/shared-storage-user/qa-llm-cicd --mount=gpfs://gpfs1/opencompass-shared:/mnt/shared-storage-user/opencompass-shared --mount=gpfs://gpfs1/auto-eval-pipeline:/mnt/shared-storage-user/auto-eval-pipeline --mount=gpfs://gpfs2/gpfs2-shared-public:/mnt/shared-storage-gpfs2/gpfs2-shared-public --host-network=True -- bash -exc 'source ${{env.CONDA_PATH}}/bin/activate; conda activate ${{env.CONDA_ENV}}; conda env list; cd ${{github.workspace}}; ln -s /mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/compass_data_cache/data .; python /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/mock_chat_api.py --type winrate --port 26333 > mock_${{matrix.name}}.log 2>&1 & sleep 3; opencompass autotest/all/${{matrix.func_type}}.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.func_type}} --reuse;'
151+
152+
for i in {1..300}; do
153+
current_status=$(rjob get ${{ env.JOB_NAME }} | grep -oP 'rjob [^:]+: \K[^ ]+')
154+
if [[ $current_status == "Succeeded" ]]; then
155+
echo "Task succeeded"
156+
rjob logs job ${{ env.JOB_NAME }} -n 100
157+
exit 0
158+
elif [[ $current_status == "Failed" || $current_status == "Stopped" ]]; then
159+
echo "Task failed or stopped, fetching logs"
160+
rjob logs job ${{ env.JOB_NAME }} -n 100
161+
exit 1
162+
fi
163+
sleep 6
164+
done
165+
- name: Compare predictions with baseline
166+
run: |
167+
. ${{env.CONDA_PATH}}/bin/activate
168+
conda activate ${{env.CONDA_ENV}}
169+
CURRENT="${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.func_type}}"
170+
BASELINE="${{env.REPORT_ROOT}}/${{env.BASELINE_DIR}}/${{matrix.func_type}}"
171+
echo "Current run: $CURRENT"
172+
echo "Baseline: $BASELINE"
173+
if [[ ! -d "$CURRENT" ]]; then
174+
echo "Current run output not found: $CURRENT"
175+
exit 1
176+
fi
177+
if [[ ! -d "$BASELINE" ]]; then
178+
echo "Baseline not found: $BASELINE"
179+
echo "Upload golden predictions to REPORT_ROOT/BASELINE_DIR/<func_type> on shared storage."
180+
exit 1
181+
fi
182+
python autotest/utils/compare_results.py compare_results \
183+
"$CURRENT" "$BASELINE" predictions
184+
python autotest/utils/compare_results.py compare_results \
185+
"$CURRENT" "$BASELINE" results
186+
python autotest/utils/compare_results.py compare_results \
187+
"$CURRENT" "$BASELINE" summary
188+
- name: Uninstall opencompass
189+
if: always()
190+
run: |
191+
. ${{env.CONDA_PATH}}/bin/activate
192+
conda activate ${{env.CONDA_ENV}}
193+
python3 -m pip uninstall opencompass -y
194+
conda info --envs
195+
rjob stop job ${{ env.JOB_NAME }}

.github/workflows/unit-test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ jobs:
3030
timeout-minutes: 45
3131
steps:
3232
- name: Checkout repository
33-
uses: actions/checkout@v2
33+
uses: actions/checkout@v6
3434
- name: Prepare - Install opencompass
3535
run: |
3636
. ${{env.CONDA_PATH}}/bin/activate
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
from mmengine.config import read_base
2+
3+
with read_base():
4+
from autotest.all.config import \
5+
concurrent_infer as infer # noqa: F401, E501
6+
from autotest.all.config import models # noqa: F401, E501
7+
from opencompass.configs.datasets.babilong.babilong_256k_gen import \
8+
babiLong_256k_datasets # noqa: F401, E501
9+
from opencompass.configs.datasets.longbenchv2.longbenchv2_gen import \
10+
LongBenchv2_datasets as LongBenchv2_datasets # noqa: F401, E501
11+
from opencompass.configs.datasets.needlebench.needlebench_32k.needlebench_32k import \
12+
needlebench_datasets as needlebench_32k_datasets # noqa: F401, E501
13+
from opencompass.configs.datasets.ruler.ruler_8k_gen import \
14+
ruler_datasets as ruler_8k_datasets # noqa: F401, E501
15+
16+
datasets = sum(
17+
([v[0]] if v else []
18+
for k, v in locals().items() if k.endswith('_datasets')),
19+
[],
20+
)
21+
22+
for d in datasets:
23+
d['reader_cfg']['test_range'] = '[0:1]'

autotest/infer/infer_chat_obj_fullbench_v1.py renamed to autotest/all/chat_obj_fullbench_v1.py

Lines changed: 36 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
from mmengine.config import read_base
22

33
with read_base():
4-
from autotest.infer.config import \
4+
from autotest.all.config import concurrent_eval as eval # noqa: F401, E501
5+
from autotest.all.config import \
56
concurrent_infer as infer # noqa: F401, E501
6-
from autotest.infer.config import models # noqa: F401, E501
7+
from autotest.all.config import models # noqa: F401, E501
78
from opencompass.configs.datasets.aime2024.aime2024_cascade_eval_gen_5e9f4f import \
89
aime2024_datasets # noqa: F401, E501
910
from opencompass.configs.datasets.aime2024.aime2024_gen_6e39a4 import \
@@ -240,8 +241,8 @@
240241

241242
for datasets_, num in repeated_info:
242243
for dataset_ in datasets_:
243-
dataset_['n'] = num
244-
dataset_['k'] = num
244+
dataset_['n'] = 1
245+
dataset_['k'] = 1
245246

246247
# CompassAcademic Extended Process
247248

@@ -316,12 +317,39 @@
316317
temp_dataset['abbr'] = temp_dataset['abbr'] + '_0shot'
317318

318319
datasets = sum(
319-
(v for k, v in locals().items()
320+
([v[0]] if v else [] for k, v in locals().items()
320321
if k.endswith('_datasets') and 'scicode' not in k.lower()
321322
and 'teval' not in k.lower() and 'dingo' not in k.lower()),
322323
[],
323324
)
325+
teval_en_datasets[0]['eval_cfg']['num_gpus'] = 0
326+
teval_zh_datasets[0]['eval_cfg']['num_gpus'] = 0
327+
datasets += [teval_en_datasets[0], teval_zh_datasets[0], SciCode_datasets[0]]
324328

325-
datasets += teval_en_datasets
326-
datasets += teval_zh_datasets
327-
datasets += SciCode_datasets
329+
obj_llm_judge_cfg = models[0]
330+
331+
for item in datasets:
332+
try:
333+
if 'atlas' in item['abbr'] and 'judge_cfg' in item['eval_cfg'][
334+
'evaluator']:
335+
item['eval_cfg']['evaluator']['judge_cfg'] = dict(
336+
judgers=[obj_llm_judge_cfg])
337+
elif 'judge_cfg' in item['eval_cfg']['evaluator']:
338+
item['eval_cfg']['evaluator']['judge_cfg'] = obj_llm_judge_cfg
339+
elif 'judge_cfg' in item['eval_cfg']['evaluator']['llm_evaluator']:
340+
item['eval_cfg']['evaluator']['llm_evaluator'][
341+
'judge_cfg'] = obj_llm_judge_cfg
342+
except Exception:
343+
pass
344+
345+
for d in datasets:
346+
d['reader_cfg']['test_range'] = '[0:2]'
347+
if 'dataset_cfg' in d['eval_cfg']['evaluator'] and 'reader_cfg' in d[
348+
'eval_cfg']['evaluator']['dataset_cfg']:
349+
d['eval_cfg']['evaluator']['dataset_cfg']['reader_cfg'][
350+
'test_range'] = '[0:2]'
351+
if 'llm_evaluator' in d['eval_cfg'][
352+
'evaluator'] and 'dataset_cfg' in d[ # noqa: E501
353+
'eval_cfg']['evaluator']['llm_evaluator']:
354+
d['eval_cfg']['evaluator']['llm_evaluator']['dataset_cfg'][
355+
'reader_cfg']['test_range'] = '[0:2]'

0 commit comments

Comments
 (0)