This repository was archived by the owner on Oct 25, 2024. It is now read-only.

Commit 79277b4

changwangss, pre-commit-ci[bot], and chensuyue authored
update lm-eval to 0.4.3 (#1658)
Signed-off-by: changwangss <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: chen, suyue <[email protected]>
1 parent a864bb2 commit 79277b4

File tree

17 files changed, +63 -50 lines changed


.github/workflows/script/formatScan/pylint.sh

+1-1
@@ -28,7 +28,7 @@ else
     echo "Not found requirements.txt file."
 fi
 # install packages
-pip install lm-eval==0.4.2
+pip install lm-eval==0.4.3
 pip install accelerate nlpaug nltk schema optimum-intel optimum peft
 pip install --upgrade --force-reinstall transformers==4.36.2
 pip install optimum-habana
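
The same one-line version bump (lm-eval 0.4.2 to 0.4.3) repeats across every requirements pin in this commit. A minimal post-install sanity check, not part of the commit, that an environment actually picked up the new pin (the distribution-name lookup is an assumption about how lm-eval registers itself):

# Hypothetical check, not from this commit: confirm the pinned lm-eval release.
from importlib.metadata import version

installed = version("lm_eval")  # assumes the distribution is registered as "lm_eval"
assert installed == "0.4.3", f"expected lm-eval 0.4.3, got {installed}"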

examples/huggingface/neural_speed/requirements.txt

+1-1
@@ -1,6 +1,6 @@
 intel_extension_for_transformers
 neural-speed
-lm-eval==0.4.2
+lm-eval==0.4.3
 sentencepiece
 gguf
 --extra-index-url https://download.pytorch.org/whl/cpu
@@ -1,4 +1,4 @@
 transformers
 accelerate
 sentencepiece != 0.1.92
-lm-eval==0.4.2
+lm-eval==0.4.3

examples/huggingface/pytorch/language-modeling/pruning/requirements.txt

+1-1
@@ -7,5 +7,5 @@ transformers
 torch==2.0.1
 tqdm
 neural_compressor
-lm-eval==0.4.2
+lm-eval==0.4.3

examples/huggingface/pytorch/language-modeling/quantization/requirements.txt

+1-1
@@ -9,5 +9,5 @@ wandb
 einops
 neural-compressor
 pytest==8.0.0
-lm-eval==0.4.2
+lm-eval==0.4.3
 git+https://github.com/huggingface/peft.git@6c44096c7b8d55a2ecf24be9bc68393467e1584a

examples/huggingface/pytorch/text-generation/quantization/requirements_GPU.txt

+1-1
@@ -14,4 +14,4 @@ tiktoken #qwen
 einops #qwen
 auto-round
 git+https://github.com/intel/neural-compressor.git
-lm-eval==0.4.2
+lm-eval==0.4.3

examples/huggingface/pytorch/text-generation/quantization/requirements_cpu_woq.txt

+1-1
@@ -13,5 +13,5 @@ einops #qwen
 git+https://github.com/intel/neural-speed.git
 auto-round==0.2
 git+https://github.com/intel/neural-compressor.git
-lm-eval==0.4.2
+lm-eval==0.4.3
 huggingface_hub

examples/huggingface/pytorch/text-generation/quantization/requirements_sq.txt

+1-1
@@ -13,5 +13,5 @@ transformers_stream_generator
 tiktoken #qwen
 einops #qwen
 git+https://github.com/intel/neural-compressor.git
-lm-eval==0.4.2
+lm-eval==0.4.3
 huggingface_hub

examples/huggingface/pytorch/text2text-generation/requirements.txt

+1-1
@@ -11,4 +11,4 @@ neural-compressor
 optimum-intel > 1.12.0
 onnxruntime
 intel-extension-for-pytorch
-lm-eval==0.4.2
+lm-eval==0.4.3

examples/modelscope/requirements.txt

+1-1
@@ -1,6 +1,6 @@
 intel_extension_for_transformers
 neural-speed
-lm-eval==0.4.2
+lm-eval==0.4.3
 sentencepiece
 gguf
 --extra-index-url https://download.pytorch.org/whl/cpu

intel_extension_for_transformers/neural_chat/requirements_cpu.txt

+1-1
@@ -7,7 +7,7 @@ fastapi
 fschat==0.2.32
 huggingface_hub
 intel_extension_for_pytorch==2.3.0
-lm-eval==0.4.2
+lm-eval==0.4.3
 neural-compressor
 neural_speed==1.0a0
 numpy==1.23.5

intel_extension_for_transformers/neural_chat/requirements_hpu.txt

+1-1
@@ -4,7 +4,7 @@ evaluate
 fastapi
 fschat==0.2.35
 huggingface_hub
-lm-eval==0.4.2
+lm-eval==0.4.3
 neural-compressor
 numpy==1.23.5
 optimum

intel_extension_for_transformers/neural_chat/requirements_win.txt

+1-1
@@ -6,7 +6,7 @@ fastapi
 fschat==0.2.35
 huggingface_hub
 intel-extension-for-transformers
-lm-eval==0.4.2
+lm-eval==0.4.3
 neural-compressor
 numpy==1.23.5
 optimum

intel_extension_for_transformers/neural_chat/tests/requirements.txt

+1-1
@@ -38,7 +38,7 @@ langchain-community==0.0.27
 langchain_core==0.1.35
 langid
 librosa
-lm-eval==0.4.2
+lm-eval==0.4.3
 markdown
 neural-compressor
 neural_speed==1.0a0

intel_extension_for_transformers/transformers/llm/evaluation/lm_eval/accuracy.py

+1-1
@@ -43,7 +43,7 @@
 from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval.evaluator import(
     request_caching_arg_to_dict
 )
-from lm_eval.logging_utils import WandbLogger
+from lm_eval.loggers import WandbLogger
 from lm_eval.tasks import TaskManager
 from lm_eval.utils import make_table, simple_parse_args_string
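
This hunk and the evaluator.py hunks below track the same upstream rename: lm-eval 0.4.3 moved the 0.4.2 module lm_eval.logging_utils to lm_eval.loggers (with add_env_info and get_git_commit_hash under lm_eval.loggers.utils). Code that must import WandbLogger under either release could hedge the import; a sketch, not part of the commit, based only on the rename visible in this diff:

# Version-tolerant import of WandbLogger across the 0.4.2 -> 0.4.3 rename.
try:
    from lm_eval.loggers import WandbLogger  # lm-eval >= 0.4.3
except ImportError:
    from lm_eval.logging_utils import WandbLogger  # lm-eval == 0.4.2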

intel_extension_for_transformers/transformers/llm/evaluation/lm_eval/evaluator.py

+47-34
@@ -38,7 +38,7 @@
     print_writeout,
     run_task_tests,
 )
-from lm_eval.logging_utils import add_env_info, get_git_commit_hash
+from lm_eval.loggers.utils import add_env_info, get_git_commit_hash
 from lm_eval.tasks import TaskManager, get_task_dict
 from lm_eval.utils import eval_logger, positional_deprecated, simple_parse_args_string
 from lm_eval import utils
@@ -509,9 +509,14 @@ def evaluate(
     # aggregate results ; run bootstrap CIs
     for task_output in eval_tasks:
         task_output.calculate_aggregate_metric(bootstrap_iters=bootstrap_iters)
-    results, samples, configs, versions, num_fewshot = consolidate_results(
-        eval_tasks
-    )
+    (
+        results,
+        samples,
+        configs,
+        versions,
+        num_fewshot,
+        higher_is_better,
+    ) = consolidate_results(eval_tasks)

     ### Calculate group metrics ###
     if bool(results):
@@ -522,6 +527,24 @@ def evaluate(
                 # or `task_name: []`.
                 # we only want to operate on groups here.
                 continue
+
+            # collect all higher_is_better values for metrics
+            # in the group's subtasks.
+            # TODO: clean this up ; unify with the below metric_list loop?
+            _higher_is_better = {}
+            for task in task_list:
+                for m, h in higher_is_better[task].items():
+                    if m not in _higher_is_better.keys():
+                        _higher_is_better[m] = h
+                    if m in _higher_is_better and _higher_is_better[m] is not None and _higher_is_better[m] != h:
+                        eval_logger.warning(
+                            f"Higher_is_better values for metric {m} in group {group} are not consistent." +
+                            f"Defaulting to None."
+                        )
+                        _higher_is_better[m] = None
+            higher_is_better[group] = _higher_is_better
+
+            # collect all metric keys used by a subtask in the group.
             metric_list = list(
                 {
                     key
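
The block added above folds each subtask's higher_is_better flags into a single per-group mapping, warning and falling back to None whenever subtasks disagree on a metric's direction. An isolated sketch of that merge rule, with invented task and metric names:

# Toy illustration of the consistency rule; names and values are not from the commit.
higher_is_better = {
    "subtask_a": {"acc": True, "perplexity": False},
    "subtask_b": {"acc": True, "perplexity": True},  # disagrees on perplexity
}

merged = {}
for task in ("subtask_a", "subtask_b"):
    for metric, flag in higher_is_better[task].items():
        if metric not in merged:
            merged[metric] = flag
        elif merged[metric] is not None and merged[metric] != flag:
            merged[metric] = None  # inconsistent across subtasks, default to None

print(merged)  # {'acc': True, 'perplexity': None}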
@@ -534,38 +557,22 @@ def evaluate(
                 stderr = "_stderr,".join(metric.split(","))

                 # gather metrics, sizes, and stderrs from subtasks
-                metrics = [
-                    results[task][metric]
-                    for task in task_list
-                    if metric in results[task]
-                ]  # TODO: copy?
-                stderrs = [
-                    results[task][stderr]
-                    for task in task_list
-                    if stderr in results[task]
-                ]
-                sizes = [
-                    results[task]["samples"]
-                    for task in task_list
-                    if metric in results[task]
-                ]
+                metrics = [results[task][metric] for task in task_list if metric in results[task]]  # TODO: copy?
+                stderrs = [results[task][stderr] for task in task_list if stderr in results[task]]
+                sizes = [results[task]["samples"] for task in task_list if metric in results[task]]

                 # compute group's pooled metric and stderr
-                results[group][metric] = (
-                    lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes)
-                )
+                results[group][metric] = lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes)
                 # TODO: calculate grouped metric using aggregation fn
                 if "N/A" in stderrs:
                     results[group][stderr] = "N/A"
                 else:
-                    results[group][stderr] = (
-                        lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes)
-                    )
+                    results[group][stderr] = lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes)
                 # TODO: allow GroupConfigs to choose which variance formula is used, for back-compatibility
                 # To use the old (likely incorrect) variance formula,
                 # comment out the above and uncomment this line:
-                # results[group][stderr] = \
-                #     lm_eval.api.metrics.combined_sample_stderr(stderrs, sizes, metrics=metrics)
+                # results[group][stderr] = lm_eval.api.metrics.combined_sample_stderr(stderrs,
+                #                                                                     sizes, metrics=metrics)

                 results[group]["samples"] = sum(sizes)

@@ -578,19 +585,15 @@ def evaluate(
         if len(left_tasks_list) == 0:
             break

-        _task_hierarchy = {
-            k: v for k, v in task_hierarchy.items() if k in left_tasks_list
-        }
+        _task_hierarchy = {k: v for k, v in task_hierarchy.items() if k in left_tasks_list}
         _results_agg, _groups_agg = prepare_print_tasks(_task_hierarchy, results)

         results_agg = {**results_agg, **_results_agg}
         groups_agg = {**groups_agg, **_groups_agg}

         for group_name, task_list in task_hierarchy.items():
             if task_list:
-                num_fewshot[group_name] = num_fewshot[
-                    task_list[0]
-                ]  # TODO: validate this
+                num_fewshot[group_name] = num_fewshot[task_list[0]]  # TODO: validate this

     results_dict = {
         "results": dict(results_agg.items()),
@@ -599,6 +602,17 @@ def evaluate(
         "configs": dict(sorted(configs.items())),
         "versions": dict(sorted(versions.items())),
         "n-shot": dict(sorted(num_fewshot.items())),
+        "higher_is_better": dict(sorted(higher_is_better.items())),
+        "n-samples": {
+            task_output.task_name: {
+                "original": len(task_output.task.eval_docs),
+                "effective": min(
+                    limit if limit else len(task_output.task.eval_docs),
+                    len(task_output.task.eval_docs),
+                ),
+            }
+            for task_output in eval_tasks
+        },
     }
     if log_samples:
         results_dict["samples"] = dict(samples)
@@ -608,7 +622,6 @@
     else:
         return None

-
 def request_caching_arg_to_dict(cache_requests: str) -> dict:
     request_caching_args = {
         "cache_requests": cache_requests in {"true", "refresh"},

tests/requirements.txt

+1-1
@@ -12,7 +12,7 @@ git+https://github.com/intel/neural-compressor.git
 git+https://github.com/intel/neural-speed.git
 intel-extension-for-pytorch==2.3.0
 intel-tensorflow==2.14.0
-lm-eval==0.4.2
+lm-eval==0.4.3
 mlflow
 nlpaug==1.1.9
 onnx

0 comments