From df8f9eb8771681da3b869088262c00dcea0a772f Mon Sep 17 00:00:00 2001 From: Jack Morris Date: Thu, 25 Sep 2025 22:04:38 +0000 Subject: [PATCH 1/3] fix caching --- src/lighteval/tasks/registry.py | 2 +- src/lighteval/utils/cache_management.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py index 95914991c..cae642ff1 100644 --- a/src/lighteval/tasks/registry.py +++ b/src/lighteval/tasks/registry.py @@ -325,7 +325,7 @@ def _update_task_configs(self) -> dict[str, LightevalTaskConfig]: # noqa: C901 f"was not correctly parametrized. Forgot to set '{attribute}'." ) - task_to_configs[expanded_task].append(config) + task_to_configs[expanded_task + f"|{few_shot}"].append(config) return task_to_configs diff --git a/src/lighteval/utils/cache_management.py b/src/lighteval/utils/cache_management.py index 2059d2843..2b51775a1 100644 --- a/src/lighteval/utils/cache_management.py +++ b/src/lighteval/utils/cache_management.py @@ -163,8 +163,8 @@ def _get_task_hash(self, full_task_name: str) -> str: "The task registry was not provided to the cache config. We can't test if the current task has the same hash as the saved tasks." ) return "NO_HASH" - task_suite, task_name, _ = full_task_name.split("|") - task_configs: list[LightevalTaskConfig] = sorted(self.registry.task_to_configs[f"{task_suite}|{task_name}"]) + task_suite, task_name, few_shot = full_task_name.split("|") + task_configs: list[LightevalTaskConfig] = sorted(self.registry.task_to_configs[f"{task_suite}|{task_name}|{few_shot}"]) config_str = "|".join([task_config.__str__(lite=True) for task_config in task_configs]) return hashlib.sha256(config_str.encode()).hexdigest()[:16] From 7d32a114bdefc3741db78bf8a422f4fb44fbfad2 Mon Sep 17 00:00:00 2001 From: Jack Morris Date: Fri, 3 Oct 2025 18:06:52 +0000 Subject: [PATCH 2/3] precommit --- src/lighteval/utils/cache_management.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/lighteval/utils/cache_management.py b/src/lighteval/utils/cache_management.py index 2b51775a1..e82b35d03 100644 --- a/src/lighteval/utils/cache_management.py +++ b/src/lighteval/utils/cache_management.py @@ -30,6 +30,7 @@ from typing import Callable, List, Set, Tuple, Union import pandas as pd +import tqdm from datasets import Dataset, load_dataset from lighteval.models.abstract_model import ModelConfig @@ -164,7 +165,9 @@ def _get_task_hash(self, full_task_name: str) -> str: ) return "NO_HASH" task_suite, task_name, few_shot = full_task_name.split("|") - task_configs: list[LightevalTaskConfig] = sorted(self.registry.task_to_configs[f"{task_suite}|{task_name}|{few_shot}"]) + task_configs: list[LightevalTaskConfig] = sorted( + self.registry.task_to_configs[f"{task_suite}|{task_name}|{few_shot}"] + ) config_str = "|".join([task_config.__str__(lite=True) for task_config in task_configs]) return hashlib.sha256(config_str.encode()).hexdigest()[:16] @@ -243,7 +246,7 @@ def get_samples_to_process_and_cache( docs_not_cached = [] tasks_with_cached_samples = set() - for doc in docs: + for doc in tqdm.tqdm(docs): task_id = self.get_task_id(doc.task_name, sampling_method) try: if doc.id in cached_indices[task_id]: From 450149668e548319839a3ce440145354284fc45a Mon Sep 17 00:00:00 2001 From: Jack Morris Date: Fri, 3 Oct 2025 18:09:07 +0000 Subject: [PATCH 3/3] notqdm --- src/lighteval/utils/cache_management.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/lighteval/utils/cache_management.py b/src/lighteval/utils/cache_management.py index e82b35d03..590854955 100644 --- a/src/lighteval/utils/cache_management.py +++ b/src/lighteval/utils/cache_management.py @@ -30,7 +30,6 @@ from typing import Callable, List, Set, Tuple, Union import pandas as pd -import tqdm from datasets import Dataset, load_dataset from lighteval.models.abstract_model import ModelConfig @@ -246,7 +245,7 @@ def get_samples_to_process_and_cache( docs_not_cached = [] tasks_with_cached_samples = set() - for doc in tqdm.tqdm(docs): + for doc in docs: task_id = self.get_task_id(doc.task_name, sampling_method) try: if doc.id in cached_indices[task_id]: