Changes from 2 commits
6 changes: 4 additions & 2 deletions src/lighteval/main_tasks.py
@@ -46,15 +46,17 @@ def inspect(

from lighteval.tasks.registry import Registry

registry = Registry(custom_tasks=custom_tasks, load_community=True, load_extended=True, load_multilingual=True)
registry = Registry(
tasks=tasks, custom_tasks=custom_tasks, load_community=True, load_extended=True, load_multilingual=True
)

# Loading task
task_dict = registry.load_tasks()
for name, task in task_dict.items():
print("-" * 10, name, "-" * 10)
if show_config:
print("-" * 10, "CONFIG")
task.cfg.print()
task.config.print()
for ix, sample in enumerate(task.eval_docs()[: int(num_samples)]):
if ix == 0:
print("-" * 10, "SAMPLES")
2 changes: 1 addition & 1 deletion src/lighteval/metrics/metrics_corpus.py
@@ -47,7 +47,7 @@

class CorpusLevelComputation(ABC):
@abstractmethod
def compute_corpus(self):
def compute_corpus(self, items):
raise NotImplementedError

def __str__(self):
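For illustration, a minimal sketch of a concrete subclass under the new signature; `MeanAggregation` is a hypothetical example, not part of this PR:

    from lighteval.metrics.metrics_corpus import CorpusLevelComputation


    class MeanAggregation(CorpusLevelComputation):
        """Hypothetical aggregator that averages the collected sample-level scores."""

        def compute_corpus(self, items):
            # `items` holds the per-sample results gathered across the corpus.
            return sum(items) / len(items) if items else 0.0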
2 changes: 1 addition & 1 deletion src/lighteval/metrics/metrics_sample.py
@@ -63,7 +63,7 @@

class SampleLevelComputation(ABC):
@abstractmethod
def compute(self, model_response: ModelResponse, doc: Doc, **kwargs):
def compute(self, doc: Doc, model_response: ModelResponse, **kwargs):
raise NotImplementedError

def __str__(self):
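A hedged sketch of what the reordered signature means for implementers; the `ResponseLength` class and the `model_response.text` attribute are assumptions for illustration, not part of this PR:

    from lighteval.metrics.metrics_sample import SampleLevelComputation
    from lighteval.models.model_output import ModelResponse
    from lighteval.tasks.requests import Doc


    class ResponseLength(SampleLevelComputation):
        """Hypothetical per-sample metric: length of the first generation."""

        def compute(self, doc: Doc, model_response: ModelResponse, **kwargs):
            # The doc now comes first, then the model response, matching the ABC.
            return len(model_response.text[0]) if model_response.text else 0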
2 changes: 1 addition & 1 deletion src/lighteval/tasks/extended/ifbench/instructions.py
@@ -788,7 +788,7 @@ def check_following(self, value):
"""Checks if the response only includes words with prime length."""
value = value.translate(str.maketrans("", "", string.punctuation))
words = value.split()
primes = set(2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97)
primes = {2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97}
for word in words:
if len(word) not in primes:
return False
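This is more than a style change: `set()` accepts at most one iterable argument, so the old call raised a TypeError as soon as the check ran, while the set literal builds the intended prime lookup table.

    set(2, 3, 5)    # raises TypeError (set takes a single iterable, not separate elements)
    set([2, 3, 5])  # {2, 3, 5}
    {2, 3, 5}       # {2, 3, 5} -- the literal form used in the fix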
61 changes: 30 additions & 31 deletions src/lighteval/tasks/extended/tiny_benchmarks/main.py
@@ -32,16 +32,16 @@

import numpy as np
import requests
from aenum import extend_enum
from scipy.optimize import minimize

import lighteval.tasks.default_prompts as prompt
from lighteval.metrics.metrics import CorpusLevelMetricGrouping, Metrics
from lighteval.metrics.metrics import CorpusLevelMetricGrouping
from lighteval.metrics.metrics_corpus import CorpusLevelComputation
from lighteval.metrics.metrics_sample import ExactMatches, LoglikelihoodAcc, SampleLevelComputation
from lighteval.metrics.normalizations import gsm8k_normalizer
from lighteval.models.model_output import ModelResponse
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import SamplingMethod
from lighteval.tasks.requests import Doc, SamplingMethod


# Utility functions
@@ -101,18 +101,18 @@ def download(self):
with open(path_dld, "wb") as file:
file.write(response.content)

def compute(self, **args):
def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> float:
if self.task == "gsm8k":
res = ExactMatches(
strip_strings=True, normalize_pred=gsm8k_normalizer, normalize_gold=gsm8k_normalizer
).compute(**args)
).compute(doc, model_response, **kwargs)
return dict.fromkeys(self.METRICS, res)
else:
res = LoglikelihoodAcc().compute(**args)
res = LoglikelihoodAcc().compute(doc, model_response, **kwargs)
return dict.fromkeys(self.METRICS, res)

def compute_corpus(self, y_input):
if len(y_input) == self.num_samples and self.estimates is not None:
def compute_corpus(self, items):
if len(items) == self.num_samples and self.estimates is not None:
return self.estimates[self.task]

# We load the weights for the relevant examples
@@ -149,7 +149,7 @@ def compute_corpus(self, y_input):
# Creating vector y and estimating theta
y = np.zeros(N)
for i, j in enumerate(seen_examples):
y[j] = y_input[i]
y[j] = items[i]

# Getting estimates
theta = fit_theta(y, seen_examples, A, B)
@@ -175,7 +175,7 @@ def compute_corpus(self, y_input):
estimates[scenario]["pirt"] = IRTp
estimates[scenario]["gpirt"] = IRTpp

self.num_samples = len(y_input)
self.num_samples = len(items)
self.estimates = estimates

return estimates[self.task]
@@ -238,6 +238,25 @@ def compute_corpus(self, y_input):
# },
]

metrics = {}

for task_param in task_params:
name = task_param["name"]
if name == "gsm8k":
category = SamplingMethod.GENERATIVE
else:
category = SamplingMethod.LOGPROBS

metrics[f"tinybench_metric_{name}"] = (
CorpusLevelMetricGrouping(
metric_name=TinyCorpusAggregator.METRICS,
higher_is_better=dict.fromkeys(TinyCorpusAggregator.METRICS, True),
sample_level_fn=TinyCorpusAggregator(name),
category=category,
corpus_level_fn=TinyCorpusAggregator(name),
),
)

TASKS_TABLE = []
for task in task_params:
name = task["name"]
@@ -256,28 +275,8 @@ def compute_corpus(self, y_input):
evaluation_splits=task["evaluation_split"],
few_shots_split=None,
few_shots_select="random_sampling",
metrics=[f"tinybench_metric_{name}"],
metrics=metrics[f"tinybench_metric_{name}"],
generation_size=generation_size,
stop_sequence=stop_sequence,
)
TASKS_TABLE.append(task)

# CUSTOM METRIC
for task_param in task_params:
name = task_param["name"]
if name == "gsm8k":
category = SamplingMethod.GENERATIVE
else:
category = SamplingMethod.LOGPROBS

extend_enum(
Metrics,
f"tinybench_metric_{name}",
CorpusLevelMetricGrouping(
metric_name=TinyCorpusAggregator.METRICS,
higher_is_better=dict.fromkeys(TinyCorpusAggregator.METRICS, True),
sample_level_fn=TinyCorpusAggregator(name),
category=category,
corpus_level_fn=TinyCorpusAggregator(name),
),
)
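Note that each `TinyCorpusAggregator(name)` instance is passed to `CorpusLevelMetricGrouping` twice, as both `sample_level_fn` and `corpus_level_fn`; this works because the class implements both `compute` (per sample) and `compute_corpus` (over the collected sample scores). A rough sketch of that two-step flow, where `doc`, `model_response`, and `sample_scores` stand in for objects produced by the evaluation loop:

    agg = TinyCorpusAggregator("gsm8k")  # one of the scenarios defined in task_params

    # Per-sample step: returns a dict keyed by TinyCorpusAggregator.METRICS.
    per_sample = agg.compute(doc, model_response)

    # Corpus step: consumes the collected per-sample scores and returns the IRT estimate.
    estimate = agg.compute_corpus(sample_scores)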