2 changes: 1 addition & 1 deletion examples/evaluate_bluebench.sh
@@ -28,4 +28,4 @@ unitxt-evaluate \
--batch_size 8 \
--verbosity ERROR

unitxt-summarize ./results/bluebench
unitxt-summarize --folder ./results/bluebench
146 changes: 113 additions & 33 deletions src/unitxt/evaluate_cli.py
@@ -7,7 +7,7 @@
import platform
import subprocess
import sys
from datetime import datetime
from datetime import datetime, timezone
from functools import partial
from typing import Any, Dict, List, Optional, Tuple, Union

@@ -691,9 +691,8 @@ def _save_results_to_disk(
"results": global_scores,
}

# prepend to the results_path name the time in a wat like this: 2025-04-04T11:37:32

timestamp = datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
# prepend the timestamp in UTC (e.g., 2025-01-18T11-37-32) to the file names
timestamp = datetime.now().astimezone(timezone.utc).strftime("%Y-%m-%dT%H-%M-%S")

results_path = prepend_timestamp_to_path(results_path, timestamp)
samples_path = prepend_timestamp_to_path(samples_path, timestamp)
@@ -836,48 +835,129 @@ def main():
logger.info("Unitxt Evaluation CLI finished successfully.")


def extract_scores(directory): # pragma: no cover
def extract_scores(folder: str, subset: str, group: str): # pragma: no cover
import pandas as pd

data = []
def safe_score(d: dict, key="score"):
na = "N/A"
return d.get(key, na) if isinstance(d, dict) else na

for filename in sorted(os.listdir(directory)):
if filename.endswith("evaluation_results.json"):
file_path = os.path.join(directory, filename)
try:
with open(file_path, encoding="utf-8") as f:
content = json.load(f)
def extract_subset(results: dict, subset: str, group: str):
subset_results = results.get(subset, {})
row = {subset: safe_score(subset_results)}

groups = subset_results.get("groups", {})

if not groups:
return row

group_results = groups.get(group) if group else next(iter(groups.values()), {})

env_info = content.get("environment_info", {})
timestamp = env_info.get("timestamp_utc", "N/A")
model = env_info.get("parsed_arguments", {}).get("model", "N/A")
results = content.get("results", {})
if not isinstance(group_results, dict):
return row

row = {}
row["Model"] = model
row["Timestamp"] = timestamp
row["Average"] = results.get("score", "N/A")
row.update(
{k: safe_score(v) for k, v in group_results.items() if isinstance(v, dict)}
)
return row

def extract_all(results: dict):
row = {"Average": safe_score(results)}
row.update(
{k: safe_score(v) for k, v in results.items() if isinstance(v, dict)}
)
return row

data = []

for key in results.keys():
if isinstance(results[key], dict):
score = results[key].get("score", "N/A")
row[key] = score
for filename in sorted(os.listdir(folder)):
if not filename.endswith("evaluation_results.json"):
continue

data.append(row)
except Exception as e:
logger.error(f"Error parsing results file {filename}: {e}.")
file_path = os.path.join(folder, filename)
try:
with open(file_path, encoding="utf-8") as f:
content = json.load(f)

env_info = content.get("environment_info", {})
row = {
"Model": safe_score(env_info.get("parsed_arguments", {}), "model"),
"Timestamp": safe_score(env_info, "timestamp_utc"),
}

results = content.get("results", {})

extra = (
extract_subset(results, subset, group)
if subset
else extract_all(results)
)
row.update(extra)
data.append(row)
except Exception as e:
logger.error(f"Error parsing results file {filename}: {e}.")

return pd.DataFrame(data).sort_values(by="Timestamp", ascending=True)


def setup_summarization_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(
formatter_class=argparse.RawTextHelpFormatter,
description="CLI utility for summarizing evaluation results.",
)

parser.add_argument(
"--folder",
"-f",
dest="folder",
type=str,
default=".",
help="Directory containing evaluation results json files. Default: current folder.\n",
)

parser.add_argument(
"--subset",
"-s",
type=str,
dest="subset",
default=None,
help="Subset to filter results by. Default: none.",
)

parser.add_argument(
"--group",
"-g",
type=str,
dest="group",
default=None,
help="Group to filter results to. Requires specifying a subset. Default: first group.",
)

parser.add_argument(
"--output",
"-o",
type=str,
choices=["markdown", "csv"],
dest="output",
default="markdown",
help="Output format. Can be markdown or csv. Default: markdown",
)

return parser


def summarize_cli():
if len(sys.argv) != 2:
logger.error("Usage: python summarize_cli_results.py <results-directory>")
sys.exit(1)
directory = sys.argv[1]
df = extract_scores(directory)
parser = setup_summarization_parser()
args = parser.parse_args()

logger.info(df.to_markdown(index=False))
df = extract_scores(args.folder, args.subset, args.group)

if args.output == "markdown":
logger.info(df.to_markdown(index=False))
elif args.output == "csv":
logger.info(df.to_csv(index=False))
else:
logger.error(f"Unsupported output format: {args.output}")


if __name__ == "__main__":
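The new extract_scores walks a folder for files whose names end in evaluation_results.json and turns each one into a DataFrame row (Model, Timestamp, then per-subset or per-group scores). The following is a minimal sketch of the results-file layout it appears to expect, inferred from the keys read above, followed by a direct call to the function — roughly what unitxt-summarize --folder <dir> --subset bluebench --group knowledge would print. The subset, group, and task names, the scores, and the model name are invented for illustration only, and the sketch assumes this branch of the package plus pandas and tabulate are installed:

import json
import tempfile
from pathlib import Path

from unitxt.evaluate_cli import extract_scores  # module changed in this PR

# Invented example payload: only the keys that extract_scores reads matter here.
fake_results = {
    "environment_info": {
        "timestamp_utc": "2025-01-18T11:37:32",
        "parsed_arguments": {"model": "my-model"},
    },
    "results": {
        "score": 0.61,            # overall average, used when no --subset is given
        "bluebench": {            # a subset entry, read by extract_subset()
            "score": 0.58,
            "groups": {
                "knowledge": {
                    "mmlu": {"score": 0.55},
                    "arc": {"score": 0.62},
                },
            },
        },
    },
}

with tempfile.TemporaryDirectory() as folder:
    # The filename must end with "evaluation_results.json" to be picked up.
    path = Path(folder, "2025-01-18T11-37-32_evaluation_results.json")
    path.write_text(json.dumps(fake_results), encoding="utf-8")

    # One row per results file: Model, Timestamp, bluebench, mmlu, arc.
    df = extract_scores(folder, subset="bluebench", group="knowledge")
    print(df.to_markdown(index=False))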
10 changes: 8 additions & 2 deletions tests/library/test_cli.py
@@ -751,8 +751,12 @@ def test_save_results_to_disk_summary_only(
# --- Arrange ---
# (Arrange section remains the same as previous version)
mock_timestamp = "2025-04-14T10:00:00"
mock_timestamp_utc = "2025-04-14T08:00:00"
mock_now = MagicMock()
mock_now.strftime.return_value = mock_timestamp
mock_astimezone = MagicMock()
mock_astimezone.strftime.return_value = mock_timestamp_utc
mock_now.astimezone.return_value = mock_astimezone
mock_datetime.now.return_value = mock_now
mock_utcnow = MagicMock()
mock_utcnow.isoformat.return_value = "2025-04-14T08:00:00"
@@ -784,7 +788,9 @@
}
base_results_path = "/out/results_prefix.json"
base_samples_path = "/out/results_prefix_samples.json"
expected_timestamped_results_path = f"/out/{mock_timestamp}_results_prefix.json"
expected_timestamped_results_path = (
f"/out/{mock_timestamp_utc}_results_prefix.json"
)

# --- Act ---
cli._save_results_to_disk(
@@ -844,7 +850,7 @@
)
log_calls = [call[0][0] for call in mock_logger.info.call_args_list]
expected_timestamped_samples_path = (
f"/out/{mock_timestamp}_results_prefix_samples.json"
f"/out/{mock_timestamp_utc}_results_prefix_samples.json"
)
self.assertNotIn(
f"Saving detailed samples to: {expected_timestamped_samples_path}",