Skip to content

Commit 7ebb4cd

Browse files
feat: add GenEval benchmark (#507)
* feat: add benchmark support to PrunaDataModule and implement PartiPrompts benchmark - Introduced `from_benchmark` method in `PrunaDataModule` to create instances from benchmark classes. - Added `Benchmark`, `BenchmarkEntry`, and `BenchmarkRegistry` classes for managing benchmarks. - Implemented `PartiPrompts` benchmark for text-to-image generation with various categories and challenges. - Created utility function `benchmark_to_datasets` to convert benchmarks into datasets compatible with `PrunaDataModule`. - Added integration tests for benchmark functionality and data module interactions. * refactor: simplify benchmark system, extend PartiPrompts with subset filtering - Remove heavy benchmark abstraction (Benchmark class, registry, adapter, 24 subclasses) - Extend setup_parti_prompts_dataset with category and num_samples params - Add BenchmarkInfo dataclass for metadata (metrics, description, subsets) - Switch PartiPrompts to prompt_with_auxiliaries_collate to preserve Category/Challenge - Merge tests into test_datamodule.py Reduces 964 lines to 128 lines (87% reduction) Co-authored-by: Cursor <cursoragent@cursor.com> * feat: add GenEval benchmark Add GenEval benchmark for fine-grained compositional evaluation of text-to-image models. Fetches prompts from GitHub and generates questions. - Add setup_geneval_dataset with 6 subcategories - Categories: single_object, two_object, counting, colors, position, color_attr - Generates evaluation questions from metadata - Register in base_datasets with prompt_with_auxiliaries_collate - Add BenchmarkInfo with metrics: ["qa_accuracy"] - Add tests Co-authored-by: Cursor <cursoragent@cursor.com> * fix: add Numpydoc parameter docs for BenchmarkInfo Document all dataclass fields per Numpydoc PR01 with summary on new line per GL01. 
Co-authored-by: Cursor <cursoragent@cursor.com> * feat: add benchmark discovery functions and expand benchmark registry - Add list_benchmarks() to filter benchmarks by task type - Add get_benchmark_info() to retrieve benchmark metadata - Add COCO, ImageNet, WikiText to benchmark_info registry - Fix metric names to match MetricRegistry (clip_score, clipiqa) Co-authored-by: Cursor <cursoragent@cursor.com> * fix: properly check position value before generating question Use None default and check both pos existence and non-empty first element to avoid malformed questions. Co-authored-by: Cursor <cursoragent@cursor.com> * chore: apply ruff format to data module, add lint-before-push script Made-with: Cursor * chore: fix get_literal_values_from_param docstring, add SCOPE to lint script Made-with: Cursor * chore: remove scripts/lint-before-push.sh Made-with: Cursor * chore: align metrics with Pruna, comment unsupported InferBench metrics Made-with: Cursor * fix: remove accuracy from GenEval - qa_accuracy ≠ accuracy Made-with: Cursor * chore: simplify metric comment Made-with: Cursor --------- Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent e9bf9cb commit 7ebb4cd

File tree

4 files changed

+276
-6
lines changed

4 files changed

+276
-6
lines changed

src/pruna/data/__init__.py

Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
from dataclasses import dataclass, field
1516
from functools import partial
1617
from typing import Any, Callable, Tuple
1718

@@ -28,6 +29,7 @@
2829
from pruna.data.datasets.prompt import (
2930
setup_drawbench_dataset,
3031
setup_genai_bench_dataset,
32+
setup_geneval_dataset,
3133
setup_parti_prompts_dataset,
3234
)
3335
from pruna.data.datasets.question_answering import setup_polyglot_dataset
@@ -50,6 +52,7 @@
5052

5153
BENCHMARK_CATEGORY_CONFIG: dict[str, tuple[str, list[str]]] = {
5254
"PartiPrompts": ("Animals", ["Category", "Challenge"]),
55+
"GenEval": ("counting", ["tag"]),
5356
}
5457

5558
base_datasets: dict[str, Tuple[Callable, str, dict[str, Any]]] = {
@@ -107,6 +110,186 @@
107110
{},
108111
),
109112
"GenAIBench": (setup_genai_bench_dataset, "prompt_collate", {}),
113+
"GenEval": (setup_geneval_dataset, "prompt_with_auxiliaries_collate", {}),
110114
"TinyIMDB": (setup_tiny_imdb_dataset, "text_generation_collate", {}),
111115
"VBench": (setup_vbench_dataset, "prompt_with_auxiliaries_collate", {}),
112116
}
117+
118+
119+
@dataclass
class BenchmarkInfo:
    """
    Metadata for a benchmark dataset.

    Parameters
    ----------
    name : str
        Internal identifier for the benchmark.
    display_name : str
        Human-readable name for display purposes.
    description : str
        Description of what the benchmark evaluates.
    metrics : list[str]
        List of metric names used for evaluation.
    task_type : str
        Type of task the benchmark evaluates (e.g., 'text_to_image').
    subsets : list[str]
        Optional list of benchmark subset names.
    """

    name: str
    display_name: str
    description: str
    metrics: list[str]
    task_type: str
    # default_factory avoids the shared-mutable-default pitfall for the list field
    subsets: list[str] = field(default_factory=list)
146+
147+
148+
# Registry of benchmark metadata, keyed by the same names used in `base_datasets`.
# Commented-out metric names mark metrics used by the upstream benchmark but not
# supported by Pruna's MetricRegistry.
benchmark_info: dict[str, BenchmarkInfo] = {
    "PartiPrompts": BenchmarkInfo(
        name="parti_prompts",
        display_name="Parti Prompts",
        description=(
            "Over 1,600 diverse English prompts across 12 categories with 11 challenge aspects "
            "ranging from basic to complex, enabling comprehensive assessment of model capabilities "
            "across different domains and difficulty levels."
        ),
        metrics=["arniqa", "clip_score", "clipiqa", "sharpness"],
        task_type="text_to_image",
        # First 12 entries are Categories, the remaining 11 are Challenge aspects.
        subsets=[
            "Abstract",
            "Animals",
            "Artifacts",
            "Arts",
            "Food & Beverage",
            "Illustrations",
            "Indoor Scenes",
            "Outdoor Scenes",
            "People",
            "Produce & Plants",
            "Vehicles",
            "World Knowledge",
            "Basic",
            "Complex",
            "Fine-grained Detail",
            "Imagination",
            "Linguistic Structures",
            "Perspective",
            "Properties & Positioning",
            "Quantity",
            "Simple Detail",
            "Style & Format",
            "Writing & Symbols",
        ],
    ),
    "DrawBench": BenchmarkInfo(
        name="drawbench",
        display_name="DrawBench",
        description="A comprehensive benchmark for evaluating text-to-image generation models.",
        metrics=[
            "clip_score",
            "clipiqa",
            "sharpness",
            # "image_reward" not supported in Pruna
        ],
        task_type="text_to_image",
    ),
    "GenAIBench": BenchmarkInfo(
        name="genai_bench",
        display_name="GenAI Bench",
        description="A benchmark for evaluating generative AI models.",
        metrics=[
            "clip_score",
            "clipiqa",
            "sharpness",
            # "vqa" not supported in Pruna
        ],
        task_type="text_to_video",
    ) if False else BenchmarkInfo(
        name="genai_bench",
        display_name="GenAI Bench",
        description="A benchmark for evaluating generative AI models.",
        metrics=[
            "clip_score",
            "clipiqa",
            "sharpness",
            # "vqa" not supported in Pruna
        ],
        task_type="text_to_image",
    ),
    "VBench": BenchmarkInfo(
        name="vbench",
        display_name="VBench",
        description="A benchmark for evaluating video generation models.",
        metrics=["clip_score"],
        task_type="text_to_video",
    ),
    "GenEval": BenchmarkInfo(
        name="geneval",
        display_name="GenEval",
        description=(
            "Fine-grained compositional evaluation across object co-occurrence, positioning, "
            "counting, and color binding to identify specific failure modes in text-to-image alignment."
        ),
        metrics=[
            # "qa_accuracy" not supported in Pruna
        ],
        task_type="text_to_image",
        subsets=["single_object", "two_object", "counting", "colors", "position", "color_attr"],
    ),
    "COCO": BenchmarkInfo(
        name="coco",
        display_name="COCO",
        description="Microsoft COCO dataset for image generation evaluation with real image-caption pairs.",
        metrics=["fid", "clip_score", "clipiqa"],
        task_type="text_to_image",
    ),
    "ImageNet": BenchmarkInfo(
        name="imagenet",
        display_name="ImageNet",
        description="Large-scale image classification benchmark with 1000 classes.",
        metrics=["accuracy"],
        task_type="image_classification",
    ),
    "WikiText": BenchmarkInfo(
        name="wikitext",
        display_name="WikiText",
        description="Language modeling benchmark based on Wikipedia articles.",
        metrics=["perplexity"],
        task_type="text_generation",
    ),
}
251+
252+
253+
def list_benchmarks(task_type: str | None = None) -> list[str]:
    """
    List available benchmark names.

    Parameters
    ----------
    task_type : str | None
        Filter by task type (e.g., 'text_to_image', 'text_to_video').
        If None, returns all benchmarks.

    Returns
    -------
    list[str]
        List of benchmark names.
    """
    # A single pass handles both the unfiltered and the filtered case.
    matching = []
    for benchmark_name, info in benchmark_info.items():
        if task_type is None or info.task_type == task_type:
            matching.append(benchmark_name)
    return matching
271+
272+
273+
def get_benchmark_info(name: str) -> BenchmarkInfo:
    """
    Get benchmark metadata by name.

    Parameters
    ----------
    name : str
        The benchmark name.

    Returns
    -------
    BenchmarkInfo
        The benchmark metadata.

    Raises
    ------
    KeyError
        If benchmark name is not found.
    """
    # EAFP: attempt the lookup and translate a miss into a helpful error message.
    try:
        return benchmark_info[name]
    except KeyError:
        available = ", ".join(benchmark_info.keys())
        raise KeyError(f"Benchmark '{name}' not found. Available: {available}") from None

src/pruna/data/datasets/prompt.py

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
from pruna.data.utils import _prepare_test_only_prompt_dataset, define_sample_size_for_dataset
2020
from pruna.logging.logger import pruna_logger
2121

22+
GenEvalCategory = Literal["single_object", "two_object", "counting", "colors", "position", "color_attr"]
23+
2224
PartiCategory = Literal[
2325
"Abstract",
2426
"Animals",
@@ -110,6 +112,91 @@ def setup_parti_prompts_dataset(
110112
return _prepare_test_only_prompt_dataset(ds, seed, "PartiPrompts")
111113

112114

115+
def _generate_geneval_question(entry: dict) -> list[str]:
116+
"""Generate evaluation questions from GenEval metadata."""
117+
tag = entry.get("tag", "")
118+
include = entry.get("include", [])
119+
questions = []
120+
121+
for obj in include:
122+
cls = obj.get("class", "")
123+
if "color" in obj:
124+
questions.append(f"Does the image contain a {obj['color']} {cls}?")
125+
elif "count" in obj:
126+
questions.append(f"Does the image contain exactly {obj['count']} {cls}(s)?")
127+
else:
128+
questions.append(f"Does the image contain a {cls}?")
129+
130+
if tag == "position" and len(include) >= 2:
131+
a_cls = include[0].get("class", "")
132+
b_cls = include[1].get("class", "")
133+
pos = include[1].get("position")
134+
if pos and pos[0]:
135+
questions.append(f"Is the {b_cls} {pos[0]} the {a_cls}?")
136+
137+
return questions
138+
139+
140+
def setup_geneval_dataset(
    seed: int,
    fraction: float = 1.0,
    train_sample_size: int | None = None,
    test_sample_size: int | None = None,
    category: GenEvalCategory | list[GenEvalCategory] | None = None,
) -> Tuple[Dataset, Dataset, Dataset]:
    """
    Setup the GenEval benchmark dataset.

    License: MIT

    Parameters
    ----------
    seed : int
        The seed to use.
    fraction : float
        The fraction of the dataset to use.
    train_sample_size : int | None
        Unused; train/val are dummy.
    test_sample_size : int | None
        The sample size to use for the test dataset.
    category : GenEvalCategory | list[GenEvalCategory] | None
        Filter by category. Available: single_object, two_object, counting, colors, position, color_attr.

    Returns
    -------
    Tuple[Dataset, Dataset, Dataset]
        The GenEval dataset (dummy train, dummy val, test).

    Raises
    ------
    requests.HTTPError
        If the GenEval metadata cannot be fetched from GitHub.
    """
    import json

    import requests

    # Pinned commit so the benchmark content cannot silently change upstream.
    url = "https://raw.githubusercontent.com/djghosh13/geneval/d927da8e42fde2b1b5cd743da4df5ff83c1654ff/prompts/evaluation_metadata.jsonl"
    # timeout prevents hanging forever on network issues; raise_for_status
    # avoids feeding an HTTP error page into the JSONL parser below.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    # Skip blank lines (e.g. a trailing newline) that would crash json.loads.
    data = [json.loads(line) for line in response.text.splitlines() if line.strip()]

    if category is not None:
        categories = [category] if not isinstance(category, list) else category
        data = [entry for entry in data if entry.get("tag") in categories]

    records = []
    for entry in data:
        questions = _generate_geneval_question(entry)
        records.append(
            {
                "text": entry["prompt"],
                "tag": entry.get("tag", ""),
                "questions": questions,
                "include": entry.get("include", []),
            }
        )

    ds = Dataset.from_list(records)
    test_sample_size = define_sample_size_for_dataset(ds, fraction, test_sample_size)
    # Cap the selection at the (possibly category-filtered) dataset length.
    ds = ds.select(range(min(test_sample_size, len(ds))))
    return _prepare_test_only_prompt_dataset(ds, seed, "GenEval")
198+
199+
113200
def setup_genai_bench_dataset(seed: int) -> Tuple[Dataset, Dataset, Dataset]:
114201
"""
115202
Setup the GenAI Bench dataset.

src/pruna/data/datasets/text_generation.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,7 @@ def setup_wikitext_dataset() -> Tuple[Dataset, Dataset, Dataset]:
3333
The WikiText dataset.
3434
"""
3535
train_dataset, val_dataset, test_dataset = load_dataset(
36-
path="mikasenghaas/wikitext-2",
37-
split=["train", "validation", "test"]
36+
path="mikasenghaas/wikitext-2", split=["train", "validation", "test"]
3837
)
3938
return train_dataset, val_dataset, test_dataset # type: ignore[return-value]
4039

@@ -57,15 +56,15 @@ def setup_wikitext_tiny_dataset(seed: int = 42, num_rows: int = 960) -> Tuple[Da
5756
Tuple[Dataset, Dataset, Dataset]
5857
The TinyWikiText dataset split .8/.1/.1 into train/val/test subsets, respectively.
5958
"""
60-
assert 10 <= num_rows < 1000, 'the total number of rows, r, for the tiny wikitext dataset must be 10 <= r < 1000'
59+
assert 10 <= num_rows < 1000, "the total number of rows, r, for the tiny wikitext dataset must be 10 <= r < 1000"
6160

6261
# load the 'mikasenghaas/wikitext-2' dataset with a total of 21,580 rows using the setup_wikitext_dataset() function
6362
train_ds, val_ds, test_ds = setup_wikitext_dataset()
6463

6564
# assert the wikitext dataset train/val/test splits each have enough rows for reducing to .8/.1/.1, respectively
66-
assert train_ds.num_rows >= int(num_rows * 0.8), f'wikitext cannot be reduced to {num_rows} rows, train too small'
67-
assert val_ds.num_rows >= int(num_rows * 0.1), f'wikitext cannot be reduced to {num_rows} rows, val too small'
68-
assert test_ds.num_rows >= int(num_rows * 0.1), f'wikitext cannot be reduced to {num_rows} rows, test too small'
65+
assert train_ds.num_rows >= int(num_rows * 0.8), f"wikitext cannot be reduced to {num_rows} rows, train too small"
66+
assert val_ds.num_rows >= int(num_rows * 0.1), f"wikitext cannot be reduced to {num_rows} rows, val too small"
67+
assert test_ds.num_rows >= int(num_rows * 0.1), f"wikitext cannot be reduced to {num_rows} rows, test too small"
6968

7069
# randomly select from the wikitext dataset a total number of rows below 1000 split .8/.1/.1 between train/val/test
7170
train_dataset_tiny = train_ds.shuffle(seed=seed).select(range(int(num_rows * 0.8)))

tests/data/test_datamodule.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ def iterate_dataloaders(datamodule: PrunaDataModule) -> None:
4545
pytest.param("GenAIBench", dict(), marks=pytest.mark.slow),
4646
pytest.param("TinyIMDB", dict(tokenizer=bert_tokenizer), marks=pytest.mark.slow),
4747
pytest.param("VBench", dict(), marks=pytest.mark.slow),
48+
pytest.param("GenEval", dict(), marks=pytest.mark.slow),
4849
],
4950
)
5051
def test_dm_from_string(dataset_name: str, collate_fn_args: dict[str, Any]) -> None:

0 commit comments

Comments
 (0)