|
12 | 12 | # See the License for the specific language governing permissions and |
13 | 13 | # limitations under the License. |
14 | 14 |
|
15 | | -from dataclasses import dataclass |
16 | 15 | from functools import partial |
17 | 16 | from typing import Any, Callable, Tuple |
18 | 17 |
|
|
121 | 120 | "TinyIMDB": (setup_tiny_imdb_dataset, "text_generation_collate", {}), |
122 | 121 | "VBench": (setup_vbench_dataset, "prompt_with_auxiliaries_collate", {}), |
123 | 122 | } |
124 | | - |
125 | | - |
@dataclass
class Benchmark:
    """
    Descriptive metadata attached to a single benchmark dataset.

    Attributes
    ----------
    name : str
        Internal identifier for the benchmark.
    display_name : str
        Human-readable name for display purposes.
    description : str
        Description of what the benchmark evaluates.
    metrics : list[str]
        List of metric names used for evaluation.
    task_type : str
        Type of task the benchmark evaluates (e.g., 'text_to_image').
    """

    name: str
    display_name: str
    description: str
    metrics: list[str]
    task_type: str
151 | | - |
# Registry of benchmark metadata, keyed by the same public dataset names used
# in the dataset-setup table earlier in this module. Metrics that appear only
# in comments below are defined by the upstream benchmark but are not
# supported in Pruna, so they are intentionally excluded from `metrics`.
benchmark_info: dict[str, Benchmark] = {
    "PartiPrompts": Benchmark(
        name="parti_prompts",
        display_name="Parti Prompts",
        description=(
            "Holistic benchmark from Google Research with over 1,600 English prompts across 12 categories "
            "and 11 challenge aspects. Evaluates text-to-image models on abstract thinking, world knowledge, "
            "perspectives, and symbol rendering from basic to complex compositions."
        ),
        metrics=["arniqa", "clip_score", "clipiqa", "sharpness"],
        task_type="text_to_image",
    ),
    "DrawBench": Benchmark(
        name="drawbench",
        display_name="DrawBench",
        description=(
            "Comprehensive benchmark from the Imagen team for rigorous evaluation of text-to-image models. "
            "Enables side-by-side comparison on sample quality and image-text alignment with human raters."
        ),
        metrics=[
            "clip_score",
            "clipiqa",
            "sharpness",
            # "image_reward" not supported in Pruna
        ],
        task_type="text_to_image",
    ),
    "GenAIBench": Benchmark(
        name="genai_bench",
        display_name="GenAI Bench",
        description=(
            "1,600 prompts from professional designers for compositional text-to-visual generation. "
            "Covers basic skills (scene, attributes, spatial relationships) to advanced reasoning "
            "(counting, comparison, logic/negation) with over 24k human ratings."
        ),
        metrics=[
            "clip_score",
            "clipiqa",
            "sharpness",
            # "vqa" not supported in Pruna
        ],
        task_type="text_to_image",
    ),
    "VBench": Benchmark(
        name="vbench",
        display_name="VBench",
        description=(
            "Comprehensive benchmark suite for video generative models. Decomposes video quality into "
            "16 disentangled dimensions: temporal flickering, motion smoothness, subject consistency, "
            "spatial relationship, color, aesthetic quality, and more."
        ),
        metrics=["clip_score"],
        task_type="text_to_video",
    ),
    "GenEval": Benchmark(
        name="geneval",
        display_name="GenEval",
        description=(
            "Object-focused framework (NeurIPS 2023) for fine-grained text-to-image alignment. "
            "Evaluates compositional properties: object co-occurrence, position, count, and color binding "
            "via instance-level analysis rather than distribution-level metrics."
        ),
        metrics=[
            # "qa_accuracy" not supported in Pruna
        ],
        task_type="text_to_image",
    ),
    "HPS": Benchmark(
        name="hps",
        display_name="HPS",
        description=(
            "Human Preference Score v2: large-scale benchmark with 798k human preference choices on "
            "433k image pairs. CLIP fine-tuned on HPD v2 to predict human preferences and align "
            "evaluation with actual human judgment across diverse generative outputs."
        ),
        metrics=[
            # "hps" not supported in Pruna
        ],
        task_type="text_to_image",
    ),
    "LongTextBench": Benchmark(
        name="long_text_bench",
        display_name="Long Text Bench",
        description=(
            "DetailMaster benchmark with prompts averaging 284.89 tokens. Evaluates four dimensions: "
            "character attributes, structured locations, scene attributes, and spatial relationships "
            "to test compositional reasoning under long prompt complexity."
        ),
        metrics=[
            # "text_score" not supported in Pruna
        ],
        task_type="text_to_image",
    ),
    "ImgEdit": Benchmark(
        name="imgedit",
        display_name="ImgEdit",
        description=(
            "Unified image editing benchmark (PKU-YuanGroup) with 8 edit types: replace, add, remove, "
            "adjust, extract, style, background, compose. Evaluates instruction adherence, editing "
            "quality, and detail preservation."
        ),
        metrics=[
            # "img_edit_score" not supported in Pruna
        ],
        task_type="image_edit",
    ),
    "GEditBench": Benchmark(
        name="gedit_bench",
        display_name="GEdit Bench",
        description=(
            "StepFun benchmark grounded in real-world user instructions. 11 task types including "
            "background_change, subject_add/remove/replace, style_change, and tone_transfer for "
            "practical evaluation of image editing capabilities."
        ),
        metrics=[
            # "viescore" not supported in Pruna
        ],
        task_type="image_edit",
    ),
    "OneIG": Benchmark(
        name="oneig",
        display_name="OneIG",
        description=(
            "Omni-dimensional benchmark (NeurIPS 2025) for nuanced image generation evaluation. "
            "Six categories: Text_Rendering, Anime_Stylization, Portrait, General_Object, "
            "Knowledge_Reasoning, Multilingualism. Addresses text rendering precision and prompt-image alignment."
        ),
        metrics=[
            # "alignment_score", "text_score" not supported in Pruna
        ],
        task_type="text_to_image",
    ),
    "DPG": Benchmark(
        name="dpg",
        display_name="DPG",
        description=(
            "Dense Prompt Graph benchmark from ELLA/Tencent. ~1,000 complex prompts testing "
            "entity, attribute, relation, and global aspects. Evaluates models on dense prompt "
            "following with multiple objects and varied attributes."
        ),
        metrics=[
            # "qa_accuracy" not supported in Pruna
        ],
        task_type="text_to_image",
    ),
    "COCO": Benchmark(
        name="coco",
        display_name="COCO",
        description=(
            "Microsoft COCO dataset for image generation evaluation. Real image-caption pairs "
            "enabling FID and alignment metrics on distribution-level and instance-level quality."
        ),
        metrics=["fid", "clip_score", "clipiqa"],
        task_type="text_to_image",
    ),
    "ImageNet": Benchmark(
        name="imagenet",
        display_name="ImageNet",
        description=(
            "Large-scale image classification benchmark with 1,000 classes. Standard evaluation "
            "for vision model accuracy on object recognition."
        ),
        metrics=["accuracy"],
        task_type="image_classification",
    ),
    "WikiText": Benchmark(
        name="wikitext",
        display_name="WikiText",
        description=(
            "Language modeling benchmark based on Wikipedia articles. Standard evaluation "
            "for text generation quality via perplexity."
        ),
        metrics=["perplexity"],
        task_type="text_generation",
    ),
}
0 commit comments