|
12 | 12 | # See the License for the specific language governing permissions and |
13 | 13 | # limitations under the License. |
14 | 14 |
|
15 | | -from dataclasses import dataclass |
16 | 15 | from functools import partial |
17 | 16 | from typing import Any, Callable, Tuple |
18 | 17 |
|
|
121 | 120 | "TinyIMDB": (setup_tiny_imdb_dataset, "text_generation_collate", {}), |
122 | 121 | "VBench": (setup_vbench_dataset, "prompt_with_auxiliaries_collate", {}), |
123 | 122 | } |
124 | | - |
125 | | - |
@dataclass
class Benchmark:
    """
    Descriptive metadata attached to a single benchmark dataset.

    Attributes
    ----------
    name : str
        Internal identifier for the benchmark.
    display_name : str
        Human-readable name for display purposes.
    description : str
        Description of what the benchmark evaluates.
    metrics : list[str]
        List of metric names used for evaluation.
    task_type : str
        Type of task the benchmark evaluates (e.g., 'text_to_image').
    """

    name: str
    display_name: str
    description: str
    metrics: list[str]
    task_type: str
151 | | - |
# Registry of benchmark metadata, keyed by the same public dataset names used
# in the dataset-setup table earlier in this module. Metrics that appear only
# in comments below are defined by the upstream benchmark but are not
# supported in Pruna, so they are intentionally excluded from `metrics`.
benchmark_info: dict[str, Benchmark] = {
    "PartiPrompts": Benchmark(
        name="parti_prompts",
        display_name="Parti Prompts",
        description=(
            "Holistic benchmark from Google Research with over 1,600 English prompts across 12 categories "
            "and 11 challenge aspects. Evaluates text-to-image models on abstract thinking, world knowledge, "
            "perspectives, and symbol rendering from basic to complex compositions."
        ),
        metrics=["arniqa", "clip_score", "clipiqa", "sharpness"],
        task_type="text_to_image",
    ),
    "DrawBench": Benchmark(
        name="drawbench",
        display_name="DrawBench",
        description=(
            "Comprehensive benchmark from the Imagen team for rigorous evaluation of text-to-image models. "
            "Enables side-by-side comparison on sample quality and image-text alignment with human raters."
        ),
        metrics=[
            "clip_score",
            "clipiqa",
            "sharpness",
            # "image_reward" not supported in Pruna
        ],
        task_type="text_to_image",
    ),
    "GenAIBench": Benchmark(
        name="genai_bench",
        display_name="GenAI Bench",
        description=(
            "1,600 prompts from professional designers for compositional text-to-visual generation. "
            "Covers basic skills (scene, attributes, spatial relationships) to advanced reasoning "
            "(counting, comparison, logic/negation) with over 24k human ratings."
        ),
        metrics=[
            "clip_score",
            "clipiqa",
            "sharpness",
            # "vqa" not supported in Pruna
        ],
        task_type="text_to_image",
    ),
    "VBench": Benchmark(
        name="vbench",
        display_name="VBench",
        description=(
            "Comprehensive benchmark suite for video generative models. Decomposes video quality into "
            "16 disentangled dimensions: temporal flickering, motion smoothness, subject consistency, "
            "spatial relationship, color, aesthetic quality, and more."
        ),
        metrics=["clip_score"],
        task_type="text_to_video",
    ),
    "GenEval": Benchmark(
        name="geneval",
        display_name="GenEval",
        description=(
            "Object-focused framework (NeurIPS 2023) for fine-grained text-to-image alignment. "
            "Evaluates compositional properties: object co-occurrence, position, count, and color binding "
            "via instance-level analysis rather than distribution-level metrics."
        ),
        metrics=[
            # "qa_accuracy" not supported in Pruna
        ],
        task_type="text_to_image",
    ),
    "HPS": Benchmark(
        name="hps",
        display_name="HPS",
        description=(
            "Human Preference Score v2: large-scale benchmark with 798k human preference choices on "
            "433k image pairs. CLIP fine-tuned on HPD v2 to predict human preferences and align "
            "evaluation with actual human judgment across diverse generative outputs."
        ),
        metrics=[
            # "hps" not supported in Pruna
        ],
        task_type="text_to_image",
    ),
    "LongTextBench": Benchmark(
        name="long_text_bench",
        display_name="Long Text Bench",
        description=(
            "DetailMaster benchmark with prompts averaging 284.89 tokens. Evaluates four dimensions: "
            "character attributes, structured locations, scene attributes, and spatial relationships "
            "to test compositional reasoning under long prompt complexity."
        ),
        metrics=[
            # "text_score" not supported in Pruna
        ],
        task_type="text_to_image",
    ),
    "ImgEdit": Benchmark(
        name="imgedit",
        display_name="ImgEdit",
        description=(
            "Unified image editing benchmark (PKU-YuanGroup) with 8 edit types: replace, add, remove, "
            "adjust, extract, style, background, compose. Evaluates instruction adherence, editing "
            "quality, and detail preservation."
        ),
        metrics=[
            # "img_edit_score" not supported in Pruna
        ],
        task_type="image_edit",
    ),
    "GEditBench": Benchmark(
        name="gedit_bench",
        display_name="GEdit Bench",
        description=(
            "StepFun benchmark grounded in real-world user instructions. 11 task types including "
            "background_change, subject_add/remove/replace, style_change, and tone_transfer for "
            "practical evaluation of image editing capabilities."
        ),
        metrics=[
            # "viescore" not supported in Pruna
        ],
        task_type="image_edit",
    ),
    "OneIG": Benchmark(
        name="oneig",
        display_name="OneIG",
        description=(
            "Omni-dimensional benchmark (NeurIPS 2025) for nuanced image generation evaluation. "
            "Six categories: Text_Rendering, Anime_Stylization, Portrait, General_Object, "
            "Knowledge_Reasoning, Multilingualism. Addresses text rendering precision and prompt-image alignment."
        ),
        metrics=[
            # "alignment_score", "text_score" not supported in Pruna
        ],
        task_type="text_to_image",
    ),
    "DPG": Benchmark(
        name="dpg",
        display_name="DPG",
        description=(
            "Dense Prompt Graph benchmark from ELLA/Tencent. ~1,000 complex prompts testing "
            "entity, attribute, relation, and global aspects. Evaluates models on dense prompt "
            "following with multiple objects and varied attributes."
        ),
        metrics=[
            # "qa_accuracy" not supported in Pruna
        ],
        task_type="text_to_image",
    ),
    "COCO": Benchmark(
        name="coco",
        display_name="COCO",
        description=(
            "Microsoft COCO dataset for image generation evaluation. Real image-caption pairs "
            "enabling FID and alignment metrics on distribution-level and instance-level quality."
        ),
        metrics=["fid", "clip_score", "clipiqa"],
        task_type="text_to_image",
    ),
    "ImageNet": Benchmark(
        name="imagenet",
        display_name="ImageNet",
        description=(
            "Large-scale image classification benchmark with 1,000 classes. Standard evaluation "
            "for vision model accuracy on object recognition."
        ),
        metrics=["accuracy"],
        task_type="image_classification",
    ),
    "WikiText": Benchmark(
        name="wikitext",
        display_name="WikiText",
        description=(
            "Language modeling benchmark based on Wikipedia articles. Standard evaluation "
            "for text generation quality via perplexity."
        ),
        metrics=["perplexity"],
        task_type="text_generation",
    ),
}
0 commit comments