74 changes: 74 additions & 0 deletions metrics/clip_score/README.md
@@ -0,0 +1,74 @@
---
title: CLIP Score
tags:
- evaluate
- metric
description: "CLIPScore is a reference-free evaluation metric for image captioning that measures the alignment between images and their corresponding text descriptions."
sdk: gradio
sdk_version: 5.45.0
app_file: app.py
pinned: false
---

# Metric Card for CLIP Score

## Metric Description

CLIPScore is a reference-free evaluation metric for image captioning that measures the alignment between images and their corresponding text descriptions. It leverages the CLIP (Contrastive Language-Image Pretraining) model to compute a similarity score between the visual and textual modalities.
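
For intuition, here is a minimal sketch of the underlying computation using the `transformers` CLIP API. The checkpoint matches the one this module loads (see `clip_score.py` below); the image path and caption are placeholders, and the snippet illustrates the scoring rule rather than the module's exact code path:

```python
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

# Same checkpoint as this module (see clip_score.py below).
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

image = Image.open("cat.jpg")  # placeholder local file
inputs = processor(
    text=["A cat sitting on a windowsill."],
    images=image,
    return_tensors="pt",
    padding=True,
)

with torch.no_grad():
    image_emb = model.get_image_features(pixel_values=inputs["pixel_values"])
    text_emb = model.get_text_features(
        input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"]
    )

# L2-normalize so the dot product of a pair is its cosine similarity.
image_emb = image_emb / image_emb.norm(dim=-1, keepdim=True)
text_emb = text_emb / text_emb.norm(dim=-1, keepdim=True)
score = (image_emb * text_emb).sum(dim=-1).mean().item()
print(score)
```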

## How to Use

To use the CLIPScore metric, provide a list of text predictions and a list of images. For each image-text pair, the metric embeds both inputs with CLIP and computes the cosine similarity between the two embeddings; the reported score is the average over all pairs.

### Inputs

- **predictions** *(list of str)*: The text predictions to score. Each prediction should be a string.
- **references** *(list of PIL.Image.Image)*: The images to score against. Each reference should be a PIL image.

### Output Values

The CLIPScore metric outputs a dictionary with a single key-value pair:

- **clip_score** *(float)*: The average CLIPScore across all provided image-text pairs. Because the score is a cosine similarity, it ranges from -1 to 1, with higher values indicating better alignment between image and text. Note that this implementation returns the raw mean cosine similarity; it does not apply the 2.5 · max(cos, 0) rescaling used for CLIP-S in the original paper.

### Examples

```python
from PIL import Image
import evaluate

metric = evaluate.load("sunhill/clip_score")
predictions = ["A cat sitting on a windowsill.", "A dog playing with a ball."]
references = [Image.open("cat.jpg"), Image.open("dog.jpg")]
results = metric.compute(predictions=predictions, references=references)
print(results)
# Example output: {'clip_score': 0.26} (illustrative; the actual value depends on the inputs)
```
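
Note that all pairs are embedded in a single forward pass, so very large batches can exhaust memory; splitting the inputs into equal-sized chunks and averaging the per-chunk scores yields the same result.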

## Citation

```bibtex
@article{DBLP:journals/corr/abs-2104-08718,
author = {Jack Hessel and
Ari Holtzman and
Maxwell Forbes and
Ronan Le Bras and
Yejin Choi},
title = {CLIPScore: {A} Reference-free Evaluation Metric for Image Captioning},
journal = {CoRR},
volume = {abs/2104.08718},
year = {2021},
url = {https://arxiv.org/abs/2104.08718},
eprinttype = {arXiv},
eprint = {2104.08718},
timestamp = {Sat, 29 Apr 2023 10:09:27 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2104-08718.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
```

## Further References

- [clip-score](https://github.com/Taited/clip-score)
40 changes: 40 additions & 0 deletions metrics/clip_score/app.py
@@ -0,0 +1,40 @@
import sys
from pathlib import Path

import gradio as gr

import evaluate
from evaluate import parse_readme


metric = evaluate.load("sunhill/clip_score")


def compute_clip_score(image, text):
    # The metric expects batched inputs, so wrap the single pair in lists.
    results = metric.compute(predictions=[text], references=[image])
    return results["clip_score"]


iface = gr.Interface(
fn=compute_clip_score,
inputs=[
gr.Image(type="pil"),
gr.Textbox(lines=2, placeholder="Enter text here..."),
],
outputs=gr.Number(label="CLIP Score"),
title="CLIP Score Evaluator",
description="Evaluate the alignment between an image and a text using CLIP Score.",
examples=[
[
"https://images.unsplash.com/photo-1720539222585-346e73f01536",
"A cat sitting on a couch",
],
[
"https://images.unsplash.com/photo-1694253987647-4eebcf679974",
"A scenic view of mountains during sunset",
],
],
article=parse_readme(Path(sys.path[0]) / "README.md"),
)

iface.launch()
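# Run locally with `python app.py`; Gradio serves the interface on http://localhost:7860 by default.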
109 changes: 109 additions & 0 deletions metrics/clip_score/clip_score.py
@@ -0,0 +1,109 @@
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This module calculates CLIPScore, a reference-free evaluation metric for image captioning."""

import datasets
from transformers import AutoTokenizer, CLIPModel, CLIPProcessor

import evaluate
from evaluate.utils.logging import get_logger


logger = get_logger(__name__)

_CITATION = """\
@article{DBLP:journals/corr/abs-2104-08718,
author = {Jack Hessel and
Ari Holtzman and
Maxwell Forbes and
Ronan Le Bras and
Yejin Choi},
title = {CLIPScore: {A} Reference-free Evaluation Metric for Image Captioning},
journal = {CoRR},
volume = {abs/2104.08718},
year = {2021},
url = {https://arxiv.org/abs/2104.08718},
eprinttype = {arXiv},
eprint = {2104.08718},
timestamp = {Sat, 29 Apr 2023 10:09:27 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2104-08718.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
"""

_DESCRIPTION = """\
This module calculates CLIPScore, a reference-free evaluation metric for image captioning.
"""


_KWARGS_DESCRIPTION = """
Computes CLIPScore to evaluate the alignment between an image and a text.
Args:
predictions: list of text predictions to score. Each prediction
should be a string.
references: list of images to score against. Each image should be a PIL image.
Returns:
clip_score: CLIPScore between the image and the text.
Examples:
>>> metric = evaluate.load("sunhill/clip_score")
>>> results = metric.compute(predictions=["A cat sitting on a couch."], references=[PIL_image])
>>> print(results)
{'clip_score': 0.2076}
"""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class CLIPScore(evaluate.Metric):
"""CLIPScore metric."""

def _info(self):
return evaluate.MetricInfo(
# This is the description that will appear on the modules page.
module_type="metric",
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
# This defines the format of each prediction and reference
features=datasets.Features(
{
"predictions": datasets.Value("string"),
"references": datasets.Image(),
}
),
# Homepage of the module for documentation
homepage="https://huggingface.co/spaces/sunhill/clip_score",
# Additional links to the codebase or references
codebase_urls=["https://github.com/Taited/clip-score"],
reference_urls=["https://arxiv.org/abs/2104.08718"],
)

def _download_and_prepare(self, dl_manager):
"""Optional: download external resources useful to compute the scores"""
logger.info("Downloading and preparing CLIP ViT-B/32 model...")
self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
self.tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

    def _compute(self, predictions, references):
        """Returns the average CLIPScore over all image-text pairs."""
        # Preprocess the images and tokenize the texts for CLIP.
        refer = self.processor(text=None, images=references, return_tensors="pt", padding=True)
        pred = self.tokenizer(predictions, return_tensors="pt", padding=True)

        # Embed both modalities into CLIP's shared embedding space.
        refer_features = self.model.get_image_features(**refer)
        pred_features = self.model.get_text_features(**pred)

        # L2-normalize so that the dot product of a pair equals its cosine similarity.
        refer_features = refer_features / refer_features.norm(dim=1, keepdim=True)
        pred_features = pred_features / pred_features.norm(dim=1, keepdim=True)

        # Sum the per-pair dot products, then average over the batch.
        # Note: this is the raw cosine similarity, without the paper's 2.5 * max(cos, 0) rescaling.
        clip_score = (refer_features * pred_features).sum().item()
        return {"clip_score": clip_score / refer_features.shape[0]}
2 changes: 2 additions & 0 deletions metrics/clip_score/requirements.txt
@@ -0,0 +1,2 @@
git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
transformers
2 changes: 2 additions & 0 deletions setup.py
@@ -31,6 +31,8 @@
REQUIRED_PKGS = [
# We need datasets as a backend
"datasets>=2.0.0",
# CLIP Score metric needs transformers
"transformers",
# We use numpy>=1.17 to have np.random.Generator (Dataset shuffling)
"numpy>=1.17",
# For smart caching dataset processing