diff --git a/metrics/clip_score/README.md b/metrics/clip_score/README.md
new file mode 100644
index 00000000..2f4fb068
--- /dev/null
+++ b/metrics/clip_score/README.md
@@ -0,0 +1,74 @@
+---
+title: CLIP Score
+tags:
+- evaluate
+- metric
+description: "CLIPScore is a reference-free evaluation metric for image captioning that measures the alignment between images and their corresponding text descriptions."
+sdk: gradio
+sdk_version: 5.45.0
+app_file: app.py
+pinned: false
+---
+
+# Metric Card for CLIP Score
+
+*This module calculates CLIPScore, a reference-free evaluation metric for image captioning.*
+
+## Metric Description
+
+CLIPScore is a reference-free evaluation metric for image captioning that measures the alignment between images and their corresponding text descriptions. It leverages the CLIP (Contrastive Language-Image Pretraining) model to compute a similarity score between the visual and textual modalities.
+
+## How to Use
+
+To use the CLIPScore metric, provide a list of text predictions and a list of images. The metric computes the CLIPScore for each image-text pair and returns the average.
+
+### Inputs
+
+- **predictions** *(list of string)*: A list of text predictions to score. Each prediction should be a string.
+- **references** *(list of PIL.Image.Image)*: A list of images to score against. Each image should be a PIL image.
+
+### Output Values
+
+The CLIPScore metric outputs a dictionary with a single key-value pair:
+
+- **clip_score** *(float)*: The average CLIPScore across all provided image-text pairs. The score ranges from -1 to 1, where higher scores indicate better alignment between the image and text.
+
+### Examples
+
+```python
+from PIL import Image
+import evaluate
+
+metric = evaluate.load("sunhill/clip_score")
+predictions = ["A cat sitting on a windowsill.", "A dog playing with a ball."]
+references = [Image.open("cat.jpg"), Image.open("dog.jpg")]
+results = metric.compute(predictions=predictions, references=references)
+print(results)
+# Output: {'clip_score': 0.85}
+```
+
+## Citation
+
+```bibtex
+@article{DBLP:journals/corr/abs-2104-08718,
+  author    = {Jack Hessel and
+               Ari Holtzman and
+               Maxwell Forbes and
+               Ronan Le Bras and
+               Yejin Choi},
+  title     = {CLIPScore: {A} Reference-free Evaluation Metric for Image Captioning},
+  journal   = {CoRR},
+  volume    = {abs/2104.08718},
+  year      = {2021},
+  url       = {https://arxiv.org/abs/2104.08718},
+  eprinttype = {arXiv},
+  eprint    = {2104.08718},
+  timestamp = {Sat, 29 Apr 2023 10:09:27 +0200},
+  biburl    = {https://dblp.org/rec/journals/corr/abs-2104-08718.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+```
+
+## Further References
+
+- [clip-score](https://github.com/Taited/clip-score)
diff --git a/metrics/clip_score/app.py b/metrics/clip_score/app.py
new file mode 100644
index 00000000..be0f0222
--- /dev/null
+++ b/metrics/clip_score/app.py
@@ -0,0 +1,40 @@
+import sys
+from pathlib import Path
+
+import gradio as gr
+
+import evaluate
+from evaluate import parse_readme
+
+
+metric = evaluate.load("sunhill/clip_score")
+
+
+def compute_clip_score(image, text):
+    results = metric.compute(predictions=[text], references=[image])
+    return results["clip_score"]
+
+
+iface = gr.Interface(
+    fn=compute_clip_score,
+    inputs=[
+        gr.Image(type="pil"),
+        gr.Textbox(lines=2, placeholder="Enter text here..."),
+    ],
+    outputs=gr.Number(label="CLIP Score"),
+    title="CLIP Score Evaluator",
+    description="Evaluate the alignment between an image and a text using CLIP Score.",
+    examples=[
+        [
+            "https://images.unsplash.com/photo-1720539222585-346e73f01536",
+            "A cat sitting on a couch",
+        ],
+        [
+            "https://images.unsplash.com/photo-1694253987647-4eebcf679974",
+            "A scenic view of mountains during sunset",
+        ],
+    ],
+    article=parse_readme(Path(sys.path[0]) / "README.md"),
+)
+
+iface.launch()
diff --git a/metrics/clip_score/clip_score.py b/metrics/clip_score/clip_score.py
new file mode 100644
index 00000000..79c62732
--- /dev/null
+++ b/metrics/clip_score/clip_score.py
@@ -0,0 +1,109 @@
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""This module calculates CLIPScore, a reference-free evaluation metric for image captioning."""
+
+import datasets
+from transformers import AutoTokenizer, CLIPModel, CLIPProcessor
+
+import evaluate
+from evaluate.utils.logging import get_logger
+
+
+logger = get_logger(__name__)
+
+_CITATION = """\
+@article{DBLP:journals/corr/abs-2104-08718,
+  author    = {Jack Hessel and
+               Ari Holtzman and
+               Maxwell Forbes and
+               Ronan Le Bras and
+               Yejin Choi},
+  title     = {CLIPScore: {A} Reference-free Evaluation Metric for Image Captioning},
+  journal   = {CoRR},
+  volume    = {abs/2104.08718},
+  year      = {2021},
+  url       = {https://arxiv.org/abs/2104.08718},
+  eprinttype = {arXiv},
+  eprint    = {2104.08718},
+  timestamp = {Sat, 29 Apr 2023 10:09:27 +0200},
+  biburl    = {https://dblp.org/rec/journals/corr/abs-2104-08718.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+"""
+
+_DESCRIPTION = """\
+This module calculates CLIPScore, a reference-free evaluation metric for image captioning.
+"""
+
+
+_KWARGS_DESCRIPTION = """
+Computes CLIPScore to evaluate the alignment between an image and a text.
+Args:
+    predictions: list of text predictions to score. Each prediction
+        should be a string.
+    references: list of images to score against. Each image should be a PIL image.
+Returns:
+    clip_score: average CLIPScore over the provided image-text pairs.
+Examples:
+    >>> metric = evaluate.load("sunhill/clip_score")
+    >>> results = metric.compute(predictions=["A cat sitting on a couch."], references=[PIL_image])
+    >>> print(results)
+    {'clip_score': 0.2076}
+"""
+
+
+@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+class CLIPScore(evaluate.Metric):
+    """CLIPScore metric."""
+
+    def _info(self):
+        return evaluate.MetricInfo(
+            # This is the description that will appear on the modules page.
+            module_type="metric",
+            description=_DESCRIPTION,
+            citation=_CITATION,
+            inputs_description=_KWARGS_DESCRIPTION,
+            # This defines the format of each prediction and reference
+            features=datasets.Features(
+                {
+                    "predictions": datasets.Value("string"),
+                    "references": datasets.Image(),
+                }
+            ),
+            # Homepage of the module for documentation
+            homepage="https://huggingface.co/spaces/sunhill/clip_score",
+            # Additional links to the codebase or references
+            codebase_urls=["https://github.com/Taited/clip-score"],
+            reference_urls=["https://arxiv.org/abs/2104.08718"],
+        )
+
+    def _download_and_prepare(self, dl_manager):
+        """Optional: download external resources useful to compute the scores"""
+        logger.info("Downloading and preparing CLIP ViT-B/32 model...")
+        self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+        self.tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+
+    def _compute(self, predictions, references):
+        """Returns the scores"""
+        refer = self.processor(text=None, images=references, return_tensors="pt", padding=True)
+        pred = self.tokenizer(predictions, return_tensors="pt", padding=True)
+
+        refer_features = self.model.get_image_features(**refer)
+        pred_features = self.model.get_text_features(**pred)
+
+        refer_features = refer_features / refer_features.norm(dim=1, keepdim=True)  # L2-normalize image embeddings
+        pred_features = pred_features / pred_features.norm(dim=1, keepdim=True)  # L2-normalize text embeddings
+        clip_score = (refer_features * pred_features).sum().item()  # sum of per-pair cosine similarities
+        return {"clip_score": clip_score / refer_features.shape[0]}  # average over the batch
diff --git a/metrics/clip_score/requirements.txt b/metrics/clip_score/requirements.txt
new file mode 100644
index 00000000..aed174e3
--- /dev/null
+++ b/metrics/clip_score/requirements.txt
@@ -0,0 +1,2 @@
+git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
+transformers
\ No newline at end of file
diff --git a/setup.py b/setup.py
index beb46d6a..2968bed5 100644
--- a/setup.py
+++ b/setup.py
@@ -31,6 +31,8 @@
 REQUIRED_PKGS = [
     # We need datasets as a backend
     "datasets>=2.0.0",
+    # CLIP Score metric needs transformers
+    "transformers",
     # We use numpy>=1.17 to have np.random.Generator (Dataset shuffling)
     "numpy>=1.17",
     # For smart caching dataset processing
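A minimal smoke-test sketch (not part of the diff above) for trying the new module from a local checkout: it assumes the patch is applied, that `torch` and `transformers` are installed, and that `evaluate.load` is pointed at the local `metrics/clip_score` directory rather than the Hub space; the solid-color placeholder image is purely illustrative.

```python
# Illustrative smoke test for the clip_score module added by this patch.
from PIL import Image

import evaluate

# Load the metric from the local directory instead of "sunhill/clip_score" on the Hub.
clip_score = evaluate.load("./metrics/clip_score")

# A plain gray placeholder image stands in for a real photo.
image = Image.new("RGB", (224, 224), color=(128, 128, 128))

results = clip_score.compute(
    predictions=["A plain gray square."],
    references=[image],
)
print(results)  # {'clip_score': ...} -- a cosine similarity, typically well below 1.0
```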