IBM · elronbandel · Feb 4, 2025 · Jan 20, 2025 · Jan 20, 2025 · Jan 29, 2025
diff --git a/docs/docs/examples.rst b/docs/docs/examples.rst
@@ -62,7 +62,7 @@ Classifical f1_micro, f1_macro, and per-entity-type f1 metrics are reported.
 
 `Example code  <https://github.com/IBM/unitxt/blob/main/examples/ner_evaluation.py>`__
 
-Related documentation: :ref:`Add new dataset tutorial <adding_dataset>`, :ref:`Open NER task in catalog <catalog.tasks.ner.all_entity_types>`, :ref:`Inference Engines <inference>`.
+Related documentation: :ref:`Add new dataset tutorial <adding_dataset>`, :ref:`NER task in catalog <catalog.tasks.ner.all_entity_types>`, :ref:`Inference Engines <inference>`.
 
 Evaluation usecases
 -----------------------
@@ -244,6 +244,19 @@ Evaluate Image-Text to Text Models with different templates and explore the sens
 
 Related documentation: :ref:`Multi-Modality Guide <multi_modality>`, :ref:`Inference Engines <inference>`.
 
+Evaluate Image Key Value Extraction task
++++++++++++++++++++++++++++++++++++++++++
+
+This example demonstrates how to evaluate an image key value extraction task.  It renders several images of given texts and then prompts a vision model to extract key value pairs from the images.
+This requires the vision model to understand the texts in the images, and extract relevant values. It computes overall F1 scores and F1 scores for each of the keys based on ground truth key value pairs.
+Note that the same code can be used for textual key value extraction, just by providing input texts instead of input images.
+
+`Example code <https://github.com/IBM/unitxt/blob/main/examples/key_value_extraction_evaluation.py>`__
+
+Related documentation: :ref:`Key Value Extraction task in catalog <catalog.tasks.key_value_extraction>`,
+:ref:`Multi-Modality Guide <multi_modality>`, :ref:`Inference Engines <inference>`.
+
+
 Advanced topics
 ----------------------------
 

diff --git a/examples/key_value_extraction_evaluation.py b/examples/key_value_extraction_evaluation.py
@@ -0,0 +1,73 @@
+import json
+
+from unitxt import get_logger
+from unitxt.api import create_dataset, evaluate
+from unitxt.inference import (
+    CrossProviderInferenceEngine,
+)
+
+logger = get_logger()
+keys = ["Worker", "LivesIn", "WorksAt"]
+
+
+def text_to_image(text: str):
+    """Return an image with the input text rendered on it.
+
+    The text is drawn in black, starting at the top-left corner of a white
+    1000x1000 RGB canvas, using PIL's default font at size 10.
+
+    Returns:
+        A dict holding the PIL image under "image" and "png" under "format".
+    """
+    from PIL import Image, ImageDraw, ImageFont
+
+    bg_color = (255, 255, 255)
+    text_color = (0, 0, 0)
+    font_size = 10
+    # NOTE: the `size` argument of load_default needs a recent Pillow (10.1+).
+    font = ImageFont.load_default(size=font_size)
+
+    # Fixed-size canvas: the text is not measured, so long inputs may be clipped.
+    img = Image.new("RGB", (1000, 1000), bg_color)
+    ImageDraw.Draw(img).multiline_text((0, 0), text, fill=text_color, font=font)
+    return {"image": img, "format": "png"}
+
+
+# Each instance pairs a rendered image ("input") with the list of keys to
+# extract and the ground-truth key/value pairs. Keys that do not appear in the
+# text (e.g. "WorksAt" in the first sentence) are left out of the answer.
+test_set = [
+    {
+        "input": text_to_image("John lives in Texas."),
+        "keys": keys,
+        "key_value_pairs_answer": {"Worker": "John", "LivesIn": "Texas"},
+    },
+    {
+        "input": text_to_image("Phil works at Apple and eats an apple."),
+        "keys": keys,
+        "key_value_pairs_answer": {"Worker": "Phil", "WorksAt": "Apple"},
+    },
+]
+
+
+# Build the "test" split from the in-memory instances, using the catalog task
+# and its JSON-extraction template, formatted with the chat API format.
+dataset = create_dataset(
+    task="tasks.key_value_extraction",
+    template="templates.key_value_extraction.extract_in_json_format",
+    test_set=test_set,
+    split="test",
+    format="formats.chat_api",
+)
+
+# Vision-capable model, accessed through the watsonx provider.
+model = CrossProviderInferenceEngine(
+    model="llama-3-2-11b-vision-instruct", provider="watsonx"
+)
+
+# Run inference on the dataset, then score the predictions against it.
+predictions = model(dataset)
+results = evaluate(predictions=predictions, data=dataset)
+
+print("Example prompt:")
+
+# The "source" field of the first scored instance is printed as the example prompt.
+print(json.dumps(results.instance_scores[0]["source"], indent=4))
+
+print("Instance Results:")
+print(results.instance_scores)
+
+print("Global Results:")
+print(results.global_scores.summary)
diff --git a/prepare/metrics/custom_f1.py b/prepare/metrics/custom_f1.py
@@ -1,5 +1,5 @@
 from unitxt import add_to_catalog
-from unitxt.metrics import NER
+from unitxt.metrics import NER, KeyValueExtraction
 from unitxt.test_utils.metrics import test_metric
 
 metric = NER()
@@ -434,3 +434,50 @@ class NERWithoutClassReporting(NER):
 )
 
 add_to_catalog(metric, "metrics.ner", overwrite=True)
+
+
+# Test and register the key-value extraction metric.
+metric = KeyValueExtraction()
+
+predictions = [
+    [("key1", "value1"), ("key2", "value2"), ("unknown_key", "unknown_value")]
+]
+
+references = [[[("key1", "value1"), ("key2", "value3")]]]
+# Only the key1 pair matches a reference pair exactly: key2 carries the wrong
+# value and unknown_key is not a reference key. Hence micro precision is
+# 1/3 (0.33), micro recall is 1/2 (0.5) and micro F1 is 0.4.
+# NOTE(review): in_classes_support of 0.67 presumably reflects that 2 of the 3
+# predicted pairs use keys that appear in the references - confirm in CustomF1.
+instance_targets = [
+    {
+        "f1_key1": 1.0,
+        "f1_key2": 0.0,
+        "f1_macro": 0.5,
+        "f1_micro": 0.4,
+        "in_classes_support": 0.67,
+        "precision_macro": 0.5,
+        "precision_micro": 0.33,
+        "recall_macro": 0.5,
+        "recall_micro": 0.5,
+        "score": 0.4,
+        "score_name": "f1_micro",
+    }
+]
+# With a single instance, the global scores equal the instance scores.
+global_target = {
+    "f1_key1": 1.0,
+    "f1_key2": 0.0,
+    "f1_macro": 0.5,
+    "in_classes_support": 0.67,
+    "f1_micro": 0.4,
+    "recall_micro": 0.5,
+    "recall_macro": 0.5,
+    "precision_micro": 0.33,
+    "precision_macro": 0.5,
+    "score": 0.4,
+    "score_name": "f1_micro",
+    "num_of_instances": 1,
+}
+outputs = test_metric(
+    metric=metric,
+    predictions=predictions,
+    references=references,
+    instance_targets=instance_targets,
+    global_target=global_target,
+)
+add_to_catalog(metric, "metrics.key_value_extraction", overwrite=True)
diff --git a/prepare/tasks/key_value_extraction.py b/prepare/tasks/key_value_extraction.py
@@ -0,0 +1,17 @@
+from typing import Any, Dict, List, Tuple
+
+from unitxt.blocks import Task
+from unitxt.catalog import add_to_catalog
+
+# Register the key value extraction task in the catalog.
+# "input" is typed Any, so the same task accepts either text or image inputs.
+# The reference is a dict of key/value pairs, while the prediction type is a
+# list of (key, value) tuples, matching "metrics.key_value_extraction".
+add_to_catalog(
+    Task(
+        __description__="This is a key value extraction task, where a specific list of possible 'keys' need to be extracted from the input.  The ground truth is provided key-value pairs in the form of the dictionary.  The results are evaluating using F1 score metric, that expects the predictions to be converted into a list of (key,value) pairs. ",
+        input_fields={"input": Any, "keys": List[str]},
+        reference_fields={"key_value_pairs_answer": Dict[str, str]},
+        prediction_type=List[Tuple[str, str]],
+        metrics=["metrics.key_value_extraction"],
+        default_template="templates.key_value_extraction.extract_in_json_format",
+    ),
+    "tasks.key_value_extraction",
+    overwrite=True,
+)
diff --git a/prepare/templates/key_value_extraction/templates.py b/prepare/templates/key_value_extraction/templates.py
@@ -0,0 +1,17 @@
+from unitxt import add_to_catalog
+from unitxt.processors import PostProcess
+from unitxt.struct_data_operators import JsonStrToListOfKeyValuePairs
+from unitxt.templates import (
+    InputOutputTemplate,
+)
+
+# Register the template that asks the model for a JSON object restricted to the
+# requested keys. The post-processor converts the JSON string into a list of
+# (key, value) pairs, which is the prediction type the task declares.
+add_to_catalog(
+    InputOutputTemplate(
+        instruction="Extract the key value pairs from the input. Return a valid json object with the following keys: {keys}. Return only the json representation, no additional text or explanations.",
+        input_format="{input}",
+        output_format="{key_value_pairs_answer}",
+        postprocessors=[PostProcess(JsonStrToListOfKeyValuePairs())],
+    ),
+    "templates.key_value_extraction.extract_in_json_format",
+    overwrite=True,
+)
diff --git a/src/unitxt/catalog/metrics/key_value_extraction.json b/src/unitxt/catalog/metrics/key_value_extraction.json
@@ -0,0 +1,3 @@
+{
+    "__type__": "key_value_extraction"
+}
diff --git a/src/unitxt/catalog/tasks/key_value_extraction.json b/src/unitxt/catalog/tasks/key_value_extraction.json
@@ -0,0 +1,16 @@
+{
+    "__type__": "task",
+    "__description__": "This is a key value extraction task, where a specific list of possible 'keys' need to be extracted from the input.  The ground truth is provided key-value pairs in the form of the dictionary.  The results are evaluating using F1 score metric, that expects the predictions to be converted into a list of (key,value) pairs. ",
+    "input_fields": {
+        "input": "Any",
+        "keys": "List[str]"
+    },
+    "reference_fields": {
+        "key_value_pairs_answer": "Dict[str, str]"
+    },
+    "prediction_type": "List[Tuple[str, str]]",
+    "metrics": [
+        "metrics.key_value_extraction"
+    ],
+    "default_template": "templates.key_value_extraction.extract_in_json_format"
+}
diff --git a/src/unitxt/catalog/templates/key_value_extraction/extract_in_json_format.json b/src/unitxt/catalog/templates/key_value_extraction/extract_in_json_format.json
@@ -0,0 +1,14 @@
+{
+    "__type__": "input_output_template",
+    "instruction": "Extract the key value pairs from the input. Return a valid json object with the following keys: {keys}. Return only the json representation, no additional text or explanations.",
+    "input_format": "{input}",
+    "output_format": "{key_value_pairs_answer}",
+    "postprocessors": [
+        {
+            "__type__": "post_process",
+            "operator": {
+                "__type__": "json_str_to_list_of_key_value_pairs"
+            }
+        }
+    ]
+}
diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py
@@ -3355,6 +3355,8 @@ def add_macro_scores(self, f1_result, recall_result, precision_result, result):
 
 
 class NER(CustomF1):
+    """F1 Metrics that receives as input a list of (Entity,EntityType) pairs."""
+
     prediction_type = List[Tuple[str, str]]
 
     def get_element_group(self, element, additional_input):
@@ -3364,6 +3366,18 @@ def get_element_representation(self, element, additional_input):
         return str(element)
 
 
+class KeyValueExtraction(CustomF1):
+    """F1 metric that receives as input a list of (key, value) pairs.
+
+    Each pair is grouped by its key and represented by the string form of the
+    whole pair, so the value is part of what gets compared.
+    """
+
+    prediction_type = List[Tuple[str, str]]
+
+    def get_element_group(self, element, additional_input):
+        # The key (first item of the pair) is the element's group.
+        return element[0]
+
+    def get_element_representation(self, element, additional_input):
+        # The full (key, value) pair is stringified, not just the value.
+        return str(element)
+
+
 def normalize_answer(s):
     """Lower text and remove punctuation, articles and extra whitespace."""
 

diff --git a/src/unitxt/struct_data_operators.py b/src/unitxt/struct_data_operators.py
@@ -23,6 +23,7 @@
     {"key1": "value1", "key2": value2, "key3": "value3"}
 """
 
+import ast
 import json
 import random
 from abc import ABC, abstractmethod
@@ -31,12 +32,14 @@
     Dict,
     List,
     Optional,
+    Tuple,
 )
 
 import pandas as pd
 
 from .augmentors import TypeDependentAugmentor
 from .dict_utils import dict_get
+from .error_utils import UnitxtWarning
 from .operators import FieldOperator, InstanceOperator
 from .random_utils import new_random_generator
 from .serializers import ImageSerializer, TableSerializer
@@ -1019,3 +1022,21 @@ def process_value(self, table: Any) -> Any:
         random.shuffle(shuffled_header)
 
         return {"header": shuffled_header, "rows": table["rows"]}
+
+
+class JsonStrToListOfKeyValuePairs(FieldOperator):
+    """Convert a string holding a JSON object (or Python dict literal) to a list of (key, value) string pairs.
+
+    Parsing is attempted in this order, and the first attempt that yields a dict wins:
+      1. strict JSON (``json.loads``), which understands ``null``/``true``/``false``;
+      2. a Python literal (``ast.literal_eval``), e.g. the single-quoted ``str(dict)`` form;
+      3. a Python literal after replacing ``null`` with ``None`` (the legacy behavior).
+
+    Entries whose value is ``None`` are dropped; the remaining keys and values
+    are converted with ``str``. If no attempt yields a dict, ``UnitxtWarning``
+    is called with the last failure and an empty list is returned.
+    """
+
+    def process_value(self, text: str) -> List[Tuple[str, str]]:
+        attempts = (
+            (json.loads, text),
+            (ast.literal_eval, text),
+            # Last resort only: a blind replace also rewrites "null" inside string values.
+            (ast.literal_eval, text.replace("null", "None")),
+        )
+
+        dict_value = {}
+        failure = None
+        for parse, candidate in attempts:
+            try:
+                parsed = parse(candidate)
+            except Exception as e:
+                # Best effort: remember the failure and try the next parser.
+                failure = e
+                continue
+            if isinstance(parsed, dict):
+                dict_value = parsed
+                break
+            # A valid literal that is not an object (e.g. a list or a number) has no items to extract.
+            failure = TypeError(f"expected a dict but got {type(parsed).__name__}")
+        else:
+            UnitxtWarning(
+                f"Unable to convert input text to json format in JsonStrToListOfKeyValuePairs due to {failure}. Text: {text}"
+            )
+
+        return [
+            (str(key), str(value))
+            for key, value in dict_value.items()
+            if value is not None
+        ]