Changed API of Key Value Extraction task to use Dict and not List[Tuple] (NON BACKWARD COMPATIBLE CHANGE) #1675

Merged · 34 commits · Mar 19, 2025

Commits
d0890f9
Moved key value extraction task and metrics to use Dict[str,str] to …
yoavkatz Mar 12, 2025
583f4d3
Updated metric to better address in case keys are used in prediction …
yoavkatz Mar 12, 2025
d70b4cc
Added ability to customize inner metric used to compare each entity i…
yoavkatz Mar 12, 2025
abd0d7f
Fixed unitest and bug with handling nones
yoavkatz Mar 12, 2025
e4e32f4
Fixed KeyValueExtraction prepare
yoavkatz Mar 12, 2025
223f3a7
Added example of multiple metrics
yoavkatz Mar 12, 2025
25ed3ea
Fix catalog
yoavkatz Mar 12, 2025
1ddb3bb
Fix some bugs in inference engine tests
elronbandel Mar 13, 2025
ecb1391
Fix some bugs in inference engine tests
elronbandel Mar 13, 2025
bbdbfd8
Updated key value extraction metric names
yoavkatz Mar 16, 2025
989da3f
Updated key value extraction metric names
yoavkatz Mar 16, 2025
f8ca4fe
Updated documentation string
yoavkatz Mar 16, 2025
01774f5
Fixed unit test.
yoavkatz Mar 16, 2025
7b4e0af
Updated to use metric as artifact and not string
yoavkatz Mar 17, 2025
1ae750c
Merge branch 'main' into entity_squad_metric
yoavkatz Mar 17, 2025
4ce63b8
Fix bug in name conversion in rits
elronbandel Mar 17, 2025
5e15eef
Add engine id
elronbandel Mar 17, 2025
1794474
Improved output message when using inference cache
yoavkatz Mar 17, 2025
955fb85
Fixed bug due to indentation change
yoavkatz Mar 17, 2025
9900113
Merge branch 'main' into fix-inference-tests
elronbandel Mar 17, 2025
8b91761
fix
elronbandel Mar 17, 2025
d7d9fb6
fix
elronbandel Mar 17, 2025
810f26b
Removed warning of legacy name.
yoavkatz Mar 17, 2025
91fad1f
Merge remote-tracking branch 'origin/main' into fix-inference-tests
elronbandel Mar 17, 2025
049ec22
Merge branch 'main' into entity_squad_metric
yoavkatz Mar 17, 2025
4f17a6c
Use greedy decoding and remove redundant cache
elronbandel Mar 17, 2025
125ad9c
Merge branch 'main' into fix-inference-tests
elronbandel Mar 17, 2025
47b74a9
Merge branch 'fix-inference-tests' into entity_squad_metric
yoavkatz Mar 17, 2025
d7959fc
Merge branch 'improve_inference_log' into entity_squad_metric
yoavkatz Mar 18, 2025
3b3ea38
Merge branch 'improve_inference_log' into entity_squad_metric
yoavkatz Mar 18, 2025
e75e625
Ensure temperature is 0 in extraction task
yoavkatz Mar 18, 2025
f78af39
Merge remote-tracking branch 'origin/main' into entity_squad_metric
yoavkatz Mar 18, 2025
440120c
Merge remote-tracking branch 'origin/main' into entity_squad_metric
yoavkatz Mar 19, 2025
3ea0056
Removed unneeded changes from past merge
yoavkatz Mar 19, 2025
21 changes: 10 additions & 11 deletions examples/api_call_evaluation.py
@@ -1,5 +1,5 @@
import json
from typing import List, Tuple
from typing import Dict, List, Tuple

from unitxt import get_logger
from unitxt.api import create_dataset, evaluate
@@ -205,7 +205,7 @@ class CurlStrToListOfKeyValuePairs(FieldOperator):

becomes

[('url', 'curl -X GET -H "Content-Type: application/json" https://petstore.swagger.io/v2/pets'), ('tags', 'dogs'), ('limit', '5')]
{ 'url' : 'curl -X GET -H "Content-Type: application/json" https://petstore.swagger.io/v2/pets', 'tags' : 'dogs', 'limit' : '5'}

"""

@@ -217,11 +217,11 @@ def process_value(self, text: str) -> List[Tuple[str, str]]:

splits = text.split("?")
split_command = re.split(r"((?=GET|POST|DELETE)GET|POST|DELETE)", splits[0])
result = [
("command", split_command[0]),
("operation", split_command[1]),
("url", split_command[2]),
]
result = {
"command": split_command[0],
"operation": split_command[1],
"url": split_command[2],
}
if len(splits) == 1:
return result
params = splits[1]
@@ -234,7 +234,7 @@ def process_value(self, text: str) -> List[Tuple[str, str]]:
(key, value) = key_value_splits
value_splits = value.split(",")
if len(value_splits) == 1:
result.append((f"query_param_{key}", f"{value}"))
result[f"query_param_{key}"]= f"{value}"

return result

@@ -249,10 +249,9 @@ def process_value(self, text: str) -> List[Tuple[str, str]]:
task = Task(
input_fields={"user_request": str, "api_spec": str},
reference_fields={"reference_query": str},
prediction_type=List[Tuple[str, str]],
prediction_type=Dict[str,str],
metrics=[
"metrics.accuracy",
"metrics.key_value_extraction",
"metrics.key_value_extraction.accuracy","metrics.key_value_extraction.token_overlap",
],
)

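For orientation, here is a rough, standalone sketch of the dictionary the updated operator now produces for a request with query parameters. It is not the operator's actual code: the command/operation/url split mirrors the regex shown above, but the "&"/"=" parameter splitting is a simplifying assumption, since that part of the diff is collapsed.

import re

def curl_to_dict(text: str) -> dict:
    """Illustrative approximation of CurlStrToListOfKeyValuePairs.process_value after this change."""
    splits = text.split("?")
    split_command = re.split(r"((?=GET|POST|DELETE)GET|POST|DELETE)", splits[0])
    result = {
        "command": split_command[0],
        "operation": split_command[1],
        "url": split_command[2],
    }
    if len(splits) > 1:
        # Assumed simplification: split query parameters on "&" and "=".
        for pair in splits[1].split("&"):
            key, value = pair.split("=", 1)
            result[f"query_param_{key}"] = value
    return result

print(curl_to_dict("curl -X GET https://petstore.swagger.io/v2/pets?tags=dogs&limit=5"))
# {'command': 'curl -X ', 'operation': 'GET', 'url': ' https://petstore.swagger.io/v2/pets',
#  'query_param_tags': 'dogs', 'query_param_limit': '5'}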
7 changes: 4 additions & 3 deletions examples/key_value_extraction_evaluation.py
@@ -35,9 +35,9 @@ def text_to_image(text: str):

test_set = [
{
"input": text_to_image("John lives in Texas."),
"input": text_to_image("John lives in New York."),
"keys": keys,
"key_value_pairs_answer": {"Worker": "John", "LivesIn": "Texas"},
"key_value_pairs_answer": {"Worker": "John", "LivesIn": "New York"},
},
{
"input": text_to_image("Phil works at Apple and eats an apple."),
@@ -53,10 +53,11 @@ def text_to_image(text: str):
test_set=test_set,
split="test",
format="formats.chat_api",
metrics=["metrics.key_value_extraction.accuracy","metrics.key_value_extraction.token_overlap"]
)

model = CrossProviderInferenceEngine(
model="llama-3-2-11b-vision-instruct", provider="watsonx"
model="llama-3-2-90b-vision-instruct", provider="watsonx", temperature=0
)

predictions = model(dataset)
49 changes: 1 addition & 48 deletions prepare/metrics/custom_f1.py
@@ -1,5 +1,5 @@
from unitxt import add_to_catalog
from unitxt.metrics import NER, KeyValueExtraction
from unitxt.metrics import NER
from unitxt.test_utils.metrics import test_metric

metric = NER()
@@ -434,50 +434,3 @@ class NERWithoutClassReporting(NER):
)

add_to_catalog(metric, "metrics.ner", overwrite=True)


metric = KeyValueExtraction()

predictions = [
[("key1", "value1"), ("key2", "value2"), ("unknown_key", "unknown_value")]
]

references = [[[("key1", "value1"), ("key2", "value3")]]]
#
instance_targets = [
{
"f1_key1": 1.0,
"f1_key2": 0.0,
"f1_macro": 0.5,
"f1_micro": 0.4,
"in_classes_support": 0.67,
"precision_macro": 0.5,
"precision_micro": 0.33,
"recall_macro": 0.5,
"recall_micro": 0.5,
"score": 0.4,
"score_name": "f1_micro",
}
]
global_target = {
"f1_key1": 1.0,
"f1_key2": 0.0,
"f1_macro": 0.5,
"in_classes_support": 0.67,
"f1_micro": 0.4,
"recall_micro": 0.5,
"recall_macro": 0.5,
"precision_micro": 0.33,
"precision_macro": 0.5,
"score": 0.4,
"score_name": "f1_micro",
"num_of_instances": 1,
}
outputs = test_metric(
metric=metric,
predictions=predictions,
references=references,
instance_targets=instance_targets,
global_target=global_target,
)
add_to_catalog(metric, "metrics.key_value_extraction", overwrite=True)
36 changes: 36 additions & 0 deletions prepare/metrics/key_value_extraction.py
@@ -0,0 +1,36 @@
from unitxt import add_to_catalog
from unitxt.metrics import KeyValueExtraction
from unitxt.test_utils.metrics import test_metric

metric = KeyValueExtraction(__description__ = """Metric that evaluates key value pairs predictions (provided as dictionaries)
with reference key value pairs (also provided as dictionaries). By default uses an accuracy (exact match) between each for the fields.
Reports average accuracy for each key , as well as micro and macro averages across all keys.
""", metric="metrics.accuracy",)

predictions = [
{"key1": "value1", "key2": "value2", "unknown_key": "unknown_value"}
]

references = [[{"key1": "value1", "key2" : "value3"}]]
#
instance_targets = [
{"accuracy_key1": 1.0, "accuracy_key2": 0.0, "accuracy_legal_keys_in_predictions": 0.67, "accuracy_macro": 0.5, "accuracy_micro": 0.5, "score": 0.5, "score_name": "accuracy_micro"}
]
global_target = {"accuracy_key1": 1.0, "accuracy_key2": 0.0, "accuracy_legal_keys_in_predictions": 0.67, "accuracy_macro": 0.5, "accuracy_micro": 0.5, "score": 0.5, "score_name": "accuracy_micro", "num_of_instances" : 1}
outputs = test_metric(
metric=metric,
predictions=predictions,
references=references,
instance_targets=instance_targets,
global_target=global_target,
)
add_to_catalog(metric, "metrics.key_value_extraction.accuracy", overwrite=True)

metric = KeyValueExtraction(__description__ = """Metric that evaluates key value pairs predictions (provided as dictionary)
with reference key value pairs (also provided as dictionary).
Calculates token overlap between values of corresponding value in reference and prediction.
Reports f1 per key, micro f1 averages across all key/value pairs, and macro f1 averages across keys.
""",
metric="metrics.token_overlap",score_prefix="token_overlap_")

add_to_catalog(metric, "metrics.key_value_extraction.token_overlap", overwrite=True)
6 changes: 3 additions & 3 deletions prepare/tasks/key_value_extraction.py
@@ -1,4 +1,4 @@
from typing import Any, Dict, List, Tuple
from typing import Any, Dict, List

from unitxt.blocks import Task
from unitxt.catalog import add_to_catalog
@@ -8,8 +8,8 @@
__description__="This is a key value extraction task, where a specific list of possible 'keys' need to be extracted from the input. The ground truth is provided key-value pairs in the form of the dictionary. The results are evaluating using F1 score metric, that expects the predictions to be converted into a list of (key,value) pairs. ",
input_fields={"input": Any, "keys": List[str]},
reference_fields={"key_value_pairs_answer": Dict[str, str]},
prediction_type=List[Tuple[str, str]],
metrics=["metrics.key_value_extraction"],
prediction_type=Dict[str, str],
metrics=["metrics.key_value_extraction.accuracy","metrics.key_value_extraction.token_overlap"],
default_template="templates.key_value_extraction.extract_in_json_format",
),
"tasks.key_value_extraction",
4 changes: 2 additions & 2 deletions prepare/templates/key_value_extraction/templates.py
@@ -6,7 +6,7 @@
ListSerializer,
MultiTypeSerializer,
)
from unitxt.struct_data_operators import JsonStrToListOfKeyValuePairs
from unitxt.struct_data_operators import JsonStrToDict
from unitxt.templates import (
InputOutputTemplate,
)
@@ -17,7 +17,7 @@
input_format="{input}",
output_format="{key_value_pairs_answer}",
postprocessors=[
PostProcess(JsonStrToListOfKeyValuePairs()),
PostProcess(JsonStrToDict()),
],
serializer=MultiTypeSerializer(
serializers=[ImageSerializer(), DictAsJsonSerializer(), ListSerializer()]
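Swapping JsonStrToListOfKeyValuePairs for JsonStrToDict keeps the template's post-processing aligned with the new Dict[str, str] prediction type. Roughly, the conversion it has to perform looks like this (an illustration, not the operator's actual implementation):

import json

def json_str_to_dict(text: str) -> dict:
    """Illustrative stand-in for the JsonStrToDict post-processing step."""
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        return {}
    return {str(k): str(v) for k, v in parsed.items()} if isinstance(parsed, dict) else {}

print(json_str_to_dict('{"Worker": "John", "LivesIn": "New York"}'))
# {'Worker': 'John', 'LivesIn': 'New York'}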
3 changes: 0 additions & 3 deletions src/unitxt/catalog/metrics/key_value_extraction.json

This file was deleted.

5 changes: 5 additions & 0 deletions src/unitxt/catalog/metrics/key_value_extraction/accuracy.json
@@ -0,0 +1,5 @@
{
"__type__": "key_value_extraction",
"__description__": "Metric that evaluates key value pairs predictions (provided as dictionaries)\nwith reference key value pairs (also provided as dictionaries). By default uses an accuracy (exact match) between each for the fields.\nReports average accuracy for each key , as well as micro and macro averages across all keys.\n",
"metric": "metrics.accuracy"
}
6 changes: 6 additions & 0 deletions src/unitxt/catalog/metrics/key_value_extraction/token_overlap.json
@@ -0,0 +1,6 @@
{
"__type__": "key_value_extraction",
"__description__": "Metric that evaluates key value pairs predictions (provided as dictionary)\nwith reference key value pairs (also provided as dictionary).\nCalculates token overlap between values of corresponding value in reference and prediction.\nReports f1 per key, micro f1 averages across all key/value pairs, and macro f1 averages across keys.\n",
"metric": "metrics.token_overlap",
"score_prefix": "token_overlap_"
}
5 changes: 3 additions & 2 deletions src/unitxt/catalog/tasks/key_value_extraction.json
@@ -8,9 +8,10 @@
"reference_fields": {
"key_value_pairs_answer": "Dict[str, str]"
},
"prediction_type": "List[Tuple[str, str]]",
"prediction_type": "Dict[str, str]",
"metrics": [
"metrics.key_value_extraction"
"metrics.key_value_extraction.accuracy",
"metrics.key_value_extraction.token_overlap"
],
"default_template": "templates.key_value_extraction.extract_in_json_format"
}
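Putting the pieces together, a minimal end-to-end sketch of the updated task, based on the example script in this PR. The evaluate() keyword arguments and the results.global_scores access are assumed from current unitxt usage rather than shown in this diff:

from unitxt.api import create_dataset, evaluate

keys = ["Worker", "LivesIn"]
test_set = [
    {
        "input": "John lives in New York.",
        "keys": keys,
        "key_value_pairs_answer": {"Worker": "John", "LivesIn": "New York"},
    },
]

dataset = create_dataset(
    task="tasks.key_value_extraction",
    test_set=test_set,
    split="test",
)

# Predictions are now plain dicts (the template post-processes the model's JSON
# output into Dict[str, str]), not lists of (key, value) tuples.
predictions = [{"Worker": "John", "LivesIn": "Texas"}]

results = evaluate(predictions=predictions, data=dataset)
print(results.global_scores["score"])  # micro-averaged accuracy under the default metrics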
src/unitxt/catalog/templates/key_value_extraction/extract_in_json_format.json
@@ -7,7 +7,7 @@
{
"__type__": "post_process",
"operator": {
"__type__": "json_str_to_list_of_key_value_pairs"
"__type__": "json_str_to_dict"
}
}
],
13 changes: 6 additions & 7 deletions src/unitxt/inference.py
@@ -233,8 +233,9 @@ def infer(
result = self._mock_infer(dataset)
else:
if self.use_cache:
number_of_batches = len(dataset) // self.cache_batch_size + 1
result = []
for batch_num, batch in enumerate(batched(dataset, self.cache_batch_size)):
for batch_index, batch in enumerate(batched(dataset, self.cache_batch_size)):
cached_results = []
missing_examples = []
for i, item in enumerate(batch):
@@ -245,7 +246,8 @@
else:
missing_examples.append((i, item)) # each element is index in batch and example
# infare on missing examples only, without indices
logger.info(f"Inferring batch {batch_num} / {len(dataset) // self.cache_batch_size} with {len(missing_examples)} instances (found {len(cached_results)} instances in {self._cache.directory})")

logger.info(f"Inferring batch {batch_index + 1} / {number_of_batches} with {len(missing_examples)} instances (found {len(cached_results)} instances in {self._cache.directory})")
if (len(missing_examples) > 0):
inferred_results = self._infer([e[1] for e in missing_examples], return_meta_data)
# recombined to index and value
@@ -257,9 +259,7 @@
cache_key = self._get_cache_key(item)
self._cache[cache_key] = prediction
else:

inferred_results = []

inferred_results=[]
# Combine cached and inferred results in original order
batch_predictions = [p[1] for p in sorted(cached_results + inferred_results)]
result.extend(batch_predictions)
@@ -3313,8 +3313,7 @@ class HFOptionSelectingInferenceEngine(InferenceEngine, TorchDeviceMixin):
}

def get_engine_id(self):
return get_model_and_label_id(self.model, self.label)

return get_model_and_label_id(self.model_name, self.label)

def prepare_engine(self):
from transformers import AutoModelForCausalLM, AutoTokenizer
2 changes: 1 addition & 1 deletion src/unitxt/loaders.py
@@ -845,7 +845,7 @@ def verify(self):

def _maybe_set_classification_policy(self):
self.set_default_data_classification(
self.data_classification_policy or ["proprietary"], "when loading from python dictionary"
["proprietary"], "when loading from python dictionary"
)

def load_iterables(self) -> MultiStream:
78 changes: 68 additions & 10 deletions src/unitxt/metrics.py
@@ -3414,25 +3414,83 @@ def add_macro_scores(self, f1_result, recall_result, precision_result, result):
result["precision_macro"] = self.zero_division


class NER(CustomF1):
"""F1 Metrics that receives as input a list of (Entity,EntityType) pairs."""
class KeyValueExtraction(GlobalMetric):

prediction_type = List[Tuple[str, str]]
prediction_type = Dict[str,str]
metric : Metric
single_reference_per_prediction = True
main_score = ""
def prepare(self):
super().prepare()
self.main_score = f"{self.metric.main_score}_micro"

def get_element_group(self, element, additional_input):
return element[1]
def compute(
self,
references: List[List[Any]],
predictions: List[Any],
task_data: List[Dict],
) -> dict:
references = [element[0] for element in references]

def get_element_representation(self, element, additional_input):
return str(element)
key_statistics = {}
all_reference_keys = set()
for reference in references:
all_reference_keys.update(list(reference.keys()))
for key in all_reference_keys:
key_statistics[key]= []

num_prediction_keys=0
illegal_prediction_keys=0
for reference, prediction in zip(references, predictions):
for key in all_reference_keys:
if (key not in reference and key not in prediction):
continue
if (key in reference and key in prediction):
multi_stream = MultiStream.from_iterables({"test": [{"prediction" : prediction[key],
"references" : [reference[key]]}
]})
output_multi_stream = self.metric(multi_stream)
output_stream = output_multi_stream["test"]
score = next(iter(output_stream))["score"]["global"]["score"]
key_statistics[key].append(score)
else:
key_statistics[key].append(0.0)

for key in prediction.keys():
num_prediction_keys += 1
if key not in all_reference_keys:
illegal_prediction_keys += 1

result={}

class KeyValueExtraction(CustomF1):
"""F1 Metrics that receives as input a list of (Key,Value) pairs."""
average = 0
total = 0

weighted_average = 0
for key in key_statistics:
mean_for_key = numpy.mean(key_statistics[key])
num = len(key_statistics[key])
total += num
average += mean_for_key
weighted_average += mean_for_key * num
result[f"{self.metric.main_score}_{key}"] = mean_for_key

result[f"{self.metric.main_score}_micro"] = weighted_average / total
result[f"{self.metric.main_score}_macro"] = average / len(key_statistics)
if (num_prediction_keys !=0):
result[f"{self.metric.main_score}_legal_keys_in_predictions"] = 1 - 1.0 * illegal_prediction_keys / num_prediction_keys
else:
result[f"{self.metric.main_score}_legal_keys_in_predictions"] = 0

return result

class NER(CustomF1):
"""F1 Metrics that receives as input a list of (Entity,EntityType) pairs."""

prediction_type = List[Tuple[str, str]]

def get_element_group(self, element, additional_input):
return element[0]
return element[1]

def get_element_representation(self, element, additional_input):
return str(element)
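As a sanity check on the new aggregation logic, the expected scores asserted in prepare/metrics/key_value_extraction.py can be reproduced by hand. A small sketch of the arithmetic in compute() for that single instance (not the class itself):

prediction = {"key1": "value1", "key2": "value2", "unknown_key": "unknown_value"}
reference = {"key1": "value1", "key2": "value3"}

# Per-key exact-match scores for the one instance.
per_key = {key: 1.0 if prediction.get(key) == value else 0.0 for key, value in reference.items()}
# {'key1': 1.0, 'key2': 0.0}

key_counts = {key: 1 for key in per_key}  # how many instances mention each reference key
accuracy_macro = sum(per_key.values()) / len(per_key)                                          # 0.5
accuracy_micro = sum(per_key[k] * key_counts[k] for k in per_key) / sum(key_counts.values())   # 0.5
legal_keys_in_predictions = 1 - 1 / len(prediction)  # 0.67: one of three predicted keys is not a reference key

print(accuracy_macro, accuracy_micro, round(legal_keys_in_predictions, 2))  # 0.5 0.5 0.67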