Commit 30a5d19

Changed API of Key Value Extraction task to use Dict and not List[Tuple] (NON BACKWARD COMPATIBLE CHANGE) (#1675)
* Moved key value extraction task and metrics to use Dict[str,str] for comparison instead of List[Tuple[str,str]]. Also created a dedicated metric. Signed-off-by: Yoav Katz <[email protected]>
* Updated metric to better handle the case where keys appear in the prediction but not in the references. Signed-off-by: Yoav Katz <[email protected]>
* Added ability to customize the inner metric used to compare each entity in KeyValueExtraction. Signed-off-by: Yoav Katz <[email protected]>
* Fixed unit test and a bug in handling Nones. Signed-off-by: Yoav Katz <[email protected]>
* Fixed KeyValueExtraction prepare. Signed-off-by: Yoav Katz <[email protected]>
* Added example of multiple metrics. Signed-off-by: Yoav Katz <[email protected]>
* Fix catalog. Signed-off-by: Yoav Katz <[email protected]>
* Fix some bugs in inference engine tests. Signed-off-by: elronbandel <[email protected]>
* Fix some bugs in inference engine tests. Signed-off-by: elronbandel <[email protected]>
* Updated key value extraction metric names. Signed-off-by: Yoav Katz <[email protected]>
* Updated key value extraction metric names. Signed-off-by: Yoav Katz <[email protected]>
* Updated documentation string. Signed-off-by: Yoav Katz <[email protected]>
* Fixed unit test.
* Updated to use metric as artifact and not string.
* Fix bug in name conversion in RITS. Signed-off-by: elronbandel <[email protected]>
* Add engine id. Signed-off-by: elronbandel <[email protected]>
* Improved output message when using the inference cache. Also fixed an issue where, when all data was in the cache, an empty list was passed to _infer. Signed-off-by: Yoav Katz <[email protected]>
* Fixed bug due to indentation change. Signed-off-by: Yoav Katz <[email protected]>
* fix. Signed-off-by: elronbandel <[email protected]>
* fix. Signed-off-by: elronbandel <[email protected]>
* Removed warning of legacy name. Signed-off-by: Yoav Katz <[email protected]>
* Use greedy decoding and remove redundant cache. Signed-off-by: elronbandel <[email protected]>
* Merge branch 'improve_inference_log' into entity_squad_metric
* Ensure temperature is 0 in extraction task
* Removed unneeded changes from past merge

---------

Signed-off-by: Yoav Katz <[email protected]>
Signed-off-by: elronbandel <[email protected]>
Co-authored-by: elronbandel <[email protected]>
1 parent f131b94 commit 30a5d19

17 files changed (+209 −108 lines)
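
The headline change is the prediction type of the key value extraction task: model output is now compared as a flat dict keyed by field name rather than as a list of (key, value) tuples, and the task is scored with two dedicated catalog metrics. A minimal before/after sketch (the values are illustrative, taken from the example further down; this is not code from the commit itself):

from typing import Dict, List, Tuple

# Before this commit: predictions were a list of (key, value) tuples.
old_prediction: List[Tuple[str, str]] = [("Worker", "John"), ("LivesIn", "New York")]

# After this commit: predictions are a dict, matching the Dict[str, str]
# reference field "key_value_pairs_answer".
new_prediction: Dict[str, str] = {"Worker": "John", "LivesIn": "New York"}

# The task now lists two catalog metrics instead of the single old one:
metrics = [
    "metrics.key_value_extraction.accuracy",       # exact match per key
    "metrics.key_value_extraction.token_overlap",  # token-overlap F1 per key
]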

examples/api_call_evaluation.py

+10 −11

@@ -1,5 +1,5 @@
 import json
-from typing import List, Tuple
+from typing import Dict, List, Tuple
 
 from unitxt import get_logger
 from unitxt.api import create_dataset, evaluate
@@ -205,7 +205,7 @@ class CurlStrToListOfKeyValuePairs(FieldOperator):
 
     becomes
 
-    [('url', 'curl -X GET -H "Content-Type: application/json" https://petstore.swagger.io/v2/pets'), ('tags', 'dogs'), ('limit', '5')]
+    { 'url' : 'curl -X GET -H "Content-Type: application/json" https://petstore.swagger.io/v2/pets', 'tags' : 'dogs', 'limit' : '5'}
 
     """
@@ -217,11 +217,11 @@ def process_value(self, text: str) -> List[Tuple[str, str]]:
 
         splits = text.split("?")
         split_command = re.split(r"((?=GET|POST|DELETE)GET|POST|DELETE)", splits[0])
-        result = [
-            ("command", split_command[0]),
-            ("operation", split_command[1]),
-            ("url", split_command[2]),
-        ]
+        result = {
+            "command": split_command[0],
+            "operation": split_command[1],
+            "url": split_command[2],
+        }
         if len(splits) == 1:
             return result
         params = splits[1]
@@ -234,7 +234,7 @@ def process_value(self, text: str) -> List[Tuple[str, str]]:
            (key, value) = key_value_splits
            value_splits = value.split(",")
            if len(value_splits) == 1:
-                result.append((f"query_param_{key}", f"{value}"))
+                result[f"query_param_{key}"] = f"{value}"
 
        return result
@@ -249,10 +249,9 @@ def process_value(self, text: str) -> List[Tuple[str, str]]:
 task = Task(
     input_fields={"user_request": str, "api_spec": str},
     reference_fields={"reference_query": str},
-    prediction_type=List[Tuple[str, str]],
+    prediction_type=Dict[str, str],
     metrics=[
-        "metrics.accuracy",
-        "metrics.key_value_extraction",
+        "metrics.key_value_extraction.accuracy","metrics.key_value_extraction.token_overlap",
     ],
 )

examples/key_value_extraction_evaluation.py

+4 −3

@@ -35,9 +35,9 @@ def text_to_image(text: str):
 
 test_set = [
     {
-        "input": text_to_image("John lives in Texas."),
+        "input": text_to_image("John lives in New York."),
         "keys": keys,
-        "key_value_pairs_answer": {"Worker": "John", "LivesIn": "Texas"},
+        "key_value_pairs_answer": {"Worker": "John", "LivesIn": "New York"},
     },
     {
         "input": text_to_image("Phil works at Apple and eats an apple."),
@@ -53,10 +53,11 @@ def text_to_image(text: str):
     test_set=test_set,
     split="test",
     format="formats.chat_api",
+    metrics=["metrics.key_value_extraction.accuracy","metrics.key_value_extraction.token_overlap"]
 )
 
 model = CrossProviderInferenceEngine(
-    model="llama-3-2-11b-vision-instruct", provider="watsonx"
+    model="llama-3-2-90b-vision-instruct", provider="watsonx", temperature=0
 )
 
 predictions = model(dataset)

prepare/metrics/custom_f1.py

+1 −48

@@ -1,5 +1,5 @@
 from unitxt import add_to_catalog
-from unitxt.metrics import NER, KeyValueExtraction
+from unitxt.metrics import NER
 from unitxt.test_utils.metrics import test_metric
 
 metric = NER()
@@ -434,50 +434,3 @@ class NERWithoutClassReporting(NER):
 )
 
 add_to_catalog(metric, "metrics.ner", overwrite=True)
-
-
-metric = KeyValueExtraction()
-
-predictions = [
-    [("key1", "value1"), ("key2", "value2"), ("unknown_key", "unknown_value")]
-]
-
-references = [[[("key1", "value1"), ("key2", "value3")]]]
-#
-instance_targets = [
-    {
-        "f1_key1": 1.0,
-        "f1_key2": 0.0,
-        "f1_macro": 0.5,
-        "f1_micro": 0.4,
-        "in_classes_support": 0.67,
-        "precision_macro": 0.5,
-        "precision_micro": 0.33,
-        "recall_macro": 0.5,
-        "recall_micro": 0.5,
-        "score": 0.4,
-        "score_name": "f1_micro",
-    }
-]
-global_target = {
-    "f1_key1": 1.0,
-    "f1_key2": 0.0,
-    "f1_macro": 0.5,
-    "in_classes_support": 0.67,
-    "f1_micro": 0.4,
-    "recall_micro": 0.5,
-    "recall_macro": 0.5,
-    "precision_micro": 0.33,
-    "precision_macro": 0.5,
-    "score": 0.4,
-    "score_name": "f1_micro",
-    "num_of_instances": 1,
-}
-outputs = test_metric(
-    metric=metric,
-    predictions=predictions,
-    references=references,
-    instance_targets=instance_targets,
-    global_target=global_target,
-)
-add_to_catalog(metric, "metrics.key_value_extraction", overwrite=True)
New file (+36): prepare script registering metrics.key_value_extraction.accuracy and metrics.key_value_extraction.token_overlap

@@ -0,0 +1,36 @@
+from unitxt import add_to_catalog
+from unitxt.metrics import KeyValueExtraction
+from unitxt.test_utils.metrics import test_metric
+
+metric = KeyValueExtraction(__description__ = """Metric that evaluates key value pairs predictions (provided as dictionaries)
+with reference key value pairs (also provided as dictionaries). By default uses an accuracy (exact match) between each for the fields.
+Reports average accuracy for each key , as well as micro and macro averages across all keys.
+""", metric="metrics.accuracy",)
+
+predictions = [
+    {"key1": "value1", "key2": "value2", "unknown_key": "unknown_value"}
+]
+
+references = [[{"key1": "value1", "key2" : "value3"}]]
+#
+instance_targets = [
+    {"accuracy_key1": 1.0, "accuracy_key2": 0.0, "accuracy_legal_keys_in_predictions": 0.67, "accuracy_macro": 0.5, "accuracy_micro": 0.5, "score": 0.5, "score_name": "accuracy_micro"}
+]
+global_target = {"accuracy_key1": 1.0, "accuracy_key2": 0.0, "accuracy_legal_keys_in_predictions": 0.67, "accuracy_macro": 0.5, "accuracy_micro": 0.5, "score": 0.5, "score_name": "accuracy_micro", "num_of_instances" : 1}
+outputs = test_metric(
+    metric=metric,
+    predictions=predictions,
+    references=references,
+    instance_targets=instance_targets,
+    global_target=global_target,
+)
+add_to_catalog(metric, "metrics.key_value_extraction.accuracy", overwrite=True)
+
+metric = KeyValueExtraction(__description__ = """Metric that evaluates key value pairs predictions (provided as dictionary)
+with reference key value pairs (also provided as dictionary).
+Calculates token overlap between values of corresponding value in reference and prediction.
+Reports f1 per key, micro f1 averages across all key/value pairs, and macro f1 averages across keys.
+""",
+metric="metrics.token_overlap",score_prefix="token_overlap_")
+
+add_to_catalog(metric, "metrics.key_value_extraction.token_overlap", overwrite=True)
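
As a sanity check on the expected targets above, the per-key scores can be recomputed by hand; this is a minimal standalone sketch in plain Python (not the library's implementation) of how the accuracy variant arrives at 0.5 micro/macro and 0.67 legal keys for that test instance:

prediction = {"key1": "value1", "key2": "value2", "unknown_key": "unknown_value"}
reference = {"key1": "value1", "key2": "value3"}

# Exact-match score for every key that appears in the reference.
per_key = {key: float(prediction.get(key) == value) for key, value in reference.items()}
# -> {"key1": 1.0, "key2": 0.0}

accuracy_macro = sum(per_key.values()) / len(per_key)   # 0.5 (unweighted mean over keys)
accuracy_micro = sum(per_key.values()) / len(per_key)   # 0.5 (same here: each key has one sample)

# Fraction of predicted keys that are "legal", i.e. appear in the reference key set.
legal_keys = sum(key in reference for key in prediction)
accuracy_legal_keys_in_predictions = legal_keys / len(prediction)   # 2/3 ~= 0.67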

prepare/tasks/key_value_extraction.py

+3 −3

@@ -1,4 +1,4 @@
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, List
 
 from unitxt.blocks import Task
 from unitxt.catalog import add_to_catalog
@@ -8,8 +8,8 @@
         __description__="This is a key value extraction task, where a specific list of possible 'keys' need to be extracted from the input. The ground truth is provided key-value pairs in the form of the dictionary. The results are evaluating using F1 score metric, that expects the predictions to be converted into a list of (key,value) pairs. ",
         input_fields={"input": Any, "keys": List[str]},
         reference_fields={"key_value_pairs_answer": Dict[str, str]},
-        prediction_type=List[Tuple[str, str]],
-        metrics=["metrics.key_value_extraction"],
+        prediction_type=Dict[str, str],
+        metrics=["metrics.key_value_extraction.accuracy","metrics.key_value_extraction.token_overlap"],
         default_template="templates.key_value_extraction.extract_in_json_format",
     ),
     "tasks.key_value_extraction",
"tasks.key_value_extraction",

prepare/templates/key_value_extraction/templates.py

+2 −2

@@ -6,7 +6,7 @@
     ListSerializer,
     MultiTypeSerializer,
 )
-from unitxt.struct_data_operators import JsonStrToListOfKeyValuePairs
+from unitxt.struct_data_operators import JsonStrToDict
 from unitxt.templates import (
     InputOutputTemplate,
 )
@@ -17,7 +17,7 @@
         input_format="{input}",
         output_format="{key_value_pairs_answer}",
         postprocessors=[
-            PostProcess(JsonStrToListOfKeyValuePairs()),
+            PostProcess(JsonStrToDict()),
         ],
         serializer=MultiTypeSerializer(
             serializers=[ImageSerializer(), DictAsJsonSerializer(), ListSerializer()]

src/unitxt/catalog/metrics/key_value_extraction.json

−3 This file was deleted.

New file (+5): catalog entry for the accuracy-based key value extraction metric

@@ -0,0 +1,5 @@
+{
+    "__type__": "key_value_extraction",
+    "__description__": "Metric that evaluates key value pairs predictions (provided as dictionaries)\nwith reference key value pairs (also provided as dictionaries). By default uses an accuracy (exact match) between each for the fields.\nReports average accuracy for each key , as well as micro and macro averages across all keys.\n",
+    "metric": "metrics.accuracy"
+}

New file (+6): catalog entry for the token-overlap key value extraction metric

@@ -0,0 +1,6 @@
+{
+    "__type__": "key_value_extraction",
+    "__description__": "Metric that evaluates key value pairs predictions (provided as dictionary)\nwith reference key value pairs (also provided as dictionary).\nCalculates token overlap between values of corresponding value in reference and prediction.\nReports f1 per key, micro f1 averages across all key/value pairs, and macro f1 averages across keys.\n",
+    "metric": "metrics.token_overlap",
+    "score_prefix": "token_overlap_"
+}

src/unitxt/catalog/tasks/key_value_extraction.json

+3 −2

@@ -8,9 +8,10 @@
     "reference_fields": {
         "key_value_pairs_answer": "Dict[str, str]"
     },
-    "prediction_type": "List[Tuple[str, str]]",
+    "prediction_type": "Dict[str, str]",
     "metrics": [
-        "metrics.key_value_extraction"
+        "metrics.key_value_extraction.accuracy",
+        "metrics.key_value_extraction.token_overlap"
    ],
    "default_template": "templates.key_value_extraction.extract_in_json_format"
 }

src/unitxt/catalog/templates/key_value_extraction/extract_in_json_format.json

+1 −1

@@ -7,7 +7,7 @@
         {
             "__type__": "post_process",
             "operator": {
-                "__type__": "json_str_to_list_of_key_value_pairs"
+                "__type__": "json_str_to_dict"
            }
        }
    ],

src/unitxt/inference.py

+6 −7

@@ -233,8 +233,9 @@ def infer(
             result = self._mock_infer(dataset)
         else:
             if self.use_cache:
+                number_of_batches = len(dataset) // self.cache_batch_size + 1
                 result = []
-                for batch_num, batch in enumerate(batched(dataset, self.cache_batch_size)):
+                for batch_index, batch in enumerate(batched(dataset, self.cache_batch_size)):
                     cached_results = []
                     missing_examples = []
                     for i, item in enumerate(batch):
@@ -245,7 +246,8 @@ def infer(
                         else:
                             missing_examples.append((i, item))  # each element is index in batch and example
                     # infer on missing examples only, without indices
-                    logger.info(f"Inferring batch {batch_num} / {len(dataset) // self.cache_batch_size} with {len(missing_examples)} instances (found {len(cached_results)} instances in {self._cache.directory})")
+
+                    logger.info(f"Inferring batch {batch_index + 1} / {number_of_batches} with {len(missing_examples)} instances (found {len(cached_results)} instances in {self._cache.directory})")
                     if (len(missing_examples) > 0):
                         inferred_results = self._infer([e[1] for e in missing_examples], return_meta_data)
                         # recombined to index and value
@@ -257,9 +259,7 @@ def infer(
                             cache_key = self._get_cache_key(item)
                             self._cache[cache_key] = prediction
                     else:
-
-                        inferred_results = []
-
+                        inferred_results=[]
                     # Combine cached and inferred results in original order
                     batch_predictions = [p[1] for p in sorted(cached_results + inferred_results)]
                     result.extend(batch_predictions)
@@ -3313,8 +3313,7 @@ class HFOptionSelectingInferenceEngine(InferenceEngine, TorchDeviceMixin):
         }
 
     def get_engine_id(self):
-        return get_model_and_label_id(self.model, self.label)
-
+        return get_model_and_label_id(self.model_name, self.label)
 
     def prepare_engine(self):
         from transformers import AutoModelForCausalLM, AutoTokenizer
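
The cache logging fix above is easier to follow in isolation. Below is a minimal standalone sketch of the batch bookkeeping (generic Python, with a toy dict standing in for the engine's real cache and a local stand-in for the batched() helper; apart from number_of_batches, the names are ours, not the library's):

from itertools import islice

def batched(iterable, n):
    # Simple stand-in for the batched() helper used in inference.py.
    it = iter(iterable)
    while batch := list(islice(it, n)):
        yield batch

dataset = list(range(10))
cache = {0: "cached-0", 1: "cached-1"}   # toy cache keyed by item
cache_batch_size = 4

# Upper bound on the number of batches, mirroring the
# "batch_index + 1 / number_of_batches" progress message above.
number_of_batches = len(dataset) // cache_batch_size + 1

for batch_index, batch in enumerate(batched(dataset, cache_batch_size)):
    missing = [item for item in batch if item not in cache]
    print(f"Inferring batch {batch_index + 1} / {number_of_batches} "
          f"with {len(missing)} instances ({len(batch) - len(missing)} cached)")
    # When everything is already cached, nothing is sent to the backend,
    # which is the other behavior the commit fixes (no empty _infer call).
    inferred = {item: f"new-{item}" for item in missing} if missing else {}
    cache.update(inferred)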

src/unitxt/loaders.py

+1 −1

@@ -845,7 +845,7 @@ def verify(self):
 
     def _maybe_set_classification_policy(self):
         self.set_default_data_classification(
-            self.data_classification_policy or ["proprietary"], "when loading from python dictionary"
+            ["proprietary"], "when loading from python dictionary"
         )
 
     def load_iterables(self) -> MultiStream:

src/unitxt/metrics.py

+68 −10

@@ -3414,25 +3414,83 @@ def add_macro_scores(self, f1_result, recall_result, precision_result, result):
             result["precision_macro"] = self.zero_division
 
 
-class NER(CustomF1):
-    """F1 Metrics that receives as input a list of (Entity,EntityType) pairs."""
+class KeyValueExtraction(GlobalMetric):
 
-    prediction_type = List[Tuple[str, str]]
+    prediction_type = Dict[str,str]
+    metric : Metric
+    single_reference_per_prediction = True
+    main_score = ""
+
+    def prepare(self):
+        super().prepare()
+        self.main_score = f"{self.metric.main_score}_micro"
 
-    def get_element_group(self, element, additional_input):
-        return element[1]
+    def compute(
+        self,
+        references: List[List[Any]],
+        predictions: List[Any],
+        task_data: List[Dict],
+    ) -> dict:
+        references = [element[0] for element in references]
 
-    def get_element_representation(self, element, additional_input):
-        return str(element)
+        key_statistics = {}
+        all_reference_keys = set()
+        for reference in references:
+            all_reference_keys.update(list(reference.keys()))
+        for key in all_reference_keys:
+            key_statistics[key]= []
+
+        num_prediction_keys=0
+        illegal_prediction_keys=0
+        for reference, prediction in zip(references, predictions):
+            for key in all_reference_keys:
+                if (key not in reference and key not in prediction):
+                    continue
+                if (key in reference and key in prediction):
+                    multi_stream = MultiStream.from_iterables({"test": [{"prediction" : prediction[key],
+                                                                         "references" : [reference[key]]}
+                                                                        ]})
+                    output_multi_stream = self.metric(multi_stream)
+                    output_stream = output_multi_stream["test"]
+                    score = next(iter(output_stream))["score"]["global"]["score"]
+                    key_statistics[key].append(score)
+                else:
+                    key_statistics[key].append(0.0)
+
+            for key in prediction.keys():
+                num_prediction_keys += 1
+                if key not in all_reference_keys:
+                    illegal_prediction_keys += 1
 
+        result={}
 
-class KeyValueExtraction(CustomF1):
-    """F1 Metrics that receives as input a list of (Key,Value) pairs."""
+        average = 0
+        total = 0
+
+        weighted_average = 0
+        for key in key_statistics:
+            mean_for_key = numpy.mean(key_statistics[key])
+            num = len(key_statistics[key])
+            total += num
+            average += mean_for_key
+            weighted_average += mean_for_key * num
+            result[f"{self.metric.main_score}_{key}"] = mean_for_key
+
+        result[f"{self.metric.main_score}_micro"] = weighted_average / total
+        result[f"{self.metric.main_score}_macro"] = average / len(key_statistics)
+        if (num_prediction_keys !=0):
+            result[f"{self.metric.main_score}_legal_keys_in_predictions"] = 1 - 1.0 * illegal_prediction_keys / num_prediction_keys
+        else:
+            result[f"{self.metric.main_score}_legal_keys_in_predictions"] = 0
+
+        return result
+
+
+class NER(CustomF1):
+    """F1 Metrics that receives as input a list of (Entity,EntityType) pairs."""
 
     prediction_type = List[Tuple[str, str]]
 
     def get_element_group(self, element, additional_input):
-        return element[0]
+        return element[1]
 
     def get_element_representation(self, element, additional_input):
         return str(element)
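
To make the micro/macro distinction in the new KeyValueExtraction metric concrete, here is a small standalone recalculation of the aggregation step (plain Python, with exact match standing in for the configurable inner metric; variable names echo the class above, but nothing here calls unitxt):

from statistics import mean

# Two instances; "zip" appears only in the second reference.
references = [{"name": "Ann", "city": "Paris"}, {"name": "Bob", "city": "Rome", "zip": "00100"}]
predictions = [{"name": "Ann", "city": "Lyon"}, {"name": "Bob", "city": "Rome"}]

all_reference_keys = set().union(*(r.keys() for r in references))

# Per-key score lists: one entry per instance where the key appears on either
# side; a matched pair is scored by exact match, a one-sided key scores 0.0.
key_statistics = {key: [] for key in all_reference_keys}
for reference, prediction in zip(references, predictions):
    for key in all_reference_keys:
        if key not in reference and key not in prediction:
            continue
        if key in reference and key in prediction:
            key_statistics[key].append(float(prediction[key] == reference[key]))
        else:
            key_statistics[key].append(0.0)

per_key = {key: mean(scores) for key, scores in key_statistics.items()}
accuracy_macro = mean(per_key.values())   # unweighted mean over keys -> 0.5
accuracy_micro = sum(map(sum, key_statistics.values())) / sum(map(len, key_statistics.values()))   # 3/5 = 0.6

print(per_key)                            # name: 1.0, city: 0.5, zip: 0.0 (order may vary)
print(accuracy_macro, accuracy_micro)     # macro and micro differ once keys occur unevenly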
