
Commit 88b8ce6

Refactor JaccardIndex and StringContainment metrics to use ReductionInstanceMetric (#1816)
* Refactor JaccardIndex and StringContainment metrics to use ReductionInstanceMetric
* Add JaccardIndexWords metric and corresponding tests
* Update StringContainment metric to use Any type for predictions and references
* Update spearmanr and mean squared error
* Return old metric and fix old tests
* Update documentation and test of JaccardIndex: renamed JaccardIndexWords to JaccardIndexString and made it more general by default (not providing the split). Also moved to a regex tokenizer in jaccard_index_words to make it work with multiple spaces.
* Added documentation and checking for MeanSquaredError
* Updated json
* Fix spearmanr
* Added RMSE metric
* Format
* Revert naming
* Add spearmanr_p_value to evaluation metrics in TestAPI
* Fix tests

---------

Signed-off-by: elronbandel <[email protected]>
Signed-off-by: Yoav Katz <[email protected]>
Co-authored-by: Yoav Katz <[email protected]>
1 parent e5c72b7 commit 88b8ce6

File tree: 12 files changed (+442, -117 lines)


prepare/metrics/jaccard_index.py

Lines changed: 48 additions & 2 deletions
@@ -1,8 +1,18 @@
 from unitxt import add_to_catalog
-from unitxt.metrics import JaccardIndex
+from unitxt.metrics import JaccardIndex, JaccardIndexString
+from unitxt.string_operators import RegexSplit
 from unitxt.test_utils.metrics import test_metric
 
-metric = JaccardIndex()
+metric = JaccardIndex(
+    __description__="""JaccardIndex metric that operates on predictions and references that are lists of elements.
+    For each prediction, it calculates the score as Intersect(prediction,reference)/Union(prediction,reference).
+    If multiple references exist, it takes, for each prediction, the best ratio achieved by one of the references.
+    It then aggregates the mean over all instances.
+
+    Note the metric assumes the prediction and references are either a set of elements or a list of elements.
+    If the prediction and references are strings, use JaccardIndexString metrics like "metrics.jaccard_index_words".
+    """
+)
 
 predictions = [["A", "B", "C"]]
 references = [[["B", "A", "D"]]]
@@ -27,3 +37,39 @@
 )
 
 add_to_catalog(metric, "metrics.jaccard_index", overwrite=True)
+
+
+metric = JaccardIndexString(
+    __description__="""JaccardIndex metric that operates on predictions and references that are strings.
+    It first splits the string into words using whitespace as a separator.
+
+    For each prediction, it calculates the ratio Intersect(prediction_words,reference_words)/Union(prediction_words,reference_words).
+    If multiple references exist, it takes the best ratio achieved by one of the references.
+
+    """,
+    splitter=RegexSplit(by=r"\s+"),
+)
+
+predictions = ["A B C"]
+references = [["B A D"]]
+
+instance_targets = [
+    {"jaccard_index": 0.5, "score": 0.5, "score_name": "jaccard_index"},
+]
+
+global_target = {
+    "jaccard_index": 0.5,
+    "score": 0.5,
+    "score_name": "jaccard_index",
+    "num_of_instances": 1,
+}
+
+outputs = test_metric(
+    metric=metric,
+    predictions=predictions,
+    references=references,
+    instance_targets=instance_targets,
+    global_target=global_target,
+)
+
+add_to_catalog(metric, "metrics.jaccard_index_words", overwrite=True)
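
As a quick illustration of what these two catalog entries compute, here is a small standalone sketch in plain Python (not the unitxt implementation): prediction and reference are treated as sets, the intersection-over-union ratio is taken, and with multiple references the best ratio per prediction is kept.

# Illustrative sketch only - plain Python, not the unitxt implementation.
import re

def jaccard_index(prediction, reference):
    """Intersection-over-union of two collections treated as sets."""
    pred_set, ref_set = set(prediction), set(reference)
    if not pred_set | ref_set:
        return 1.0  # convention chosen for this sketch when both sides are empty
    return len(pred_set & ref_set) / len(pred_set | ref_set)

def best_over_references(prediction, references):
    """With several references, keep the best ratio achieved for this prediction."""
    return max(jaccard_index(prediction, ref) for ref in references)

# List form, as tested for metrics.jaccard_index:
print(best_over_references(["A", "B", "C"], [["B", "A", "D"]]))  # 0.5

# String form, as tested for metrics.jaccard_index_words: split on whitespace first.
print(jaccard_index(re.split(r"\s+", "A B C"), re.split(r"\s+", "B A D")))  # 0.5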
Lines changed: 91 additions & 0 deletions
@@ -0,0 +1,91 @@
+from unitxt import add_to_catalog
+from unitxt.metrics import MeanSquaredError, RootMeanSquaredError
+from unitxt.test_utils.metrics import test_metric
+
+metric = MeanSquaredError(
+    __description__="""Metric to calculate the mean squared error (MSE) between the prediction and the reference values.
+
+    Assumes both the prediction and reference are floats.
+
+    Supports only a single reference per prediction.
+    """
+)
+predictions = [1.0, 2.0, 1.0]
+references = [[-1.0], [1.0], [0.0]]
+
+instance_targets = [
+    {"mean_squared_error": 4.0, "score": 4.0, "score_name": "mean_squared_error"},
+    {"mean_squared_error": 1.0, "score": 1.0, "score_name": "mean_squared_error"},
+    {"mean_squared_error": 1.0, "score": 1.0, "score_name": "mean_squared_error"},
+]
+
+global_target = {
+    "mean_squared_error": 2.0,
+    "score": 2.0,
+    "score_name": "mean_squared_error",
+    "mean_squared_error_ci_low": 1.0,
+    "mean_squared_error_ci_high": 4.0,
+    "score_ci_low": 1.0,
+    "score_ci_high": 4.0,
+    "num_of_instances": 3,
+}
+
+outputs = test_metric(
+    metric=metric,
+    predictions=predictions,
+    references=references,
+    instance_targets=instance_targets,
+    global_target=global_target,
+)
+
+add_to_catalog(metric, "metrics.mean_squared_error", overwrite=True)
+
+
+metric = RootMeanSquaredError(
+    __description__="""Metric to calculate the root mean squared error (RMSE) between the prediction and the reference values.
+
+    Assumes both the prediction and reference are floats.
+
+    Supports only a single reference per prediction.
+    """
+)
+
+
+instance_targets = [
+    {
+        "root_mean_squared_error": 2.0,
+        "score": 2.0,
+        "score_name": "root_mean_squared_error",
+    },
+    {
+        "root_mean_squared_error": 1.0,
+        "score": 1.0,
+        "score_name": "root_mean_squared_error",
+    },
+    {
+        "root_mean_squared_error": 1.0,
+        "score": 1.0,
+        "score_name": "root_mean_squared_error",
+    },
+]
+
+global_target = {
+    "root_mean_squared_error": 1.41,
+    "score": 1.41,
+    "score_name": "root_mean_squared_error",
+    "root_mean_squared_error_ci_low": 1.0,
+    "root_mean_squared_error_ci_high": 2.0,
+    "score_ci_low": 1.0,
+    "score_ci_high": 2.0,
+    "num_of_instances": 3,
+}
+
+outputs = test_metric(
+    metric=metric,
+    predictions=predictions,
+    references=references,
+    instance_targets=instance_targets,
+    global_target=global_target,
+)
+
+add_to_catalog(metric, "metrics.root_mean_squared_error", overwrite=True)
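
A back-of-the-envelope check of the expected global targets above, in plain Python (a sketch, not the unitxt reduction code): the per-instance squared errors are 4.0, 1.0 and 1.0, so the global MSE is their mean (2.0) and the global RMSE is the square root of that mean (about 1.41).

# Sketch only: verify the expected global MSE/RMSE values by hand.
import math

predictions = [1.0, 2.0, 1.0]
references = [[-1.0], [1.0], [0.0]]  # a single reference per prediction

squared_errors = [(p - r[0]) ** 2 for p, r in zip(predictions, references)]
print(squared_errors)                 # [4.0, 1.0, 1.0]

mse = sum(squared_errors) / len(squared_errors)
print(mse)                            # 2.0  -> metrics.mean_squared_error
print(round(math.sqrt(mse), 2))       # 1.41 -> metrics.root_mean_squared_error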

prepare/metrics/spearman.py

Lines changed: 14 additions & 20 deletions
@@ -1,36 +1,30 @@
 import numpy as np
 from unitxt import add_to_catalog
-from unitxt.metrics import MetricPipeline, Spearmanr
-from unitxt.operators import Copy
+from unitxt.metrics import Spearmanr
 from unitxt.test_utils.metrics import test_metric
 
-metric = MetricPipeline(
-    main_score="spearmanr",
-    preprocess_steps=[
-        Copy(field="references/0", to_field="references"),
-    ],
-    metric=Spearmanr(),
-    prediction_type=float,
-)
-
-predictions = [1.0, 2.0, 1.0]
-references = [[-1.0], [1.0], [0.0]]
+metric = Spearmanr(n_resamples=100)
+predictions = [1.0, 3.0, 1.1, 2.0, 8.0]
+references = [[-1.0], [1.0], [0.10], [2.0], [6.0]]
 
 instance_targets = [
     {"spearmanr": np.nan, "score": np.nan, "score_name": "spearmanr"},
     {"spearmanr": np.nan, "score": np.nan, "score_name": "spearmanr"},
     {"spearmanr": np.nan, "score": np.nan, "score_name": "spearmanr"},
+    {"spearmanr": np.nan, "score": np.nan, "score_name": "spearmanr"},
+    {"spearmanr": np.nan, "score": np.nan, "score_name": "spearmanr"},
 ]
 
 global_target = {
-    "spearmanr": 0.87,
-    "score": 0.87,
+    "num_of_instances": 5,
+    "score": 0.9,
+    "score_ci_high": 1.0,
+    "score_ci_low": 0.11,
     "score_name": "spearmanr",
-    "spearmanr_ci_low": np.nan,
-    "spearmanr_ci_high": np.nan,
-    "score_ci_low": np.nan,
-    "score_ci_high": np.nan,
-    "num_of_instances": 3,
+    "spearmanr": 0.9,
+    "spearmanr_ci_high": 1.0,
+    "spearmanr_ci_low": 0.11,
+    "spearmanr_p_value": 0.04,
 }
 
 outputs = test_metric(
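
The new expected values can be cross-checked directly with scipy. Spearman correlation is rank-based and computed over the whole set of instances, which is why the per-instance targets above are NaN. This is only an illustrative check using scipy.stats.spearmanr, not the metric's own code path:

# Sketch only: cross-check the expected global Spearman values with scipy.
from scipy.stats import spearmanr

predictions = [1.0, 3.0, 1.1, 2.0, 8.0]
references = [-1.0, 1.0, 0.10, 2.0, 6.0]  # the single reference of each instance

rho, p_value = spearmanr(predictions, references)
print(round(rho, 2), round(p_value, 2))  # 0.9 0.04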
Lines changed: 2 additions & 1 deletion
@@ -1,3 +1,4 @@
 {
-    "__type__": "jaccard_index"
+    "__type__": "jaccard_index",
+    "__description__": "JaccardIndex metric that operates on predictions and references that are lists of elements.\n    For each prediction, it calculates the score as Intersect(prediction,reference)/Union(prediction,reference).\n    If multiple references exist, it takes, for each prediction, the best ratio achieved by one of the references.\n    It then aggregates the mean over all instances.\n\n    Note the metric assumes the prediction and references are either a set of elements or a list of elements.\n    If the prediction and references are strings, use JaccardIndexString metrics like \"metrics.jaccard_index_words\".\n    "
 }
Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+{
+    "__type__": "jaccard_index_string",
+    "__description__": "JaccardIndex metric that operates on predictions and references that are strings.\n    It first splits the string into words using whitespace as a separator.\n\n    For each prediction, it calculates the ratio Intersect(prediction_words,reference_words)/Union(prediction_words,reference_words).\n    If multiple references exist, it takes the best ratio achieved by one of the references.\n\n    ",
+    "splitter": {
+        "__type__": "regex_split",
+        "by": "\\s+"
+    }
+}
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+{
+    "__type__": "mean_squared_error",
+    "__description__": "Metric to calculate the mean squared error (MSE) between the prediction and the reference values.\n\n    Assumes both the prediction and reference are floats.\n\n    Supports only a single reference per prediction.\n    "
+}
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+{
+    "__type__": "root_mean_squared_error",
+    "__description__": "Metric to calculate the root mean squared error (RMSE) between the prediction and the reference values.\n\n    Assumes both the prediction and reference are floats.\n\n    Supports only a single reference per prediction.\n    "
+}
Lines changed: 2 additions & 13 deletions
@@ -1,15 +1,4 @@
 {
-    "__type__": "metric_pipeline",
-    "main_score": "spearmanr",
-    "preprocess_steps": [
-        {
-            "__type__": "copy",
-            "field": "references/0",
-            "to_field": "references"
-        }
-    ],
-    "metric": {
-        "__type__": "spearmanr"
-    },
-    "prediction_type": "float"
+    "__type__": "spearmanr",
+    "n_resamples": 100
 }
