Skip to content

Commit 98a8aa2

Browse files
authored
fix: mlflow run id changes in SDK (eval-hub#85)
* fix: mlflow run id changes in SDK
* fix: lint errors
1 parent babf6fa commit 98a8aa2

5 files changed

Lines changed: 224 additions & 17 deletions

File tree

src/evalhub/adapter/callbacks.py

Lines changed: 42 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -32,17 +32,23 @@ class _MlflowOps:
3232
3333
from evalhub.adapter.mlflow import MlflowArtifact
3434
35-
callbacks.mlflow.save(
35+
rid = callbacks.mlflow.save(
3636
results,
3737
job_spec,
3838
artifacts=[
3939
MlflowArtifact("results.json", json_bytes, "application/json"),
4040
MlflowArtifact("report.html", html_bytes, "text/html"),
4141
],
4242
)
43+
if rid:
44+
results.mlflow_run_id = rid
4345
4446
Metrics, params, and all artifacts are saved in a single MLflow run.
45-
Does nothing if ``job_spec.experiment_name`` is not set.
47+
Does nothing if ``job_spec.experiment_name`` is not set (returns ``None``).
48+
49+
Returns the MLflow run id when a run is created. Assign it to
50+
``results.mlflow_run_id`` before ``callbacks.report_results(results)`` so
51+
Eval Hub stores the link.
4652
4753
The backend is controlled by the ``backend`` constructor argument or the
4854
``EVALHUB_MLFLOW_BACKEND`` environment variable:
@@ -60,16 +66,15 @@ def save(
6066
results: JobResults,
6167
job_spec: JobSpec,
6268
artifacts: list[MlflowArtifact] | None = None,
63-
) -> None:
69+
) -> str | None:
6470
if not job_spec.experiment_name:
6571
logger.debug("No MLflow experiment configured, skipping")
66-
return
72+
return None
6773

6874
try:
6975
if self._backend == MlflowBackend.UPSTREAM:
70-
self._save_upstream(results, job_spec, artifacts)
71-
else:
72-
self._save_odh(results, job_spec, artifacts)
76+
return self._save_upstream(results, job_spec, artifacts)
77+
return self._save_odh(results, job_spec, artifacts)
7378
except Exception as e:
7479
logger.error("Failed to save to MLflow: %s", e)
7580
raise RuntimeError(f"MLflow save failed: {e}") from e
@@ -82,16 +87,17 @@ def save(
8287
def _build_params_metrics(
8388
results: JobResults,
8489
) -> tuple[list, list]:
85-
from .mlflow import Metric, Param
90+
from .mlflow import Metric, Param, sanitize_metric_key_for_api
8691

8792
params = [
8893
Param("benchmark_id", results.benchmark_id),
8994
Param("model_name", results.model_name),
9095
Param("num_examples_evaluated", str(results.num_examples_evaluated)),
9196
Param("duration_seconds", str(results.duration_seconds)),
9297
]
98+
# MLflow rejects commas etc. in metric keys; Eval Hub keeps r.metric_name as-is.
9399
metrics: list[Metric] = [
94-
Metric(r.metric_name, float(r.metric_value))
100+
Metric(sanitize_metric_key_for_api(r.metric_name), float(r.metric_value))
95101
for r in results.results
96102
if isinstance(r.metric_value, int | float)
97103
]
@@ -104,21 +110,23 @@ def _save_odh(
104110
results: JobResults,
105111
job_spec: JobSpec,
106112
artifacts: list[MlflowArtifact] | None,
107-
) -> None:
113+
) -> str:
108114
from .mlflow import MlflowClient
109115

110116
params, metrics = self._build_params_metrics(results)
111117
run_tags: dict[str, str] = {
112118
tag["key"]: tag["value"] for tag in (job_spec.tags or [])
113119
}
114120

121+
run_id: str = ""
115122
with MlflowClient() as client:
116123
experiment_id = client.get_or_create_experiment(
117124
job_spec.experiment_name or ""
118125
)
119126
with client.start_run(
120127
experiment_id, run_name=job_spec.id, tags=run_tags
121-
) as run_id:
128+
) as rid:
129+
run_id = rid
122130
client.log_batch(run_id, metrics=metrics, params=params)
123131
for artifact in artifacts or []:
124132
client.upload_artifact(
@@ -129,20 +137,21 @@ def _save_odh(
129137
)
130138

131139
logger.info(
132-
"Saved to MLflow (odh) experiment '%s' (run: %s) — "
140+
"Saved to MLflow (odh) experiment '%s' (run_id: %s) — "
133141
"%d metric(s), %d artifact(s)",
134142
job_spec.experiment_name,
135-
job_spec.id,
143+
run_id,
136144
len(metrics),
137145
len(artifacts or []),
138146
)
147+
return run_id
139148

140149
def _save_upstream(
141150
self,
142151
results: JobResults,
143152
job_spec: JobSpec,
144153
artifacts: list[MlflowArtifact] | None,
145-
) -> None:
154+
) -> str:
146155
import tempfile
147156
from pathlib import Path as _Path
148157

@@ -160,7 +169,9 @@ def _save_upstream(
160169
}
161170

162171
mlflow.set_experiment(job_spec.experiment_name)
163-
with mlflow.start_run(run_name=job_spec.id, tags=run_tags):
172+
run_id = ""
173+
with mlflow.start_run(run_name=job_spec.id, tags=run_tags) as active_run:
174+
run_id = active_run.info.run_id
164175
mlflow.log_params({p.key: p.value for p in params})
165176
mlflow.log_metrics({m.key: m.value for m in metrics})
166177

@@ -177,13 +188,14 @@ def _save_upstream(
177188
mlflow.log_artifact(str(tmp_file), artifact_path=artifact_dir)
178189

179190
logger.info(
180-
"Saved to MLflow (upstream) experiment '%s' (run: %s) — "
191+
"Saved to MLflow (upstream) experiment '%s' (run_id: %s) — "
181192
"%d metric(s), %d artifact(s)",
182193
job_spec.experiment_name,
183-
job_spec.id,
194+
run_id,
184195
len(metrics),
185196
len(artifacts or []),
186197
)
198+
return run_id
187199

188200

189201
class DefaultCallbacks(JobCallbacks):
@@ -192,6 +204,16 @@ class DefaultCallbacks(JobCallbacks):
192204
This implementation:
193205
- Reports status updates to sidecar (if available) or logs them
194206
- Pushes OCI artifacts directly using OCIArtifactPersister
207+
- ``report_results(results)``: POSTs final results to Eval Hub; if
208+
``results.mlflow_run_id`` is set (for example from ``save()``), that id
209+
is included (if unset, the field is left out).
210+
211+
Example::
212+
213+
rid = callbacks.mlflow.save(results, job_spec)
214+
if rid:
215+
results.mlflow_run_id = rid
216+
callbacks.report_results(results)
195217
196218
This is the recommended callback implementation for both production and development.
197219
@@ -612,6 +634,9 @@ def report_results(self, results: JobResults) -> None:
612634
if self.provider_id:
613635
status_event["provider_id"] = self.provider_id
614636

637+
if results.mlflow_run_id:
638+
status_event["mlflow_run_id"] = results.mlflow_run_id
639+
615640
# Include OCI artifact reference if available
616641
if results.oci_artifact:
617642
status_event["artifacts"] = {

src/evalhub/adapter/mlflow.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import logging
1313
import mimetypes
1414
import os
15+
import re
1516
import time
1617
from collections.abc import Iterable, Iterator
1718
from contextlib import contextmanager
@@ -42,6 +43,19 @@ class Metric:
4243
step: int = 0
4344

4445

46+
# The MLflow Tracking REST endpoint (/runs/log-batch) rejects metric keys
# containing characters outside [A-Za-z0-9_\-.\s:/]; runs of anything else
# are collapsed to a single "_" by the sanitizer below.
_BAD_MLFLOW_METRIC_KEY_CHARS = re.compile(r"[^a-zA-Z0-9_\-.\s:/]+")


def sanitize_metric_key_for_api(name: str) -> str:
    """Return an MLflow-safe metric key derived from ``name``.

    Disallowed character runs become a single underscore (e.g. lm-eval's
    ``acc,none`` -> ``acc_none``); surrounding whitespace and underscores
    are trimmed, and a fully-consumed name falls back to ``"metric"``.

    Used only when logging to MLflow; ``JobResults`` metric names are
    left unchanged.
    """
    cleaned = _BAD_MLFLOW_METRIC_KEY_CHARS.sub("_", name)
    cleaned = cleaned.strip().strip("_")
    return cleaned if cleaned else "metric"
57+
58+
4559
@dataclass
4660
class Param:
4761
key: str

src/evalhub/adapter/models/job.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,11 @@ class JobResults(BaseModel):
260260
default=None, description="OCI artifact info if persisted"
261261
)
262262

263+
mlflow_run_id: str | None = Field(
264+
default=None,
265+
description="Optional MLflow run id included on the terminal results event when set",
266+
)
267+
263268

264269
class JobCallbacks(ABC):
265270
"""Abstract interface for job callbacks.
Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
"""Tests for DefaultCallbacks POST /events payload (mlflow_run_id)."""
2+
3+
from __future__ import annotations
4+
5+
from datetime import UTC, datetime
6+
from unittest.mock import MagicMock, patch
7+
8+
from evalhub.adapter.callbacks import DefaultCallbacks
9+
from evalhub.adapter.models.job import JobResults
10+
from evalhub.models.api import EvaluationResult
11+
12+
13+
def _results(mlflow_run_id: str | None = None) -> JobResults:
    """Build a minimal JobResults fixture, optionally carrying an MLflow run id."""
    single_metric = [
        EvaluationResult(metric_name="acc", metric_value=0.9, metric_type="float")
    ]
    return JobResults(
        id="job-1",
        benchmark_id="arc_easy",
        benchmark_index=0,
        model_name="m",
        results=single_metric,
        num_examples_evaluated=1,
        duration_seconds=1.0,
        completed_at=datetime.now(UTC),
        mlflow_run_id=mlflow_run_id,
    )
)
27+
28+
29+
def test_report_results_sends_mlflow_run_id_when_set_on_job_results() -> None:
    """When JobResults.mlflow_run_id is set, the posted status event includes it."""
    http = MagicMock()
    response = MagicMock()
    response.raise_for_status = MagicMock()
    http.post.return_value = response

    with patch.object(DefaultCallbacks, "_create_http_client", return_value=http):
        callbacks = DefaultCallbacks(
            job_id="job-1",
            benchmark_id="arc_easy",
            provider_id="lm_evaluation_harness",
            benchmark_index=0,
            sidecar_url="http://evalhub:8080",
            insecure=True,
        )

        callbacks.report_results(_results(mlflow_run_id="mlflow-run-abc"))

    http.post.assert_called_once()
    payload = http.post.call_args.kwargs["json"]
    assert payload["benchmark_status_event"]["mlflow_run_id"] == "mlflow-run-abc"
50+
51+
52+
def test_report_results_omits_mlflow_run_id_when_not_set() -> None:
    """Without a run id on JobResults, the event must not carry the key at all."""
    http = MagicMock()
    response = MagicMock()
    response.raise_for_status = MagicMock()
    http.post.return_value = response

    with patch.object(DefaultCallbacks, "_create_http_client", return_value=http):
        callbacks = DefaultCallbacks(
            job_id="job-1",
            benchmark_id="arc_easy",
            benchmark_index=0,
            sidecar_url="http://evalhub:8080",
            insecure=True,
        )

        callbacks.report_results(_results())

    payload = http.post.call_args.kwargs["json"]
    assert "mlflow_run_id" not in payload["benchmark_status_event"]
71+
72+
73+
def _make_spec_and_results():
    """Build the JobSpec/JobResults pair shared by the save() regression tests."""
    from evalhub.adapter.models.job import JobResults, JobSpec
    from evalhub.models.api import EvaluationResult, ModelConfig

    spec = JobSpec(
        id="j1",
        provider_id="p",
        benchmark_id="b",
        benchmark_index=0,
        model=ModelConfig(url="http://localhost/v1", name="m"),
        parameters={},
        callback_url="http://localhost/",
        experiment_name="exp",
    )
    results = JobResults(
        id="j1",
        benchmark_id="b",
        benchmark_index=0,
        model_name="m",
        results=[
            EvaluationResult(metric_name="acc", metric_value=1.0, metric_type="float")
        ],
        num_examples_evaluated=1,
        duration_seconds=1.0,
        completed_at=datetime.now(UTC),
    )
    return spec, results


def _check_save_forwards_run_id(backend, patched_method: str, expected: str) -> None:
    """Patch one backend save method and verify _MlflowOps.save() returns its id."""
    from evalhub.adapter.callbacks import _MlflowOps

    spec, results = _make_spec_and_results()
    ops = _MlflowOps(backend=backend)
    with patch.object(_MlflowOps, patched_method, return_value=expected) as mocked:
        rid = ops.save(results, spec)
        assert rid == expected
        mocked.assert_called_once()


def test_mlflow_save_returns_run_id_from_odh_path() -> None:
    """Regression: save() must return _save_odh/_save_upstream result (not None)."""
    from evalhub.adapter.config import MlflowBackend

    _check_save_forwards_run_id(MlflowBackend.ODH, "_save_odh", "run-from-odh")


def test_mlflow_save_returns_run_id_from_upstream_path() -> None:
    """Same regression as above for the upstream MLflow backend."""
    from evalhub.adapter.config import MlflowBackend

    _check_save_forwards_run_id(MlflowBackend.UPSTREAM, "_save_upstream", "run-upstream")
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
"""MLflow metric key sanitization for REST API rules."""
2+
3+
from evalhub.adapter.mlflow import sanitize_metric_key_for_api
4+
5+
6+
def test_sanitize_lm_eval_style_comma() -> None:
7+
assert sanitize_metric_key_for_api("acc,none") == "acc_none"
8+
9+
10+
def test_sanitize_preserves_allowed_chars() -> None:
11+
assert (
12+
sanitize_metric_key_for_api("exact_match,strict-match")
13+
== "exact_match_strict-match"
14+
)
15+
16+
17+
def test_sanitize_empty_fallback() -> None:
18+
assert sanitize_metric_key_for_api(",,,") == "metric"
19+
20+
21+
def test_sanitize_simple_name_unchanged() -> None:
22+
assert sanitize_metric_key_for_api("accuracy") == "accuracy"

0 commit comments

Comments
 (0)