[tool] fix: attach to existing MLflow run when MLFLOW_RUN_ID is set (#4740)

dubin555 · web-flow · commit 2d02788dcceb · 2025-12-31T21:30:49.000+08:00
diff --git a/verl/utils/tracking.py b/verl/utils/tracking.py
@@ -86,10 +86,17 @@ def __init__(self, project_name, experiment_name, default_backend: str | list[st
             MLFLOW_TRACKING_URI = os.environ.get("MLFLOW_TRACKING_URI", "sqlite:////tmp/mlruns.db")
             mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
 
-            # Project_name is actually experiment_name in MLFlow
-            # If experiment does not exist, will create a new experiment
-            experiment = mlflow.set_experiment(project_name)
-            mlflow.start_run(experiment_id=experiment.experiment_id, run_name=experiment_name)
+            # Some cloud providers, such as Azure ML or Databricks, automatically set MLFLOW_RUN_ID.
+            # If it is set, attach to that existing run instead of creating a new one.
+            run_id = os.environ.get("MLFLOW_RUN_ID")
+            if run_id:
+                mlflow.start_run(run_id=run_id)
+            else:
+                # project_name is used as the experiment name in MLflow.
+                # If the experiment does not exist, a new one is created.
+                experiment = mlflow.set_experiment(project_name)
+                mlflow.start_run(experiment_id=experiment.experiment_id, run_name=experiment_name)
+
             mlflow.log_params(_compute_mlflow_params_from_objects(config))
             self.logger["mlflow"] = _MlflowLoggingAdapter()