Commit fdf1c56

fix: planner cleanup on job complete
Whenever a job is stopped, fails, or completes, the planner still keeps a record of the job's current pipelines. When the same job (or a new job with the same id) is restarted, the planner can therefore wrongly decide to scale_in whenever the queue level would allow a scale_in. To fix this, the planner now cleans up its pipeline data whenever the job context performs its cleanup.
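
To see the failure mode concretely, here is a minimal, self-contained sketch. Only pipeline_data and remove_pipeline_data mirror the real Planner; the PipelineData stub, the decide helper, and the queue_level threshold are hypothetical stand-ins for illustration.

class PipelineData:
    """Stand-in stub for the real PipelineData record."""


class Planner:
    def __init__(self) -> None:
        # keyed by job id, as in the real planner
        self.pipeline_data: dict[str, list[PipelineData]] = {}

    def remove_pipeline_data(self, job_id: str) -> None:
        """Remove pipeline data for job id."""
        if job_id in self.pipeline_data:
            del self.pipeline_data[job_id]

    def decide(self, job_id: str, queue_level: int) -> str:
        # Hypothetical decision rule: with a low queue level and more
        # than one recorded pipeline, scaling in looks justified.
        pipelines = self.pipeline_data.get(job_id, [])
        if queue_level < 10 and len(pipelines) > 1:
            return "scale_in"
        return "no_op"


planner = Planner()
planner.pipeline_data["job-1"] = [PipelineData(), PipelineData()]

# The job stops, but without cleanup its record survives. On restart
# with the same job id, the stale entries trigger a wrong scale_in:
print(planner.decide("job-1", queue_level=0))  # scale_in (wrong)

# With this commit, job_context.cleanup() drops the record first:
planner.remove_pipeline_data("job-1")
print(planner.decide("job-1", queue_level=0))  # no_op
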
1 parent: b275071

File tree

2 files changed: +6 −0 lines changed

infscale/controller/job_context.py
infscale/controller/planner.py

infscale/controller/job_context.py

Lines changed: 1 addition & 0 deletions
@@ -1270,6 +1270,7 @@ def cleanup(self) -> None:
         self._new_cfg = None
         self._flow_graph_patched = False
         self._worlds_conflict_count = {}
+        self.ctrl.planner.remove_pipeline_data(self.job_id)
 
     def _release_gpu_resources(self, agent_data: AgentMetaData) -> None:
         resources = self.ctrl.agent_contexts[agent_data.id].resources
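
Hooking the call into cleanup() keeps the reset in one place: the same method already clears the context's own per-run state (_new_cfg, _flow_graph_patched, _worlds_conflict_count), so the planner's record for the job id is dropped at exactly the moment the rest of the job's state is discarded.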

infscale/controller/planner.py

Lines changed: 5 additions & 0 deletions
@@ -110,6 +110,11 @@ def __init__(self, path: str, autoscale: bool) -> None:
 
         self.pipeline_data: dict[str, list[PipelineData]] = {}
 
+    def remove_pipeline_data(self, job_id: str) -> None:
+        """Remove pipeline data for job id."""
+        if job_id in self.pipeline_data:
+            del self.pipeline_data[job_id]
+
     def update_pipeline_data(self, wids_to_remove: set[str], job_id: str) -> None:
         """Update pipeline data based on worker ids."""
         if job_id not in self.pipeline_data:

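The membership check makes remove_pipeline_data idempotent: calling it for a job id the planner never tracked, or one already removed, is a silent no-op rather than a KeyError. An equivalent one-line form would be dict.pop with a default (a stylistic alternative, not what the commit uses):

self.pipeline_data.pop(job_id, None)  # remove the entry if present, ignore otherwise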