Skip to content

Commit 62eac46

Browse files
committed
Handle jobs that are already gone during deletion
1 parent e690b6c commit 62eac46

File tree

1 file changed

+19
-7
lines changed

1 file changed

+19
-7
lines changed

src/cloudai/systems/kubernetes/kubernetes_system.py

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -431,13 +431,25 @@ def _delete_mpi_job(self, job_name: str) -> None:
 
     def _delete_batch_job(self, job_name: str) -> None:
         logging.debug(f"Deleting batch job '{job_name}'")
-        api_response = self.batch_v1.delete_namespaced_job(
-            name=job_name,
-            namespace=self.default_namespace,
-            body=lazy.k8s.client.V1DeleteOptions(propagation_policy="Foreground", grace_period_seconds=5),
-        )
-        api_response = cast("k8s.client.V1Job", api_response)
+        try:
+            api_response = self.batch_v1.delete_namespaced_job(
+                name=job_name,
+                namespace=self.default_namespace,
+                body=lazy.k8s.client.V1DeleteOptions(propagation_policy="Foreground", grace_period_seconds=5),
+            )
+        except lazy.k8s.client.ApiException as e:
+            if e.status == 404:
+                logging.debug(f"Batch job '{job_name}' not found. It may have already been deleted.")
+                return
 
+            logging.error(
+                f"An error occurred while attempting to delete batch job '{job_name}'. "
+                f"Error code: {e.status}. Message: {e.reason}. "
+                "Please verify the job name and Kubernetes API server."
+            )
+            raise
+
+        api_response = cast("k8s.client.V1Job", api_response)
         logging.debug(f"Batch job '{job_name}' deleted with status: {api_response.status}")
 
     def _delete_dynamo_graph_deployment(self, job_name: str) -> None:
@@ -662,7 +674,7 @@ def store_logs_for_job(self, job_name: str, output_dir: Path) -> None:
         """
         pod_names = self.get_pod_names_for_job(job_name)
         if not pod_names:
-            logging.warning(f"No pods found for job '{job_name}'")
+            logging.debug(f"No pods found for job '{job_name}'")
             return
 
         output_dir.mkdir(parents=True, exist_ok=True)

0 commit comments

Comments
 (0)