Skip to content

Commit 1409ba7

Browse files
authored
Merge pull request #779 from NVIDIA/am/bug-4844156
Address issues with Sleep test over K8s
2 parents fa1428b + cc5ffbe commit 1409ba7

File tree

5 files changed: 112 additions and 12 deletions (+112 / -12 lines changed)

src/cloudai/_core/json_gen_strategy.py

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2-
# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
33
# SPDX-License-Identifier: Apache-2.0
44
#
55
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -55,8 +55,13 @@ def sanitize_k8s_job_name(self, job_name: str) -> str:
5555
sanitized_name = job_name.lower()
5656
sanitized_name = re.sub(r"[^a-z0-9-]", "-", sanitized_name)
5757
sanitized_name = re.sub(r"^[^a-z0-9]+", "", sanitized_name)
58+
sanitized_name = sanitized_name[:253]
5859
sanitized_name = re.sub(r"[^a-z0-9]+$", "", sanitized_name)
59-
return sanitized_name[:253]
60+
61+
if not sanitized_name:
62+
raise ValueError(f"'{job_name}' cannot be sanitized to a valid Kubernetes job name.")
63+
64+
return sanitized_name
6065

6166
def store_test_run(self) -> None:
6267
from cloudai.models.scenario import TestRunDetails

src/cloudai/systems/kubernetes/kubernetes_system.py

Lines changed: 20 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2-
# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
33
# SPDX-License-Identifier: Apache-2.0
44
#
55
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -431,13 +431,25 @@ def _delete_mpi_job(self, job_name: str) -> None:
431431

432432
def _delete_batch_job(self, job_name: str) -> None:
433433
logging.debug(f"Deleting batch job '{job_name}'")
434-
api_response = self.batch_v1.delete_namespaced_job(
435-
name=job_name,
436-
namespace=self.default_namespace,
437-
body=lazy.k8s.client.V1DeleteOptions(propagation_policy="Foreground", grace_period_seconds=5),
438-
)
439-
api_response = cast("k8s.client.V1Job", api_response)
434+
try:
435+
api_response = self.batch_v1.delete_namespaced_job(
436+
name=job_name,
437+
namespace=self.default_namespace,
438+
body=lazy.k8s.client.V1DeleteOptions(propagation_policy="Foreground", grace_period_seconds=5),
439+
)
440+
except lazy.k8s.client.ApiException as e:
441+
if e.status == 404:
442+
logging.debug(f"Batch job '{job_name}' not found. It may have already been deleted.")
443+
return
444+
445+
logging.error(
446+
f"An error occurred while attempting to delete batch job '{job_name}'. "
447+
f"Error code: {e.status}. Message: {e.reason}. "
448+
"Please verify the job name and Kubernetes API server."
449+
)
450+
raise
440451

452+
api_response = cast("k8s.client.V1Status", api_response)
441453
logging.debug(f"Batch job '{job_name}' deleted with status: {api_response.status}")
442454

443455
def _delete_dynamo_graph_deployment(self, job_name: str) -> None:
@@ -662,7 +674,7 @@ def store_logs_for_job(self, job_name: str, output_dir: Path) -> None:
662674
"""
663675
pod_names = self.get_pod_names_for_job(job_name)
664676
if not pod_names:
665-
logging.warning(f"No pods found for job '{job_name}'")
677+
logging.debug(f"No pods found for job '{job_name}'")
666678
return
667679

668680
output_dir.mkdir(parents=True, exist_ok=True)

src/cloudai/workloads/sleep/kubernetes_json_gen_strategy.py

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2-
# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
33
# SPDX-License-Identifier: Apache-2.0
44
#
55
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -33,7 +33,10 @@ def gen_json(self) -> Dict[Any, Any]:
3333
job_spec = {
3434
"apiVersion": "batch/v1",
3535
"kind": "Job",
36-
"metadata": {"name": self.test_run.name, "namespace": kubernetes_system.default_namespace},
36+
"metadata": {
37+
"name": self.sanitize_k8s_job_name(self.test_run.name),
38+
"namespace": kubernetes_system.default_namespace,
39+
},
3740
"spec": {
3841
"ttlSecondsAfterFinished": 0,
3942
"template": {
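[New test file; its path was not captured in this extract. It contains unit tests for `JsonGenStrategy.sanitize_k8s_job_name`.]

Lines changed: 52 additions & 0 deletions
Original file line number | Diff line number | Diff line change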
@@ -0,0 +1,52 @@
1+
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2+
# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
18+
import pytest
19+
20+
from cloudai.core import JsonGenStrategy, TestRun
21+
from cloudai.systems.kubernetes import KubernetesSystem
22+
23+
24+
class MyJsonGenStrategy(JsonGenStrategy):
25+
def gen_json(self) -> dict:
26+
return {}
27+
28+
29+
@pytest.mark.parametrize(
30+
"tname,expected",
31+
[
32+
("simple-name", "simple-name"),
33+
("name_with_underscores", "name-with-underscores"),
34+
("name.with.dots", "name-with-dots"),
35+
("name@with#special$chars", "name-with-special-chars"),
36+
("NameWithUpperCase", "namewithuppercase"),
37+
("a" * 260, "a" * 253),
38+
("---leading-and-trailing---", "leading-and-trailing"),
39+
("a" * 250 + "-" * 3 + "b" * 10, "a" * 250), # ensure no trailing hyphens on truncation
40+
],
41+
)
42+
def test_job_name_sanitization(k8s_system: KubernetesSystem, base_tr: TestRun, tname: str, expected: str) -> None:
43+
base_tr.name = tname
44+
json_gen = MyJsonGenStrategy(k8s_system, base_tr)
45+
assert json_gen.sanitize_k8s_job_name(base_tr.name) == expected
46+
47+
48+
def test_job_name_sanitization_raises(k8s_system: KubernetesSystem, base_tr: TestRun) -> None:
49+
base_tr.name = "!@#$%^&*()"
50+
json_gen = MyJsonGenStrategy(k8s_system, base_tr)
51+
with pytest.raises(ValueError):
52+
json_gen.sanitize_k8s_job_name(base_tr.name)
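[New test file; its path was not captured in this extract. It contains a unit test checking that `SleepKubernetesJsonGenStrategy.gen_json` uses the sanitized job name.]

Lines changed: 28 additions & 0 deletions
Original file line number | Diff line number | Diff line change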
@@ -0,0 +1,28 @@
1+
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2+
# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
18+
from cloudai.core import TestRun
19+
from cloudai.systems.kubernetes import KubernetesSystem
20+
from cloudai.workloads.sleep import SleepCmdArgs, SleepKubernetesJsonGenStrategy, SleepTestDefinition
21+
22+
23+
def test_job_name_sanitization(k8s_system: KubernetesSystem) -> None:
24+
tdef = SleepTestDefinition(name="name", description="desc", test_template_name="tt", cmd_args=SleepCmdArgs())
25+
tr = TestRun(name="t!e@st#-n$am%e^", test=tdef, nodes=["node1"], num_nodes=1)
26+
json_gen = SleepKubernetesJsonGenStrategy(k8s_system, tr)
27+
28+
assert json_gen.gen_json()["metadata"]["name"] == json_gen.sanitize_k8s_job_name(tr.name)

0 commit comments

Comments (0)