Skip to content

Commit 665bb3f

Browse files
fix: add preStop hook e2e test cases
Signed-off-by: abhijeet-dhumal <abhijeetdhumal652@gmail.com>
1 parent 0f211ce commit 665bb3f

4 files changed

Lines changed: 37 additions & 18 deletions

File tree

pkg/rhai/e2e/progression_e2e_test.go

Lines changed: 17 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -541,14 +541,27 @@ var _ = ginkgo.Describe("RHAI Progression Tracking E2E Tests", func() {
541541

542542
ginkgo.By("Verifying controller continues to reconcile despite connection errors")
543543
// Controller should log errors but continue running
544-
// TrainJob should not have trainerStatus annotation since metrics are unreachable
544+
// TrainJob should not have trainerStatus annotation since metrics are unreachable during running phase
545545
gomega.Consistently(func(g gomega.Gomega) {
546546
gotTrainJob := &trainer.TrainJob{}
547547
g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(trainJob), gotTrainJob)).Should(gomega.Succeed())
548548

549-
// Annotation should not be created if metrics are never reachable
550-
_, exists := gotTrainJob.Annotations[constants.AnnotationTrainerStatus]
551-
g.Expect(exists).Should(gomega.BeFalse(), "trainerStatus should not be created when metrics are unreachable")
549+
// Only check while job is still running (not completed/failed)
550+
isRunning := true
551+
for _, cond := range gotTrainJob.Status.Conditions {
552+
if (cond.Type == trainer.TrainJobComplete || cond.Type == trainer.TrainJobFailed) &&
553+
cond.Status == metav1.ConditionTrue {
554+
isRunning = false
555+
break
556+
}
557+
}
558+
559+
// Annotation should not be created while running if metrics are unreachable
560+
// (It will be synthesized after completion, which is checked later)
561+
if isRunning {
562+
_, exists := gotTrainJob.Annotations[constants.AnnotationTrainerStatus]
563+
g.Expect(exists).Should(gomega.BeFalse(), "trainerStatus should not be created during running when metrics are unreachable")
564+
}
552565
}, 10*time.Second, interval).Should(gomega.Succeed())
553566

554567
ginkgo.By("Waiting for TrainJob to complete despite metrics errors")

pkg/rhai/e2e/resources/failing-test-runtime.yaml

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@ spec:
1212
replicatedJobs:
1313
- name: node
1414
template:
15+
metadata:
16+
labels:
17+
trainer.kubeflow.org/trainjob-ancestor-step: trainer
1518
spec:
1619
backoffLimit: 0
1720
template:
@@ -40,7 +43,7 @@ spec:
4043
"trainMetrics": {"loss": 1.0},
4144
"evalMetrics": {}
4245
}
43-
46+
4447
def do_GET(self):
4548
if self.path == '/metrics':
4649
self.send_response(200)
@@ -49,7 +52,7 @@ spec:
4952
self.wfile.write(json.dumps(self.progress_data).encode())
5053
else:
5154
self.send_error(404)
52-
55+
5356
def log_message(self, *args): pass
5457
5558
def start_metrics_server(port=28080):
@@ -59,20 +62,20 @@ spec:
5962
6063
# Start metrics server
6164
start_metrics_server(28080)
62-
65+
6366
# Wait briefly for server to be ready
6467
time.sleep(1)
6568
6669
# Fast training that will fail at 50% (3 seconds total)
6770
print("Starting training that will fail...")
6871
total_steps = 30
6972
fail_at_step = 15 # Fail at 50%
70-
73+
7174
for step in range(fail_at_step):
7275
time.sleep(0.2) # 0.2s per step
7376
progress = int((step / total_steps) * 100)
7477
remaining = int((total_steps - step) * 0.2)
75-
78+
7679
MetricsHandler.progress_data = {
7780
"progressPercentage": progress,
7881
"estimatedRemainingSeconds": remaining,
@@ -99,4 +102,3 @@ spec:
99102
timeoutSeconds: 1
100103
failureThreshold: 2
101104
restartPolicy: Never
102-

pkg/rhai/e2e/resources/no-metrics-runtime.yaml

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@ spec:
1212
replicatedJobs:
1313
- name: node
1414
template:
15+
metadata:
16+
labels:
17+
trainer.kubeflow.org/trainjob-ancestor-step: trainer
1518
spec:
1619
backoffLimit: 0
1720
template:
@@ -29,15 +32,14 @@ spec:
2932
# This job does NOT expose metrics endpoint
3033
# Controller should handle connection errors gracefully
3134
print("Starting training WITHOUT metrics server...")
32-
35+
3336
# Fast simulation (5 seconds total)
3437
total_steps = 25
3538
for step in range(total_steps):
3639
time.sleep(0.2)
3740
if step % 5 == 0:
3841
print(f"Step {step}/{total_steps} (no metrics exposed)")
39-
42+
4043
print("Training completed (no metrics were exposed)")
4144
sys.exit(0)
4245
restartPolicy: Never
43-

pkg/rhai/e2e/resources/wrapper-test-runtime.yaml

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@ spec:
1212
replicatedJobs:
1313
- name: node
1414
template:
15+
metadata:
16+
labels:
17+
trainer.kubeflow.org/trainjob-ancestor-step: trainer
1518
spec:
1619
backoffLimit: 0
1720
template:
@@ -40,7 +43,7 @@ spec:
4043
"trainMetrics": {"loss": 1.0},
4144
"evalMetrics": {}
4245
}
43-
46+
4447
def do_GET(self):
4548
if self.path == '/metrics':
4649
self.send_response(200)
@@ -49,7 +52,7 @@ spec:
4952
self.wfile.write(json.dumps(self.progress_data).encode())
5053
else:
5154
self.send_error(404)
52-
55+
5356
def log_message(self, *args): pass
5457
5558
def start_metrics_server(port=28080):
@@ -59,7 +62,7 @@ spec:
5962
6063
# Start metrics server
6164
start_metrics_server(28080)
62-
65+
6366
# Wait briefly for server to be ready
6467
time.sleep(1)
6568
@@ -70,7 +73,7 @@ spec:
7073
time.sleep(0.2) # 0.2s per step = 10s total
7174
progress = int((step / total_steps) * 100)
7275
remaining = int((total_steps - step) * 0.2)
73-
76+
7477
MetricsHandler.progress_data = {
7578
"progressPercentage": progress,
7679
"estimatedRemainingSeconds": remaining,
@@ -95,4 +98,3 @@ spec:
9598
timeoutSeconds: 1
9699
failureThreshold: 2
97100
restartPolicy: Never
98-

0 commit comments

Comments
 (0)