Sanitize NaN/Infinity in metrics JSON before parsing

sutaakar · claude · sutaakar · commit 0f225763a0d1 · 2026-04-07T09:41:42.000+02:00
Python's json.dumps() allows NaN and Infinity by default, but these
are not valid JSON per the spec. When training metrics contain these
values (e.g., grad_norm when loss=0, eval_loss with bf16 on ROCm),
Go's json.Unmarshal fails, causing the operator to never capture
training progress.

Add sanitizeJSON() to replace NaN/Infinity with null before parsing
in both PollTrainingProgress and CaptureMetricsFromTerminationMessage.

Ref: RHOAIENG-56898

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/pkg/rhai/progression/progression.go b/pkg/rhai/progression/progression.go
@@ -22,6 +22,7 @@ import (
 	"fmt"
 	"io"
 	"net/http"
+	"regexp"
 	"strconv"
 	"sync"
 	"time"
@@ -283,7 +284,7 @@ func PollTrainingProgress(ctx context.Context, pod *corev1.Pod, metricsPort stri
 	}
 
 	var status TrainerStatus
-	if err := json.Unmarshal(body, &status); err != nil {
+	if err := json.Unmarshal(sanitizeJSON(body), &status); err != nil {
 		return nil, fmt.Errorf("failed to parse metrics JSON: %w", err)
 	}
 
@@ -292,6 +293,27 @@ func PollTrainingProgress(ctx context.Context, pod *corev1.Pod, metricsPort stri
 	return &status, nil
 }
 
+// nonFiniteToken matches either a complete JSON string literal or a bare
+// NaN, Infinity or -Infinity token. String literals are matched (and then
+// left untouched) so that text such as "loss: NaN" inside a string value or
+// key is never rewritten.
+var nonFiniteToken = regexp.MustCompile(`"(?:[^"\\]|\\.)*"|\bNaN\b|-?\bInfinity\b`)
+
+// sanitizeJSON returns a copy of data in which every bare NaN, Infinity and
+// -Infinity token is replaced with null. Python's json.dumps() emits these
+// tokens by default, but they are not valid JSON and make json.Unmarshal
+// fail. Tokens are replaced wherever a value may appear (object members and
+// array elements); surrounding whitespace is preserved.
+func sanitizeJSON(data []byte) []byte {
+	return nonFiniteToken.ReplaceAllFunc(data, func(tok []byte) []byte {
+		if tok[0] == '"' {
+			// A string literal: keep it byte-for-byte.
+			return tok
+		}
+		return []byte("null")
+	})
+}
+
 // cleanInvalidMetrics removes invalid values while keeping valid fields.
 // Defense against custom implementations, malformed requests, or edge cases.
 func cleanInvalidMetrics(m *TrainerStatus) {
@@ -495,7 +517,7 @@ func CaptureMetricsFromTerminationMessage(ctx context.Context, pod *corev1.Pod)
 
 		// Parse JSON from termination message
 		var status AnnotationStatus
-		if err := json.Unmarshal([]byte(message), &status); err != nil {
+		if err := json.Unmarshal(sanitizeJSON([]byte(message)), &status); err != nil {
 			return nil, fmt.Errorf("failed to parse termination message JSON: %w", err)
 		}
 
diff --git a/pkg/rhai/progression/progression_test.go b/pkg/rhai/progression/progression_test.go
@@ -271,6 +271,53 @@ func TestPollTrainingProgress(t *testing.T) {
 			wantStatus:     nil,
 			wantErr:        true,
 		},
+		{
+			name: "NaN and Infinity values are sanitized to null",
+			responseBody: `{
+				"progressPercentage": 100,
+				"estimatedRemainingSeconds": 0,
+				"currentStep": 50,
+				"totalSteps": 50,
+				"currentEpoch": 5,
+				"totalEpochs": 5,
+				"trainMetrics": {"loss": 0.0, "grad_norm": NaN, "learning_rate": 1e-06},
+				"evalMetrics": {"eval_loss": NaN, "eval_runtime": 0.04}
+			}`,
+			responseStatus: http.StatusOK,
+			wantStatus: &TrainerStatus{
+				ProgressPercentage:        ptrInt(100),
+				EstimatedRemainingSeconds: ptrInt(0),
+				CurrentStep:               ptrInt(50),
+				TotalSteps:                ptrInt(50),
+				CurrentEpoch:              ptrFloat64(5),
+				TotalEpochs:               ptrInt(5),
+				TrainMetrics: map[string]interface{}{
+					"loss":          0.0,
+					"grad_norm":     nil,
+					"learning_rate": 1e-06,
+				},
+				EvalMetrics: map[string]interface{}{
+					"eval_loss":    nil,
+					"eval_runtime": 0.04,
+				},
+			},
+			wantErr: false,
+		},
+		{
+			name: "negative Infinity is sanitized to null",
+			responseBody: `{
+				"progressPercentage": 50,
+				"trainMetrics": {"loss": -Infinity}
+			}`,
+			responseStatus: http.StatusOK,
+			wantStatus: &TrainerStatus{
+				ProgressPercentage: ptrInt(50),
+				TrainMetrics: map[string]interface{}{
+					"loss": nil,
+				},
+			},
+			wantErr: false,
+		},
 	}
 
 	for _, tt := range tests {
@@ -537,6 +584,59 @@ func TestGetPrimaryPod(t *testing.T) {
 	}
 }
 
+// TestSanitizeJSON feeds sanitizeJSON documents with and without NaN/Infinity
+// tokens and checks both the exact output and that the output parses as JSON.
+func TestSanitizeJSON(t *testing.T) {
+	cases := []struct {
+		name, raw, expected string
+	}{
+		{
+			name:     "NaN replaced with null",
+			raw:      `{"grad_norm": NaN, "loss": 0.5}`,
+			expected: `{"grad_norm": null, "loss": 0.5}`,
+		},
+		{
+			name:     "Infinity replaced with null",
+			raw:      `{"loss": Infinity}`,
+			expected: `{"loss": null}`,
+		},
+		{
+			name:     "negative Infinity replaced with null",
+			raw:      `{"loss": -Infinity}`,
+			expected: `{"loss": null}`,
+		},
+		{
+			name:     "multiple NaN values",
+			raw:      `{"grad_norm": NaN, "eval_loss": NaN, "loss": 0.1}`,
+			expected: `{"grad_norm": null, "eval_loss": null, "loss": 0.1}`,
+		},
+		{
+			name:     "no special values unchanged",
+			raw:      `{"loss": 0.5, "step": 100}`,
+			expected: `{"loss": 0.5, "step": 100}`,
+		},
+		{
+			name:     "NaN in string value not replaced",
+			raw:      `{"name": "NaNcy", "loss": NaN}`,
+			expected: `{"name": "NaNcy", "loss": null}`,
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			out := sanitizeJSON([]byte(tc.raw))
+			if got := string(out); got != tc.expected {
+				t.Errorf("sanitizeJSON() = %q, want %q", got, tc.expected)
+			}
+			// The sanitized bytes must decode as a JSON object.
+			var parsed map[string]interface{}
+			if err := json.Unmarshal(out, &parsed); err != nil {
+				t.Errorf("sanitizeJSON() produced invalid JSON: %v", err)
+			}
+		})
+	}
+}
+
 func TestCleanInvalidMetrics(t *testing.T) {
 	tests := []struct {
 		name   string
@@ -1267,6 +1367,39 @@ func TestCaptureMetricsFromTerminationMessage(t *testing.T) {
 			wantErr: true,
 			wantNil: true,
 		},
+		{
+			name: "NaN and Infinity in termination message are sanitized",
+			pod: &corev1.Pod{
+				Status: corev1.PodStatus{
+					ContainerStatuses: []corev1.ContainerStatus{
+						{
+							Name: "node",
+							State: corev1.ContainerState{
+								Terminated: &corev1.ContainerStateTerminated{
+									Message: `{"progressPercentage": 100, "estimatedRemainingSeconds": 0, "currentStep": 50, "totalSteps": 50, "trainMetrics": {"loss": 0.0, "grad_norm": NaN}, "evalMetrics": {"eval_loss": NaN, "eval_runtime": 0.04}}`,
+								},
+							},
+						},
+					},
+				},
+			},
+			wantErr: false,
+			wantNil: false,
+			checkFunc: func(t *testing.T, status *AnnotationStatus) {
+				if status.ProgressPercentage == nil || *status.ProgressPercentage != 100 {
+					t.Errorf("ProgressPercentage = %v, want 100", status.ProgressPercentage)
+				}
+				if status.TrainMetrics == nil {
+					t.Fatal("TrainMetrics is nil")
+				}
+				if status.TrainMetrics["grad_norm"] != nil {
+					t.Errorf("grad_norm should be nil (sanitized from NaN), got %v", status.TrainMetrics["grad_norm"])
+				}
+				if status.EvalMetrics["eval_loss"] != nil {
+					t.Errorf("eval_loss should be nil (sanitized from NaN), got %v", status.EvalMetrics["eval_loss"])
+				}
+			},
+		},
 		{
 			name: "invalid JSON in termination message",
 			pod: &corev1.Pod{