fix: prevent int32 overflow in MonoVertex autoscaler desiredReplicas (#3421)

suryapratap-01 · web-flow · commit 49ce7d32c19c · 2026-05-13T07:44:51.000-07:00
Signed-off-by: suryapratap-01 <suryapratap.personal@gmail.com>
diff --git a/pkg/reconciler/monovertex/scaling/scaling.go b/pkg/reconciler/monovertex/scaling/scaling.go
@@ -308,17 +308,31 @@ func (s *Scaler) desiredReplicas(_ context.Context, monoVtx *dfv1.MonoVertex, pr
 		return int32(monoVtx.Status.Replicas)
 	}
 
-	var desired int32
 	// We calculate the time of finishing processing the pending messages,
 	// and then we know how many replicas are needed to get them done in target seconds.
-	desired = int32(math.Round(((float64(pending) / processingRate) / float64(monoVtx.Spec.Scale.GetTargetProcessingSeconds())) * float64(monoVtx.Status.ReadyReplicas)))
+	// Clamp the float64 result before casting to int32 to prevent wraparound when the
+	// intermediate value exceeds math.MaxInt32 (can happen with large pending + near-zero rate).
+	desiredRaw := math.Round(((float64(pending) / processingRate) / float64(monoVtx.Spec.Scale.GetTargetProcessingSeconds())) * float64(monoVtx.Status.ReadyReplicas))
+	if desiredRaw > math.MaxInt32 {
+		desiredRaw = math.MaxInt32
+	}
+	desired := int32(desiredRaw)
 
 	// we only scale down to zero when the pending and rate are both zero.
 	if desired == 0 {
 		desired = 1
 	}
-	if desired > int32(pending) && pending > 0 { // For some corner cases, we don't want to scale up to more than pending.
-		desired = int32(pending)
+	// For some corner cases, we don't want to scale up to more than pending.
+	// pending is int64 (matches the daemon's wrapperspb.Int64Value type), but desired
+	// replicas must fit in int32 per the Kubernetes replica spec, so we guard the cast.
+	if pending > 0 {
+		pendingCap := int32(math.MaxInt32)
+		if pending <= math.MaxInt32 {
+			pendingCap = int32(pending)
+		}
+		if desired > pendingCap {
+			desired = pendingCap
+		}
 	}
 	return desired
 }
diff --git a/pkg/reconciler/monovertex/scaling/scaling_test.go b/pkg/reconciler/monovertex/scaling/scaling_test.go
@@ -15,3 +15,130 @@ limitations under the License.
 */
 
 package scaling
+
+import (
+	"context"
+	"math"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+
+	dfv1 "github.com/numaproj/numaflow/pkg/apis/numaflow/v1alpha1"
+)
+
+func monoVtxWithScale(targetSec uint32, readyReplicas uint32, currentReplicas uint32) *dfv1.MonoVertex {
+	vtx := new(dfv1.MonoVertex) // test fixture: zero-valued MonoVertex, only the fields below are populated
+	vtx.Status.Replicas = currentReplicas
+	vtx.Status.ReadyReplicas = readyReplicas
+	vtx.Spec.Scale.TargetProcessingSeconds = &targetSec // points at this call's own copy of the parameter
+	return vtx
+}
+
+// TestDesiredReplicas table-tests Scaler.desiredReplicas, including the int32 overflow and pending-cap edge cases.
+func TestDesiredReplicas(t *testing.T) {
+	s := &Scaler{}
+
+	tests := []struct {
+		name            string
+		pending         int64
+		processingRate  float64
+		targetSec       uint32
+		readyReplicas   uint32
+		currentReplicas uint32
+		expected        int32
+	}{
+		{
+			name:           "bothZero_scaleToZero",
+			pending:        0,
+			processingRate: 0,
+			targetSec:      20,
+			readyReplicas:  1,
+			expected:       0,
+		},
+		{
+			name:            "rateZero_returnsCurrent",
+			pending:         100,
+			processingRate:  0,
+			targetSec:       20,
+			readyReplicas:   1,
+			currentReplicas: 3,
+			expected:        3,
+		},
+		{
+			name:           "normal",
+			pending:        100,
+			processingRate: 5,
+			targetSec:      20,
+			readyReplicas:  1,
+			expected:       1,
+		},
+		{
+			name:           "desiredZero_clampedToOne",
+			pending:        1,
+			processingRate: 1000,
+			targetSec:      20,
+			readyReplicas:  1,
+			expected:       1,
+		},
+		{
+			// desired = round((3/0.5)/20 * 1) = round(0.3) = 0 → clamped to 1.
+			// pending cap (3) > 1 so no further cap applied.
+			name:           "capByPending_desiredLessThanPending",
+			pending:        3,
+			processingRate: 0.5,
+			targetSec:      20,
+			readyReplicas:  1,
+			expected:       1,
+		},
+		{
+			// pending cap path: desired > pending, so cap to pending.
+			name:           "capByPending_desiredGreaterThanPending",
+			pending:        3,
+			processingRate: 0.01,
+			targetSec:      1,
+			readyReplicas:  5,
+			expected:       3,
+		},
+		{
+			// Regression test modelled on issue #3415: pending=100,000, rate=0.000001 msg/s, targetSec=20,
+			// readyReplicas=1 → raw float64 = (1e5/1e-6)/20 = 5,000,000,000 > math.MaxInt32 (2,147,483,647).
+			// After the MaxInt32 float clamp, desired is capped to pending (100,000). A rate of 0.001 gives
+			// only 5,000,000 and never hits the clamp. NOTE(review): confirm the rate against the issue.
+			name:           "overflow_fromIssue3415",
+			pending:        100_000,
+			processingRate: 0.000001,
+			targetSec:      20,
+			readyReplicas:  1,
+			expected:       100_000,
+		},
+		{
+			name:           "extremeOverflow",
+			pending:        1_000_000,
+			processingRate: 0.0001,
+			targetSec:      1,
+			readyReplicas:  10,
+			expected:       1_000_000,
+		},
+		{
+			// pending > math.MaxInt32: the pending-cap guard must not wrap to negative.
+			name:           "pendingExceedsMaxInt32",
+			pending:        int64(math.MaxInt32) + 1000,
+			processingRate: 1e9,
+			targetSec:      20,
+			readyReplicas:  1,
+			expected:       1,
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			mv := monoVtxWithScale(tc.targetSec, tc.readyReplicas, tc.currentReplicas)
+			got := s.desiredReplicas(context.Background(), mv, tc.processingRate, tc.pending)
+			assert.Equal(t, tc.expected, got)
+			// Invariant: outside the explicit scale-to-zero case the result is strictly positive (never wrapped negative).
+			if tc.expected != 0 {
+				assert.True(t, got > 0, "desiredReplicas must not return a non-positive value for non-zero expected")
+			}
+		})
+	}
+}