Inference: Fix EPP Endpoint Sync and Eliminate Races

danehans · danehans · commit fc451d62d58c · 2025-11-04T11:34:52.000-08:00
- Stores endpoints via atomic.Value and adds setEndpoints/getEndpoints to snapshot
  safely without locks.
- Updates Equals to compare endpoint snapshots without locks, fixing race condition
  in krt.Equal/DeepEqual.
- Switches error handling to hasErrors/snapshotErrors/setErrors. The backend path now
  returns an empty ClusterLoadAssignment when errors exist.
- Updates tests to seed errors via setErrors and avoid direct field access.
- Keeps the DP collection returning a Backend IR on empty endpoints and relaxes the
  route pass to allow empty endpoint sets.

Signed-off-by: Daneyon Hansen <daneyon.hansen@solo.io>
diff --git a/internal/kgateway/extensions2/plugins/inferenceextension/endpointpicker/backends.go b/internal/kgateway/extensions2/plugins/inferenceextension/endpointpicker/backends.go
@@ -11,9 +11,7 @@ import (
 	"google.golang.org/protobuf/types/known/anypb"
 	"google.golang.org/protobuf/types/known/structpb"
 	"google.golang.org/protobuf/types/known/wrapperspb"
-	"istio.io/istio/pkg/kube/krt"
 
-	"github.com/kgateway-dev/kgateway/v2/internal/kgateway/krtcollections"
 	"github.com/kgateway-dev/kgateway/v2/internal/kgateway/utils"
 	"github.com/kgateway-dev/kgateway/v2/pkg/pluginsdk/ir"
 )
@@ -22,11 +20,10 @@ func processPoolBackendObjIR(
 	ctx context.Context,
 	in ir.BackendObjectIR,
 	out *envoyclusterv3.Cluster,
-	podIdx krt.Index[string, krtcollections.LocalityPod],
 ) *ir.EndpointsForBackend {
 	// Build an endpoint list
 	irPool := in.ObjIr.(*inferencePool)
-	poolEps := irPool.resolvePoolEndpoints(podIdx)
+	poolEps := irPool.getEndpoints()
 	if len(poolEps) == 0 {
 		logger.Warn("no endpoints resolved for InferencePool",
 			"namespace", irPool.obj.GetNamespace(),
@@ -35,9 +32,10 @@ func processPoolBackendObjIR(
 
 	// If the pool has errors, create an empty LoadAssignment to return a 503
 	if irPool.hasErrors() {
+		errs := irPool.snapshotErrors()
 		logger.Debug("skipping endpoints due to InferencePool errors",
 			"pool", in.ResourceName(),
-			"errors", irPool.errors,
+			"errors", errs,
 		)
 		out.LoadAssignment = &envoyendpointv3.ClusterLoadAssignment{
 			ClusterName: out.Name,
diff --git a/internal/kgateway/extensions2/plugins/inferenceextension/endpointpicker/backends_test.go b/internal/kgateway/extensions2/plugins/inferenceextension/endpointpicker/backends_test.go
@@ -9,16 +9,11 @@ import (
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 	structpb "google.golang.org/protobuf/types/known/structpb"
-	"istio.io/istio/pkg/kube/krt"
-	"istio.io/istio/pkg/kube/krt/krttest"
-	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	inf "sigs.k8s.io/gateway-api-inference-extension/api/v1"
 
-	"github.com/kgateway-dev/kgateway/v2/internal/kgateway/krtcollections"
 	"github.com/kgateway-dev/kgateway/v2/internal/kgateway/wellknown"
 	"github.com/kgateway-dev/kgateway/v2/pkg/pluginsdk/ir"
-	krtpkg "github.com/kgateway-dev/kgateway/v2/pkg/utils/krtutil"
 )
 
 func makeBackendIR(pool *inf.InferencePool) *ir.BackendObjectIR {
@@ -53,34 +48,14 @@ func TestProcessPoolBackendObjIR_BuildsLoadAssignment(t *testing.T) {
 		},
 	}
 
-	// Build a fake Pod and wrap it into a LocalityPod
-	corePod := &corev1.Pod{
-		ObjectMeta: metav1.ObjectMeta{
-			Name:      "pod1",
-			Namespace: "ns",
-			Labels:    map[string]string{"app": "test"},
-		},
-		Status: corev1.PodStatus{PodIP: "10.0.0.1"},
-	}
-	fakeLP := krtcollections.LocalityPod{
-		Named:           krt.NewNamed(corePod),
-		AugmentedLabels: corePod.Labels,
-		Addresses:       []string{corePod.Status.PodIP},
-	}
-
-	// Create a mock and with the LocalityPod collection
-	mock := krttest.NewMock(t, []any{fakeLP})
-	podCol := krttest.GetMockCollection[krtcollections.LocalityPod](mock)
-
-	// Index the pods
-	poolKey := fmt.Sprintf("%s/%s", pool.Namespace, pool.Name)
-	podIdx := krtpkg.UnnamedIndex(podCol, func(p krtcollections.LocalityPod) []string {
-		return []string{poolKey}
-	})
+	// Build the Backend IR and seed endpoints
+	beIR := makeBackendIR(pool)
+	irp := beIR.ObjIr.(*inferencePool)
+	irp.setEndpoints([]endpoint{{address: "10.0.0.1", port: 9000}})
 
 	// Call the code under test
 	cluster := &envoyclusterv3.Cluster{}
-	ret := processPoolBackendObjIR(context.Background(), *makeBackendIR(pool), cluster, podIdx)
+	ret := processPoolBackendObjIR(context.Background(), *beIR, cluster)
 	assert.Nil(t, ret, "Should return nil for a static cluster")
 
 	// Validate the generated LoadAssignment
@@ -119,13 +94,8 @@ func TestProcessPoolBackendObjIR_SkipsOnErrors(t *testing.T) {
 	irp := beIR.ObjIr.(*inferencePool)
 	irp.setErrors([]error{fmt.Errorf("failure injected")})
 
-	// Empty pod index
-	mock := krttest.NewMock(t, []any{})
-	podCol := krttest.GetMockCollection[krtcollections.LocalityPod](mock)
-	podIdx := krtpkg.UnnamedIndex(podCol, func(krtcollections.LocalityPod) []string { return nil })
-
 	cluster := &envoyclusterv3.Cluster{}
-	ret := processPoolBackendObjIR(context.Background(), *beIR, cluster, podIdx)
+	ret := processPoolBackendObjIR(context.Background(), *beIR, cluster)
 	assert.Nil(t, ret)
 
 	cla := cluster.LoadAssignment
diff --git a/internal/kgateway/extensions2/plugins/inferenceextension/endpointpicker/collections.go b/internal/kgateway/extensions2/plugins/inferenceextension/endpointpicker/collections.go
@@ -121,9 +121,8 @@ func initInferencePoolCollections(
 					eps = append(eps, endpoint{address: ip, port: irPool.targetPorts[0].number})
 				}
 			}
-			if len(eps) == 0 {
-				return nil
-			}
+			// Always return a backend IR so the static cluster exists.
+			// Endpoints may be empty on first pass, they'll populate in subsequent passes.
 			irPool.setEndpoints(eps)
 			return buildBackendObjIrFromPool(irPool)
 		},
@@ -135,7 +134,7 @@ func initInferencePoolCollections(
 		backendsDP,
 		func(_ krt.HandlerContext, be ir.BackendObjectIR) *ir.EndpointsForBackend {
 			stub := &envoyclusterv3.Cluster{Name: be.ClusterName()}
-			return processPoolBackendObjIR(ctx, be, stub, podIdx)
+			return processPoolBackendObjIR(ctx, be, stub)
 		},
 	)
 
diff --git a/internal/kgateway/extensions2/plugins/inferenceextension/endpointpicker/ir.go b/internal/kgateway/extensions2/plugins/inferenceextension/endpointpicker/ir.go
@@ -4,7 +4,7 @@ import (
 	"encoding/json"
 	"fmt"
 	"maps"
-	"sync"
+	"sync/atomic"
 	"time"
 
 	"istio.io/istio/pkg/kube/krt"
@@ -27,20 +27,17 @@ type inferencePool struct {
 	// configRef is a reference to the extension configuration. A configRef is typically implemented
 	// as a Kubernetes Service resource.
 	configRef *service
-	// mu is a mutex to protect access to the errors list.
-	mu sync.Mutex
-	// errors is a list of errors that occurred while processing the InferencePool.
-	errors []error
+	// errors that occurred while processing the InferencePool.
+	errorsV    atomic.Value
+	errorCount atomic.Int64
 	// endpoints define the list of endpoints resolved by the podSelector.
-	endpoints []endpoint
+	endpoints atomic.Value
 	// failOpen configures how the proxy handles traffic when the EPP extension is
 	// non-responsive. When set to `false` and the gRPC stream cannot be established, or if
 	// it is closed prematurely with an error, the request will fail. When set to `true` and
 	// the gRPC stream cannot be established, the request is forwarded based on the cluster
 	// load balancing configuration.
 	//
-	// Defaults to `false`.
-	//
 	failOpen bool
 }
 
@@ -67,27 +64,35 @@ func newInferencePool(pool *inf.InferencePool) *inferencePool {
 		ports: []servicePort{port},
 	}
 
-	return &inferencePool{
+	ir := &inferencePool{
 		obj:         pool,
 		podSelector: convertSelector(pool.Spec.Selector.MatchLabels),
 		// InferencePool v1 only supports single port
 		targetPorts: []targetPort{{number: int32(pool.Spec.TargetPorts[0].Number)}},
 		configRef:   svcIR,
-		endpoints:   []endpoint{},
 		failOpen:    isFailOpen(pool),
 	}
+	ir.endpoints.Store([]endpoint(nil))
+	ir.errorsV.Store([]error(nil))
+	ir.errorCount.Store(0)
+
+	return ir
 }
 
 func (ir *inferencePool) setEndpoints(eps []endpoint) {
-	ir.mu.Lock()
-	defer ir.mu.Unlock()
-	ir.endpoints = eps
+	cp := append([]endpoint(nil), eps...)
+	ir.endpoints.Store(cp)
 }
 
 func (ir *inferencePool) getEndpoints() []endpoint {
-	ir.mu.Lock()
-	defer ir.mu.Unlock()
-	return ir.endpoints
+	v := ir.endpoints.Load()
+	if v == nil {
+		return nil
+	}
+	src := v.([]endpoint)
+	out := make([]endpoint, len(src))
+	copy(out, src)
+	return out
 }
 
 // resolvePoolEndpoints returns the slice of <IP:Port> for the given pool
@@ -125,31 +130,34 @@ func (ir *inferencePool) Equals(other any) bool {
 	if !ok {
 		return false
 	}
+
 	// Compare pod selector
 	if !maps.Equal(ir.Selector(), otherPool.Selector()) {
 		return false
 	}
+
 	// Compare error presence (we only need the boolean)
 	if ir.hasErrors() != otherPool.hasErrors() {
 		return false
 	}
-	// Compare endpoint set (order‑insensitive)
-	ir.mu.Lock()
-	otherPool.mu.Lock()
-	defer ir.mu.Unlock()
-	defer otherPool.mu.Unlock()
-	if len(ir.endpoints) != len(otherPool.endpoints) {
+
+	// Snapshot endpoints (avoid holding locks during compare)
+	epsA := ir.getEndpoints()
+	epsB := otherPool.getEndpoints()
+
+	if len(epsA) != len(epsB) {
 		return false
 	}
-	seen := make(map[string]struct{}, len(ir.endpoints))
-	for _, ep := range ir.endpoints {
+	seen := make(map[string]struct{}, len(epsA))
+	for _, ep := range epsA {
 		seen[ep.string()] = struct{}{}
 	}
-	for _, ep := range otherPool.endpoints {
+	for _, ep := range epsB {
 		if _, ok := seen[ep.string()]; !ok {
 			return false
 		}
 	}
+
 	// Compare target port
 	// InferencePool v1 only supports single port
 	if len(ir.targetPorts) != 1 || len(otherPool.targetPorts) != 1 {
@@ -158,6 +166,7 @@ func (ir *inferencePool) Equals(other any) bool {
 	if ir.targetPorts[0].number != otherPool.targetPorts[0].number {
 		return false
 	}
+
 	// Compare object metadata
 	if ir.obj.GetName() != otherPool.obj.GetName() ||
 		ir.obj.GetNamespace() != otherPool.obj.GetNamespace() ||
@@ -166,14 +175,17 @@ func (ir *inferencePool) Equals(other any) bool {
 		ir.obj.GetGeneration() != otherPool.obj.GetGeneration() {
 		return false
 	}
+
 	// Compare configRef
 	if !ir.configRefEquals(otherPool) {
 		return false
 	}
+
 	// Compare failure mode
 	if !ir.failOpenEqual(otherPool) {
 		return false
 	}
+
 	return true
 }
 
@@ -190,25 +202,26 @@ func (ir *inferencePool) configRefEquals(other *inferencePool) bool {
 
 // setErrors atomically replaces p.errors under lock.
 func (ir *inferencePool) setErrors(errs []error) {
-	ir.mu.Lock()
-	defer ir.mu.Unlock()
-	ir.errors = errs
+	cp := append([]error(nil), errs...)
+	ir.errorsV.Store(cp)
+	ir.errorCount.Store(int64(len(cp)))
 }
 
 // snapshotErrors returns a copy of p.errors under lock.
 func (ir *inferencePool) snapshotErrors() []error {
-	ir.mu.Lock()
-	defer ir.mu.Unlock()
-	out := make([]error, len(ir.errors))
-	copy(out, ir.errors)
+	v := ir.errorsV.Load()
+	if v == nil {
+		return nil
+	}
+	src := v.([]error)
+	out := make([]error, len(src))
+	copy(out, src)
 	return out
 }
 
 // hasErrors checks if the inferencePool has any errors.
 func (ir *inferencePool) hasErrors() bool {
-	ir.mu.Lock()
-	defer ir.mu.Unlock()
-	return len(ir.errors) > 0
+	return ir.errorCount.Load() > 0
 }
 
 func (ir *inferencePool) failOpenEqual(other *inferencePool) bool {
diff --git a/internal/kgateway/extensions2/plugins/inferenceextension/endpointpicker/plugin.go b/internal/kgateway/extensions2/plugins/inferenceextension/endpointpicker/plugin.go
@@ -61,7 +61,7 @@ func NewPlugin(ctx context.Context, commonCols *collections.CommonCollections) s
 
 	// Wrap the init function so it can capture commonCols.Pods
 	initBackend := func(ctx context.Context, in ir.BackendObjectIR, out *envoyclusterv3.Cluster) *ir.EndpointsForBackend {
-		return processPoolBackendObjIR(ctx, in, out, p.podIndex)
+		return processPoolBackendObjIR(ctx, in, out)
 	}
 
 	return sdk.Plugin{
@@ -215,11 +215,9 @@ func (p *endpointPickerPass) ApplyForBackend(
 
 	// Ensure we are working with the latest set of endpoints for the pool.
 	eps := irPool.resolvePoolEndpoints(p.podIdx)
-	if len(eps) == 0 {
-		return fmt.Errorf("no endpoints found for InferencePool %s/%s",
-			irPool.obj.GetNamespace(),
-			irPool.obj.GetName())
-	}
+	// If the pool has no endpoints yet, do not fail translation.
+	// Keep the route valid and provide an empty subset hint so the EPP
+	// will return 503 (or honor fail-open) rather than causing a 500.
 	irPool.setEndpoints(eps)
 
 	// Tell the EPP the subset of endpoints to choose from.
diff --git a/internal/kgateway/extensions2/plugins/inferenceextension/endpointpicker/status_test.go b/internal/kgateway/extensions2/plugins/inferenceextension/endpointpicker/status_test.go
@@ -89,7 +89,7 @@ func TestUpdatePoolStatus_NoReferences_NoErrors(t *testing.T) {
 			Namespace: poolNN.Namespace,
 			Name:      poolNN.Name,
 		},
-		ObjIr: &inferencePool{errors: nil},
+		ObjIr: &inferencePool{},
 	}
 
 	// Call the function to update the pool status
@@ -181,7 +181,7 @@ func TestUpdatePoolStatus_WithReference_NoErrors(t *testing.T) {
 			Namespace: poolNN.Namespace,
 			Name:      poolNN.Name,
 		},
-		ObjIr: &inferencePool{errors: nil},
+		ObjIr: &inferencePool{},
 	}
 
 	// Call the function to update the pool status
@@ -289,14 +289,18 @@ func TestUpdatePoolStatus_WithReference_WithErrors(t *testing.T) {
 		ControllerName: controllerName,
 		Routes:         fakeRoutesIndex(col),
 	}
+
+	poolIR := &inferencePool{}
+	poolIR.setErrors([]error{fmt.Errorf("test error")})
+
 	beIR := ir.BackendObjectIR{
 		ObjectSource: ir.ObjectSource{
 			Group:     inf.GroupVersion.Group,
 			Kind:      wellknown.InferencePoolKind,
 			Namespace: poolNN.Namespace,
 			Name:      poolNN.Name,
 		},
-		ObjIr: &inferencePool{errors: []error{fmt.Errorf("test error")}},
+		ObjIr: poolIR,
 	}
 
 	// Call the function to update the pool status with errors
@@ -428,7 +432,7 @@ func TestUpdatePoolStatus_DeleteRoute(t *testing.T) {
 			Namespace: poolNN.Namespace,
 			Name:      poolNN.Name,
 		},
-		ObjIr: &inferencePool{errors: nil},
+		ObjIr: &inferencePool{},
 	}
 
 	// Call the function to update the pool status with the route
@@ -476,7 +480,7 @@ func TestUpdatePoolStatus_WithExtraGws(t *testing.T) {
 			Namespace: ns,
 			Name:      poolName,
 		},
-		ObjIr: &inferencePool{errors: nil},
+		ObjIr: &inferencePool{},
 	}
 
 	// Simulate controller knowing about a parent Gateway even if no HTTPRoute is present

Original file line number	Diff line number	Diff line change
`@@ -121,9 +121,8 @@ func initInferencePoolCollections(`
`121`	`121`	`eps = append(eps, endpoint{address: ip, port: irPool.targetPorts[0].number})`
`122`	`122`	`}`
`123`	`123`	`}`
`124`		`- if len(eps) == 0 {`
`125`		`- return nil`
`126`		`- }`
	`124`	`+ // Always return a backend IR so the static cluster exists.`
	`125`	`+ // Endpoints may be empty on first pass, they'll populate in subsequent passes.`
`127`	`126`	`irPool.setEndpoints(eps)`
`128`	`127`	`return buildBackendObjIrFromPool(irPool)`
`129`	`128`	`},`
`@@ -135,7 +134,7 @@ func initInferencePoolCollections(`
`135`	`134`	`backendsDP,`
`136`	`135`	`func(_ krt.HandlerContext, be ir.BackendObjectIR) *ir.EndpointsForBackend {`
`137`	`136`	`stub := &envoyclusterv3.Cluster{Name: be.ClusterName()}`
`138`		`- return processPoolBackendObjIR(ctx, be, stub, podIdx)`
	`137`	`+ return processPoolBackendObjIR(ctx, be, stub)`
`139`	`138`	`},`
`140`	`139`	`)`
`141`	`140`