Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/actions/kubernetes-e2e-tests/action.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ runs:
CLUSTER_NAME: ${{ inputs.cluster-name }}
ISTIO_VERSION: ${{ inputs.istio-version }}
TEST_PKG: ./test/e2e/tests
# Doesn't really matter, but using the same as `go build` means we can re-use the same Go cache.
CGO_ENABLED: 0
shell: bash
run: make e2e-test
- name: Archive bug report directory on failure
Expand Down
9 changes: 1 addition & 8 deletions .github/actions/prep-go-runner/action.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,6 @@ runs:
docker system df -v

# https://github.com/actions/runner-images/discussions/3242 github runners are bad at cleanup
echo "Removing large packages"
time sudo apt-get remove -y '^dotnet-.*' '^llvm-.*' 'php.*' '^mongodb-.*' '^mysql-.*' azure-cli google-chrome-stable \
firefox powershell mono-devel libgl1-mesa-dri || true
time sudo apt-get autoremove -y || true
time sudo apt-get clean -y || true
echo "Done removing large packages"
df -h

# Clean up pre-installed tools
# For some reason, GHA often takes minutes (up to 20min observed) to clean up somehow.
Expand Down Expand Up @@ -85,4 +78,4 @@ runs:
- name: Install Dependencies
if: steps.cache.outputs.cache-hit != 'true'
shell: bash
run: make mod-download
run: time make mod-download
5 changes: 5 additions & 0 deletions .github/actions/setup-kind-cluster/action.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ inputs:
required: false
default: "false"
description: Whether to install localstack
agentgateway:
required: false
default: "false"
description: Whether this test only needs agentgateway
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"only needs agentgateway" meaning envoy is disabled?


runs:
using: "composite"
Expand All @@ -44,6 +48,7 @@ runs:
CONFORMANCE_VERSION: ${{ inputs.gateway-api-version }}
CONFORMANCE_CHANNEL: ${{ inputs.gateway-api-channel }}
LOCALSTACK: ${{ inputs.localstack }}
AGENTGATEWAY: ${{ inputs.agentgateway }}
CONFORMANCE: true
run: |
./hack/kind/setup-kind.sh
3 changes: 2 additions & 1 deletion .github/workflows/e2e.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ jobs:
- cluster-name: 'agent-gateway-cluster'
go-test-args: '-timeout=25m'
go-test-run-regex: '^TestAgentgatewayIntegration'
agentgateway: 'true'
# August 29, 2025: ~3 minutes
- cluster-name: 'api-validation'
go-test-args: '-timeout=10m'
Expand Down Expand Up @@ -105,6 +106,7 @@ jobs:
kubectl-version: ${{ steps.dotenv.outputs.kubectl_version }}
istio-version: ${{ steps.dotenv.outputs.istio_version }}
localstack: ${{ matrix.test.localstack }}
agentgateway: ${{ matrix.test.agentgateway }}
- id: run-tests
uses: ./.github/actions/kubernetes-e2e-tests
with:
Expand All @@ -118,4 +120,3 @@ jobs:
run: |
echo "After job disk space:"
df -h
docker system df -v
15 changes: 14 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,8 @@ endif
endif

# Skip -race on e2e. This requires building the codebase twice, and provides no value as the only code executed is test code.
E2E_GO_TEST_ARGS ?= -timeout=25m -cpu=4 -outputdir=$(OUTPUT_DIR)
# Skip -vet; we already run it on the linter step and it's very slow.
E2E_GO_TEST_ARGS ?= -vet=off -timeout=25m -outputdir=$(OUTPUT_DIR)
# Testing flags: https://pkg.go.dev/cmd/go#hdr-Testing_flags
# The default timeout for a suite is 10 minutes, but this can be overridden by setting the -timeout flag. Currently set
# to 25 minutes based on the time it takes to run the longest test setup (kgateway_test).
Expand Down Expand Up @@ -430,15 +431,26 @@ kgateway: $(CONTROLLER_OUTPUT_DIR)/kgateway-linux-$(GOARCH)
$(CONTROLLER_OUTPUT_DIR)/Dockerfile: cmd/kgateway/Dockerfile
cp $< $@

$(CONTROLLER_OUTPUT_DIR)/Dockerfile.agentgateway: cmd/kgateway/Dockerfile.agentgateway
cp $< $@

$(CONTROLLER_OUTPUT_DIR)/.docker-stamp-$(VERSION)-$(GOARCH): $(CONTROLLER_OUTPUT_DIR)/kgateway-linux-$(GOARCH) $(CONTROLLER_OUTPUT_DIR)/Dockerfile
$(BUILDX_BUILD) --load $(PLATFORM) $(CONTROLLER_OUTPUT_DIR) -f $(CONTROLLER_OUTPUT_DIR)/Dockerfile \
--build-arg GOARCH=$(GOARCH) \
--build-arg ENVOY_IMAGE=$(ENVOY_IMAGE) \
-t $(IMAGE_REGISTRY)/$(CONTROLLER_IMAGE_REPO):$(VERSION)
@touch $@

$(CONTROLLER_OUTPUT_DIR)/.docker-stamp-agentgateway-$(VERSION)-$(GOARCH): $(CONTROLLER_OUTPUT_DIR)/kgateway-linux-$(GOARCH) $(CONTROLLER_OUTPUT_DIR)/Dockerfile.agentgateway
$(BUILDX_BUILD) --load $(PLATFORM) $(CONTROLLER_OUTPUT_DIR) -f $(CONTROLLER_OUTPUT_DIR)/Dockerfile.agentgateway \
--build-arg GOARCH=$(GOARCH) \
-t $(IMAGE_REGISTRY)/$(CONTROLLER_IMAGE_REPO):$(VERSION)
@touch $@

.PHONY: kgateway-docker
kgateway-docker: $(CONTROLLER_OUTPUT_DIR)/.docker-stamp-$(VERSION)-$(GOARCH)
.PHONY: kgateway-agentgateway-docker
kgateway-agentgateway-docker: $(CONTROLLER_OUTPUT_DIR)/.docker-stamp-agentgateway-$(VERSION)-$(GOARCH)

#----------------------------------------------------------------------------------
# SDS Server - gRPC server for serving Secret Discovery Service config
Expand Down Expand Up @@ -717,6 +729,7 @@ kind-load-%:
# Depends on: IMAGE_REGISTRY, VERSION, CLUSTER_NAME
# Envoy image may be specified via ENVOY_IMAGE on the command line or at the top of this file
kind-build-and-load-%: %-docker kind-load-% ; ## Use to build specified image and load it into kind
kind-build-and-load-kgateway-agentgateway: kgateway-agentgateway-docker kind-load-kgateway ; ## Use to build specified image and load it into kind

# Update the docker image used by a deployment
# This works for most of our deployments because the deployment name and container name both match
Expand Down
9 changes: 9 additions & 0 deletions cmd/kgateway/Dockerfile.agentgateway
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Controller image variant used when only agentgateway is needed: a static base
# image plus the prebuilt kgateway binary, with nothing else layered on top.
FROM cgr.dev/chainguard/static

# Architecture suffix of the prebuilt binary to copy; override with
# --build-arg GOARCH=<arch>.
ARG GOARCH=amd64

# The binary must already exist in the build context as kgateway-linux-<GOARCH>.
COPY kgateway-linux-$GOARCH /usr/local/bin/kgateway

# Run as a fixed, non-root UID.
USER 10101

ENTRYPOINT ["/usr/local/bin/kgateway"]
37 changes: 22 additions & 15 deletions hack/kind/setup-kind.sh
Original file line number Diff line number Diff line change
Expand Up @@ -56,36 +56,43 @@ function create_kind_cluster_or_skip() {
fi
}

function create_and_setup() {
create_kind_cluster_or_skip

# 5. Apply the Kubernetes Gateway API CRDs
kubectl apply --server-side -f "https://github.com/kubernetes-sigs/gateway-api/releases/download/$CONFORMANCE_VERSION/$CONFORMANCE_CHANNEL-install.yaml"

# 6. Apply the Kubernetes Gateway API Inference Extension CRDs
kubectl apply --kustomize "https://github.com/kubernetes-sigs/gateway-api-inference-extension/config/crd?ref=$GIE_CRD_VERSION"
Comment on lines +62 to +66
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: remove the 5. / 6. numbering


. $SCRIPT_DIR/setup-metalllb-on-kind.sh
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should this be only if CONFORMANCE=true, like before?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

metallb is used for all tests, which all set CONFORMANCE=true. I think it's a legacy thing that is now obsolete.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

in that case it would be good to clean up the CONFORMANCE var if it's not used anymore (can be a follow-up)

}

# 1. Create a kind cluster (or skip creation if a cluster with name=CLUSTER_NAME already exists)
# This config is roughly based on: https://kind.sigs.k8s.io/docs/user/ingress/
create_kind_cluster_or_skip
create_and_setup &
KIND_PID=$!

if [[ $SKIP_DOCKER == 'true' ]]; then
# TODO(tim): refactor the Makefile & CI scripts so we're loading local
# charts to real helm repos, and then we can remove this block.
echo "SKIP_DOCKER=true, not building images or chart"
else
# 2. Make all the docker images and load them to the kind cluster
VERSION=$VERSION CLUSTER_NAME=$CLUSTER_NAME make kind-build-and-load
if [[ $AGENTGATEWAY == 'true' ]]; then
# Skip expensive envoy build
VERSION=$VERSION CLUSTER_NAME=$CLUSTER_NAME make kind-build-and-load-kgateway-agentgateway
else
VERSION=$VERSION CLUSTER_NAME=$CLUSTER_NAME make kind-build-and-load
fi

# 3. Build the test helm chart, ensuring we have a chart in the `_test` folder
VERSION=$VERSION make package-kgateway-charts

fi

# 5. Apply the Kubernetes Gateway API CRDs
kubectl apply --server-side -f "https://github.com/kubernetes-sigs/gateway-api/releases/download/$CONFORMANCE_VERSION/$CONFORMANCE_CHANNEL-install.yaml"

# 6. Apply the Kubernetes Gateway API Inference Extension CRDs
kubectl apply --kustomize "https://github.com/kubernetes-sigs/gateway-api-inference-extension/config/crd?ref=$GIE_CRD_VERSION"

# 7. Conformance test setup
if [[ $CONFORMANCE == "true" ]]; then
echo "Running conformance test setup"
. $SCRIPT_DIR/setup-metalllb-on-kind.sh
fi
wait "$KIND_PID"
Comment on lines +73 to +93
Copy link

Copilot AI Dec 2, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The create_and_setup function is run in the background (line 73) but the script has -e flag set. If the background process fails, the script won't immediately exit due to set -e. While wait "$KIND_PID" on line 93 will eventually check the exit status, the docker build steps (lines 82-90) will continue running even if cluster setup fails. This could lead to confusing error messages where docker build succeeds but fails to load images into a non-existent cluster. Consider adding error handling or checking the cluster is ready before starting the builds.

Copilot uses AI. Check for mistakes.

# 9. Setup localstack
# 7. Setup localstack
if [[ $LOCALSTACK == "true" ]]; then
echo "Setting up localstack"
. $SCRIPT_DIR/setup-localstack.sh
Expand Down
26 changes: 22 additions & 4 deletions test/e2e/test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,12 @@ import (
"io/fs"
"os"
"path/filepath"
"runtime"
"testing"

corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/labels"
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/kgateway-dev/kgateway/v2/pkg/utils/helmutils"
"github.com/kgateway-dev/kgateway/v2/test/e2e/testutils/actions"
"github.com/kgateway-dev/kgateway/v2/test/e2e/testutils/assertions"
Expand Down Expand Up @@ -78,7 +81,9 @@ func CreateTestInstallationForCluster(
// between TestInstallation outputs per CI run
GeneratedFiles: MustGeneratedFiles(installContext.InstallNamespace, clusterContext.Name),
}
runtime.SetFinalizer(installation, func(i *TestInstallation) { i.finalize() })
testutils.Cleanup(t, func() {
installation.finalize()
})
return installation
}

Expand Down Expand Up @@ -159,7 +164,7 @@ func (i *TestInstallation) InstallKgatewayCRDsFromLocalChart(ctx context.Context

// Check if we should skip installation if the release already exists (PERSIST_INSTALL or FAIL_FAST_AND_PERSIST mode)
if testutils.ShouldPersistInstall() || testutils.ShouldFailFastAndPersist() {
if i.Actions.Helm().ReleaseExists(ctx, helmutils.CRDChartName, i.Metadata.InstallNamespace) {
if i.releaseExists(ctx, helmutils.CRDChartName, i.Metadata.InstallNamespace) {
return
}
}
Expand All @@ -185,7 +190,7 @@ func (i *TestInstallation) InstallKgatewayCoreFromLocalChart(ctx context.Context

// Check if we should skip installation if the release already exists (PERSIST_INSTALL or FAIL_FAST_AND_PERSIST mode)
if testutils.ShouldPersistInstall() || testutils.ShouldFailFastAndPersist() {
if i.Actions.Helm().ReleaseExists(ctx, helmutils.ChartName, i.Metadata.InstallNamespace) {
if i.releaseExists(ctx, helmutils.ChartName, i.Metadata.InstallNamespace) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we just update Actions.Helm().ReleaseExists to use the new impl?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The reason I didn't do this is the Helm() action is for running the helm command so it doesn't have access to the cluster client at all. Not sure we want to do that broader change?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm, I think we can handle that in a follow-up if needed

return
}
}
Expand Down Expand Up @@ -309,3 +314,16 @@ func MustGeneratedFiles(tmpDirId, clusterId string) GeneratedFiles {
FailureDir: failureDir,
}
}
// releaseExists reports whether at least one Secret labelled owner=helm and
// name=<releaseName> is present in namespace. It queries the cluster through
// the installation's client rather than shelling out to the helm CLI.
//
// Any error from the list call is treated the same as "no matching Secret":
// the function returns false and the error is not surfaced to the caller.
// NOTE(review): the label keys presumably mirror how Helm labels its release
// Secrets; confirm against the Helm version in use.
func (i *TestInstallation) releaseExists(ctx context.Context, releaseName, namespace string) bool {
	helmLabels := map[string]string{
		"owner": "helm",
		"name":  releaseName,
	}
	opts := &client.ListOptions{
		Namespace:     namespace,
		LabelSelector: labels.SelectorFromSet(helmLabels),
	}

	secrets := &corev1.SecretList{}
	err := i.ClusterContext.Client.List(ctx, secrets, opts)
	if err != nil {
		// Best-effort lookup: a failed query is reported as "not found".
		return false
	}
	return len(secrets.Items) != 0
}
Comment on lines +317 to +329
Copy link

Copilot AI Dec 2, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The new releaseExists implementation queries Kubernetes secrets directly instead of using the Helm CLI. While this may be faster, it's less robust than the original ReleaseExists method because:

  1. It silently returns false on any error (line 325-326), which could mask real cluster connection issues
  2. The label selector assumes Helm v3's storage format, which could break if Helm changes its internal implementation
  3. The original method properly handles namespace scoping and validates the release is actually functional

Consider adding error logging at minimum, or evaluate if the performance gain justifies replacing the official Helm API.

Copilot uses AI. Check for mistakes.
2 changes: 1 addition & 1 deletion test/e2e/tests/base/base_suite.go
Original file line number Diff line number Diff line change
Expand Up @@ -443,7 +443,7 @@ func (s *BaseTestingSuite) ApplyManifests(testCase *TestCase) {
s.TestInstallation.Assertions.EventuallyPodsRunning(s.Ctx, ns, metav1.ListOptions{
LabelSelector: fmt.Sprintf("%s=%s", defaults.WellKnownAppLabel, name),
// Provide a longer timeout as the pod needs to be pulled and pass HCs
}, time.Second*60, time.Second*2)
}, time.Second*60, time.Second)
Copy link

Copilot AI Dec 2, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] The polling interval has been reduced from 2 seconds to 1 second. While this may speed up tests that pass quickly, it doubles the polling frequency which could:

  1. Increase load on the Kubernetes API server during tests
  2. Lead to more flaky tests if the system is under load
  3. Potentially hit rate limits in constrained environments

Consider whether the performance gain (potentially saving ~1 second per assertion) justifies the increased API load and potential for flakiness.

Suggested change
}, time.Second*60, time.Second)
}, time.Second*60, 2*time.Second)

Copilot uses AI. Check for mistakes.
}
}

Expand Down
2 changes: 1 addition & 1 deletion test/e2e/testutils/cluster/istio.go
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ func downloadIstio(version string) (string, error) {

func UninstallIstio(istioctlBinary, kubeContext string) error {
// sh -c "yes | istioctl uninstall --purge --context <kube-context>"
cmd := exec.Command("sh", "-c", fmt.Sprintf("yes | %s uninstall --purge --context %s", istioctlBinary, kubeContext)) //nolint:gosec // G204: controlled istioctl uninstall command in tests
cmd := exec.Command("sh", "-c", fmt.Sprintf("yes | %s uninstall --purge --context '%s'", istioctlBinary, kubeContext)) //nolint:gosec // G204: controlled istioctl uninstall command in tests
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr
if err := cmd.Run(); err != nil {
Expand Down
5 changes: 0 additions & 5 deletions test/e2e/testutils/cluster/kind.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
package cluster

import (
"fmt"
"os"

"k8s.io/apimachinery/pkg/runtime"
Expand Down Expand Up @@ -31,10 +30,6 @@ func MustKindContextWithScheme(clusterName string, scheme *runtime.Scheme) *Cont
}

kubeCtx := os.Getenv(testutils.KubeCtx)
if len(kubeCtx) == 0 {
kubeCtx = fmt.Sprintf("kind-%s", clusterName)
}

restCfg, err := kubeutils.GetRestConfigWithKubeContext(kubeCtx)
if err != nil {
panic(err)
Expand Down
4 changes: 4 additions & 0 deletions test/helpers/kube_dump.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,17 @@ import (
"github.com/kgateway-dev/kgateway/v2/pkg/utils/threadsafe"
kgatewayAdminCli "github.com/kgateway-dev/kgateway/v2/test/controllerutils/admincli"
"github.com/kgateway-dev/kgateway/v2/test/envoyutils/admincli"
"github.com/kgateway-dev/kgateway/v2/test/testutils"
)

// StandardKgatewayDumpOnFail creates a dump of the kubernetes state and certain envoy data from
// the admin interface when a test fails.
// See `KubeDumpOnFail` and `EnvoyDumpOnFail` for more details.
func StandardKgatewayDumpOnFail(outLog io.Writer, kubectlCli *kubectl.Cli, outDir string, namespaces []string) func() {
return func() {
if os.Getenv(testutils.SkipDump) == "true" {
return
}
fmt.Printf("Test failed. Dumping state from %s...\n", strings.Join(namespaces, ", "))

ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
Expand Down
3 changes: 3 additions & 0 deletions test/testutils/env.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ const (
// loop if it litters.
SkipAllTeardown = "SKIP_ALL_TEARDOWN"

// SkipDump, if set to "true", disables dumping Kubernetes and Envoy state when a test fails
SkipDump = "SKIP_DUMP"

// InstallNamespace is the namespace in which kgateway is installed
InstallNamespace = "INSTALL_NAMESPACE"

Expand Down