
Commit e0f08bd

PENT-103-part-1: refactor integration test so it works on clustered org (#336)
1 parent 8eb53d6 commit e0f08bd

8 files changed: +185 −74 lines


.gitignore

+3
@@ -4,3 +4,6 @@ Brewfile.lock.json
 .vscode
 
 dist/
+
+# For all glorious direnv users.
+.envrc

DEVELOPMENT.md

+75 −14
@@ -16,12 +16,72 @@ just --list
 
 # Integration Tests
 
+## Architecture
+
+Agent Stack K8s integration tests depend on a running Buildkite instance. By default, they use the production Buildkite.
+
+```mermaid
+flowchart LR
+    c((Controller)) -->|create jobs| K
+    Buildkite <-->|Pull jobs| c
+    subgraph K8s cluster
+    K(Kube API)
+    end
+```
+
+During a test run, the test suite:
+1. Creates ephemeral pipelines and queues for a given [Buildkite Agent Cluster](https://buildkite.com/docs/clusters/overview).
+2. Runs the controller, which monitors jobs on the target queue in the target Buildkite Cluster and
+   starts new Jobs in a Kubernetes cluster.
+3. Cleans up those ephemeral objects at the end.
+
+To run integration tests locally, we recommend running an individual test. For example:
+
+```bash
+just test -run TestWalkingSkeleton
+```
+
 ## Setup
-For running the integration tests you'll need to add some additional scopes to your Buildkite API token:
+
+Any member of the public should be able to run our integration tests, as long as you are a Buildkite user and have
+access to a Kubernetes cluster.
+
+Concretely, to get the integration tests running locally, you will need:
+1. A valid Buildkite API token (presuming you are a customer of Buildkite).
+2. A valid Buildkite Agent Token for your target Buildkite Cluster. The agent token needs to be installed in your K8s
+   cluster.
+3. Your Buildkite organization name and your target Buildkite Cluster UUID.
+4. Depending on the test cases, you may also need an SSH key; please read below.
+5. Your shell environment will need CLI write access to a k8s cluster (see the quick check after this list).
+
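For the last requirement, here is a quick way to check your current kubeconfig context before going further. These are only suggested verification commands, not part of the repo's tooling; adjust the namespace to wherever you intend the controller to schedule jobs:

```bash
# Should print "yes" if your current context can create the Jobs the controller schedules.
kubectl auth can-i create jobs --namespace default
# The setup steps below also create secrets.
kubectl auth can-i create secrets --namespace default
```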
+### Use environment variables
+
+We find it convenient to supply the API token, organization name, and cluster UUID as environment variables.
+
+```bash
+export BUILDKITE_TOKEN="bkua_**************"
+export ORG="your-cool-org-slug"
+export CLUSTER_UUID="UUID-UUID-UUID-UUID"
+```
+
+### Token Scopes
+
+Required Buildkite API token scopes (a quick way to verify them follows this list):
 
 - `read_artifacts`
 - `read_build_logs`
 - `write_pipelines`
+- `write_clusters`
+
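To double-check which scopes your token actually carries, the Buildkite REST API exposes them on the access-token endpoint. This is a suggested verification step only; it assumes `BUILDKITE_TOKEN` is exported as above and that `jq` is installed:

```bash
# Prints the scopes attached to the token in $BUILDKITE_TOKEN.
curl -sS -H "Authorization: Bearer $BUILDKITE_TOKEN" \
  https://api.buildkite.com/v2/access-token | jq .scopes
```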
+### Install Agent Token
+
+The agent token is used by the k8s jobs rather than by the controller, so install it into your cluster as a secret:
+
+```bash
+kubectl create secret generic buildkite-agent-token --from-literal=BUILDKITE_AGENT_TOKEN=my-agent-token
+```
+
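To confirm the secret landed in the namespace the controller will use (the `default` namespace is assumed here):

```bash
kubectl get secret buildkite-agent-token -o name
```

If you use a different secret name, you will presumably need to point the controller at it via the `agent-token-secret` config field (see `internal/controller/config/config.go` below).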
+### SSH secret
 
 You'll also need to create an SSH secret in your cluster to run [this test pipeline](internal/integration/fixtures/secretref.yaml). This SSH key needs to be associated with your GitHub account to be able to clone this public repo, and must be in a form acceptable to OpenSSH (aka `BEGIN OPENSSH PRIVATE KEY`, not `BEGIN PRIVATE KEY`).
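A minimal sketch of creating that secret, assuming the fixture expects a secret named `agent-stack-k8s` (the name checked by `TestSSHRepoClone` in the test diff below). The key name inside the secret is a placeholder here; check [fixtures/secretref.yaml](internal/integration/fixtures/secretref.yaml) for the exact key it references:

```bash
# SSH_PRIVATE_RSA_KEY is a hypothetical key name; use whatever secretref.yaml expects.
kubectl create secret generic agent-stack-k8s \
  --from-file=SSH_PRIVATE_RSA_KEY="$HOME/.ssh/id_rsa"
```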

@@ -34,13 +94,16 @@ The integration tests on the [`kubernetes-agent-stack`](https://buildkite.com/bu
 
 
 ## Cleanup
-These will be deleted automatically for successful tests, but for unsuccessful tests, then will remain after then end of the test job to allow you to debug them.
-However, this means they should be cleaned up manually. To do this run
+
+In general, pipelines and queues will be deleted automatically for successful tests, but for unsuccessful tests they will remain after the end of the test job to allow you to debug them.
+
+To clean them up:
+
 ```bash
-CLEANUP_PIPELINES=true just cleanup-orphans --org=buildkite-kubernetes-stack --buildkite-token=<buildkite-api-token>
+just cleanup-orphans
 ```
 
-The token will need to have graphql access as well as:
+The token will need to have GraphQL access as well as:
 - `read_artifacts`
 - `write_pipelines`
 
@@ -50,19 +113,17 @@ To clean these out you should run the following in a kubernetes context in the n
 kubectl get -o jsonpath='{.items[*].metadata.name}' jobs | xargs -L1 kubectl delete job
 ```
 
-At the time of writing, the CI pipeline is run in an EKS cluster, `agent-stack-k8s-ci` in the `buildkite-agent` AWS account.
-The controller is deployed to the `buildkite` namespace in that cluster.
-See https://docs.aws.amazon.com/eks/latest/userguide/create-kubeconfig.html for how to obtain a kubeconfig for an EKS cluster.
+## CI ❤️ Integration Test
 
-# Run from source
+At the time of writing, the CI pipeline runs in an EKS cluster, `agent-stack-k8s-ci`, in the `buildkite-agent` AWS account.
+CI deploys the controller into the `buildkite` namespace in that cluster.
 
-First store the agent token in a Kubernetes secret:
+# Run from source
 
-```bash!
-kubectl create secret generic buildkite-agent-token --from-literal=BUILDKITE_AGENT_TOKEN=my-agent-token
-```
+Running from source can be useful for debugging; you will generally need to meet the same requirements as for
+running an integration test.
 
-Next start the controller:
+In this case, you can choose to supply some inputs via CLI parameters instead of environment variables.
 
 ```bash!
 just run --org my-org --buildkite-token my-api-token --debug
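# For a clustered org you will likely also want to pass the cluster UUID.
# The next line is a sketch only: it assumes a --cluster-uuid flag mirroring the
# cluster-uuid config field added in internal/controller/config/config.go below
# (flag name not verified here), and it reuses the env vars exported earlier.
just run --org "$ORG" --buildkite-token "$BUILDKITE_TOKEN" --cluster-uuid "$CLUSTER_UUID" --debug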

internal/controller/config/config.go

+13 −11
@@ -22,17 +22,19 @@ var DefaultAgentImage = "ghcr.io/buildkite/agent:" + version.Version()
 // mapstructure (the module) supports switching the struct tag to "json", viper does not. So we have
 // to have the `mapstructure` tag for viper and the `json` tag is used by the mapstructure!
 type Config struct {
 Debug bool `json:"debug"`
 JobTTL time.Duration `json:"job-ttl"`
 PollInterval time.Duration `json:"poll-interval"`
 AgentTokenSecret string `json:"agent-token-secret" validate:"required"`
 BuildkiteToken string `json:"buildkite-token" validate:"required"`
 Image string `json:"image" validate:"required"`
 MaxInFlight int `json:"max-in-flight" validate:"min=0"`
 Namespace string `json:"namespace" validate:"required"`
 Org string `json:"org" validate:"required"`
 Tags stringSlice `json:"tags" validate:"min=1"`
 ProfilerAddress string `json:"profiler-address" validate:"omitempty,hostname_port"`
+// This field is mandatory for most new orgs.
+// Some old orgs allow an unclustered setup.
 ClusterUUID string `json:"cluster-uuid" validate:"omitempty"`
 AdditionalRedactedVars stringSlice `json:"additional-redacted-vars" validate:"omitempty"`
 PodSpecPatch *corev1.PodSpec `json:"pod-spec-patch" validate:"omitempty"`

internal/integration/integration_test.go

+19 −38
@@ -22,8 +22,7 @@ func TestWalkingSkeleton(t *testing.T) {
 GraphQL: api.NewClient(cfg.BuildkiteToken),
 }.Init()
 ctx := context.Background()
-pipelineID, cleanup := tc.CreatePipeline(ctx)
-t.Cleanup(cleanup)
+pipelineID := tc.PrepareQueueAndPipelineWithCleanup(ctx)
 tc.StartController(ctx, cfg)
 build := tc.TriggerBuild(ctx, pipelineID)
 tc.AssertSuccess(ctx, build)
@@ -44,8 +43,7 @@ func TestPodSpecPatchInStep(t *testing.T) {
 GraphQL: api.NewClient(cfg.BuildkiteToken),
 }.Init()
 ctx := context.Background()
-pipelineID, cleanup := tc.CreatePipeline(ctx)
-t.Cleanup(cleanup)
+pipelineID := tc.PrepareQueueAndPipelineWithCleanup(ctx)
 tc.StartController(ctx, cfg)
 build := tc.TriggerBuild(ctx, pipelineID)
 
@@ -62,8 +60,7 @@ func TestPodSpecPatchInStepFailsWhenPatchingContainerCommands(t *testing.T) {
 }.Init()
 
 ctx := context.Background()
-pipelineID, cleanup := tc.CreatePipeline(ctx)
-t.Cleanup(cleanup)
+pipelineID := tc.PrepareQueueAndPipelineWithCleanup(ctx)
 
 tc.StartController(ctx, cfg)
 build := tc.TriggerBuild(ctx, pipelineID)
@@ -80,8 +77,7 @@ func TestPodSpecPatchInController(t *testing.T) {
 GraphQL: api.NewClient(cfg.BuildkiteToken),
 }.Init()
 ctx := context.Background()
-pipelineID, cleanup := tc.CreatePipeline(ctx)
-t.Cleanup(cleanup)
+pipelineID := tc.PrepareQueueAndPipelineWithCleanup(ctx)
 cfg := cfg
 cfg.PodSpecPatch = &corev1.PodSpec{
 Containers: []corev1.Container{
@@ -113,8 +109,7 @@ func TestControllerPicksUpJobsWithSubsetOfAgentTags(t *testing.T) {
 }.Init()
 
 ctx := context.Background()
-pipelineID, cleanup := tc.CreatePipeline(ctx)
-t.Cleanup(cleanup)
+pipelineID := tc.PrepareQueueAndPipelineWithCleanup(ctx)
 
 cfg := cfg
 cfg.Tags = append(cfg.Tags, "foo=bar") // job has queue=<something>, agent has queue=<something> and foo=bar
@@ -133,8 +128,7 @@ func TestControllerSetsAdditionalRedactedVars(t *testing.T) {
 }.Init()
 
 ctx := context.Background()
-pipelineID, cleanup := tc.CreatePipeline(ctx)
-t.Cleanup(cleanup)
+pipelineID := tc.PrepareQueueAndPipelineWithCleanup(ctx)
 
 cfg := cfg
 cfg.AdditionalRedactedVars = []string{"ELEVEN_HERBS_AND_SPICES"}
@@ -157,8 +151,7 @@ func TestPrePostCheckoutHooksRun(t *testing.T) {
 }.Init()
 
 ctx := context.Background()
-pipelineID, cleanup := tc.CreatePipeline(ctx)
-t.Cleanup(cleanup)
+pipelineID := tc.PrepareQueueAndPipelineWithCleanup(ctx)
 
 tc.StartController(ctx, cfg)
 build := tc.TriggerBuild(ctx, pipelineID)
@@ -176,8 +169,7 @@ func TestChown(t *testing.T) {
 GraphQL: api.NewClient(cfg.BuildkiteToken),
 }.Init()
 ctx := context.Background()
-pipelineID, cleanup := tc.CreatePipeline(ctx)
-t.Cleanup(cleanup)
+pipelineID := tc.PrepareQueueAndPipelineWithCleanup(ctx)
 tc.StartController(ctx, cfg)
 build := tc.TriggerBuild(ctx, pipelineID)
 tc.AssertSuccess(ctx, build)
@@ -198,8 +190,7 @@ func TestSSHRepoClone(t *testing.T) {
 Get(ctx, "agent-stack-k8s", metav1.GetOptions{})
 require.NoError(t, err, "agent-stack-k8s secret must exist")
 
-pipelineID, cleanup := tc.CreatePipeline(ctx)
-t.Cleanup(cleanup)
+pipelineID := tc.PrepareQueueAndPipelineWithCleanup(ctx)
 tc.StartController(ctx, cfg)
 build := tc.TriggerBuild(ctx, pipelineID)
 tc.AssertSuccess(ctx, build)
@@ -215,8 +206,7 @@ func TestPluginCloneFailsTests(t *testing.T) {
 
 ctx := context.Background()
 
-pipelineID, cleanup := tc.CreatePipeline(ctx)
-t.Cleanup(cleanup)
+pipelineID := tc.PrepareQueueAndPipelineWithCleanup(ctx)
 tc.StartController(ctx, cfg)
 build := tc.TriggerBuild(ctx, pipelineID)
 tc.AssertFail(ctx, build)
@@ -232,8 +222,7 @@ func TestMaxInFlightLimited(t *testing.T) {
 
 ctx := context.Background()
 
-pipelineID, cleanup := tc.CreatePipeline(ctx)
-t.Cleanup(cleanup)
+pipelineID := tc.PrepareQueueAndPipelineWithCleanup(ctx)
 cfg := cfg
 cfg.MaxInFlight = 1
 tc.StartController(ctx, cfg)
@@ -271,8 +260,7 @@ func TestMaxInFlightUnlimited(t *testing.T) {
 
 ctx := context.Background()
 
-pipelineID, cleanup := tc.CreatePipeline(ctx)
-t.Cleanup(cleanup)
+pipelineID := tc.PrepareQueueAndPipelineWithCleanup(ctx)
 cfg := cfg
 cfg.MaxInFlight = 0
 tc.StartController(ctx, cfg)
@@ -315,8 +303,7 @@ func TestSidecars(t *testing.T) {
 GraphQL: api.NewClient(cfg.BuildkiteToken),
 }.Init()
 ctx := context.Background()
-pipelineID, cleanup := tc.CreatePipeline(ctx)
-t.Cleanup(cleanup)
+pipelineID := tc.PrepareQueueAndPipelineWithCleanup(ctx)
 tc.StartController(ctx, cfg)
 build := tc.TriggerBuild(ctx, pipelineID)
 tc.AssertSuccess(ctx, build)
@@ -331,8 +318,7 @@ func TestExtraVolumeMounts(t *testing.T) {
 GraphQL: api.NewClient(cfg.BuildkiteToken),
 }.Init()
 ctx := context.Background()
-pipelineID, cleanup := tc.CreatePipeline(ctx)
-t.Cleanup(cleanup)
+pipelineID := tc.PrepareQueueAndPipelineWithCleanup(ctx)
 tc.StartController(ctx, cfg)
 build := tc.TriggerBuild(ctx, pipelineID)
 tc.AssertSuccess(ctx, build)
@@ -346,8 +332,7 @@ func TestInvalidPodSpec(t *testing.T) {
 GraphQL: api.NewClient(cfg.BuildkiteToken),
 }.Init()
 ctx := context.Background()
-pipelineID, cleanup := tc.CreatePipeline(ctx)
-t.Cleanup(cleanup)
+pipelineID := tc.PrepareQueueAndPipelineWithCleanup(ctx)
 tc.StartController(ctx, cfg)
 build := tc.TriggerBuild(ctx, pipelineID)
 tc.AssertFail(ctx, build)
@@ -365,8 +350,7 @@ func TestInvalidPodJSON(t *testing.T) {
 GraphQL: api.NewClient(cfg.BuildkiteToken),
 }.Init()
 ctx := context.Background()
-pipelineID, cleanup := tc.CreatePipeline(ctx)
-t.Cleanup(cleanup)
+pipelineID := tc.PrepareQueueAndPipelineWithCleanup(ctx)
 tc.StartController(ctx, cfg)
 build := tc.TriggerBuild(ctx, pipelineID)
 tc.AssertFail(ctx, build)
@@ -384,8 +368,7 @@ func TestEnvVariables(t *testing.T) {
 GraphQL: api.NewClient(cfg.BuildkiteToken),
 }.Init()
 ctx := context.Background()
-pipelineID, cleanup := tc.CreatePipeline(ctx)
-t.Cleanup(cleanup)
+pipelineID := tc.PrepareQueueAndPipelineWithCleanup(ctx)
 tc.StartController(ctx, cfg)
 build := tc.TriggerBuild(ctx, pipelineID)
 tc.AssertSuccess(ctx, build)
@@ -400,8 +383,7 @@ func TestImagePullBackOffCancelled(t *testing.T) {
 GraphQL: api.NewClient(cfg.BuildkiteToken),
 }.Init()
 ctx := context.Background()
-pipelineID, cleanup := tc.CreatePipeline(ctx)
-t.Cleanup(cleanup)
+pipelineID := tc.PrepareQueueAndPipelineWithCleanup(ctx)
 tc.StartController(ctx, cfg)
 build := tc.TriggerBuild(ctx, pipelineID)
 tc.AssertFail(ctx, build)
@@ -416,8 +398,7 @@ func TestArtifactsUploadFailedJobs(t *testing.T) {
 GraphQL: api.NewClient(cfg.BuildkiteToken),
 }.Init()
 ctx := context.Background()
-pipelineID, cleanup := tc.CreatePipeline(ctx)
-t.Cleanup(cleanup)
+pipelineID := tc.PrepareQueueAndPipelineWithCleanup(ctx)
 tc.StartController(ctx, cfg)
 build := tc.TriggerBuild(ctx, pipelineID)
 tc.AssertFail(ctx, build)
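Every test above now goes through the shared `PrepareQueueAndPipelineWithCleanup` helper, so exercising any single one of them against a clustered org only needs the environment described in `DEVELOPMENT.md` above; the exported values below are placeholders:

```bash
export BUILDKITE_TOKEN="bkua_**************"
export ORG="your-cool-org-slug"
export CLUSTER_UUID="UUID-UUID-UUID-UUID"
just test -run TestWalkingSkeleton
```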

internal/integration/interrupt_test.go

+1
@@ -40,6 +40,7 @@ func CleanupOnInterrupt(cleanup func()) {
 
 // EnsureCleanup will run the provided cleanup function when the test ends,
 // either via t.Cleanup or on interrupt via CleanupOnInterrupt.
+// Note that this can't cover the test-timeout case.
 func EnsureCleanup(t *testing.T, cleanup func()) {
 t.Cleanup(cleanup)
 CleanupOnInterrupt(cleanup)

internal/integration/main_test.go

+5 −3
@@ -20,9 +20,11 @@ const (
 )
 
 var (
 branch string
 cfg config.Config
 cleanupPipelines bool
+// Preserve pipelines even if the test passes.
+// By default, failed pipelines will always be kept.
 preservePipelines bool
 
 //go:embed fixtures/*
