From e0751d5af2f1b0dea4422f1c411a40702b4eef1f Mon Sep 17 00:00:00 2001 From: Reed Schalo Date: Mon, 3 Mar 2025 08:16:40 -0800 Subject: [PATCH 1/8] test: add FIS tests for varying degrees of zonal failures --- test/suites/chaos/zonal_failure_test.go | 460 ++++++++++++++++++++++++ 1 file changed, 460 insertions(+) create mode 100644 test/suites/chaos/zonal_failure_test.go diff --git a/test/suites/chaos/zonal_failure_test.go b/test/suites/chaos/zonal_failure_test.go new file mode 100644 index 000000000000..f3bf98c8063f --- /dev/null +++ b/test/suites/chaos/zonal_failure_test.go @@ -0,0 +1,460 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package chaos_test + +import ( + "context" + "fmt" + "time" + + "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/service/ec2" + ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types" + "github.com/aws/aws-sdk-go-v2/service/fis" + fistypes "github.com/aws/aws-sdk-go-v2/service/fis/types" + awsiam "github.com/aws/aws-sdk-go-v2/service/iam" + iamtypes "github.com/aws/aws-sdk-go-v2/service/iam/types" + awserrors "github.com/aws/karpenter-provider-aws/pkg/errors" + "github.com/google/uuid" + "github.com/samber/lo" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "sigs.k8s.io/controller-runtime/pkg/client" + + karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1" + coretest "sigs.k8s.io/karpenter/pkg/test" + + awsenv "github.com/aws/karpenter-provider-aws/test/pkg/environment/aws" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +// FIS role variables are defined at package level to be accessible across tests +var fisRoleName string +var fisRoleArn string + +var _ = Describe("ZonalFailure", func() { + BeforeEach(func() { + setupFISRole(env) + }) + + AfterEach(func() { + cleanupFISRole(env) + }) + + DescribeTable("should recover from AZ failures with different failure rates", + func(failureRate string, appLabel string, description string) { + By(fmt.Sprintf("Creating a multi-AZ deployment with multiple replicas for %s test", description)) + nodePool = coretest.ReplaceRequirements(nodePool, karpv1.NodeSelectorRequirementWithMinValues{ + NodeSelectorRequirement: corev1.NodeSelectorRequirement{ + Key: karpv1.CapacityTypeLabelKey, + Operator: corev1.NodeSelectorOpIn, + Values: []string{karpv1.CapacityTypeSpot}, + }, + }) + nodePool.Spec.Disruption.ConsolidationPolicy = karpv1.ConsolidationPolicyWhenEmptyOrUnderutilized + nodePool.Spec.Disruption.ConsolidateAfter = karpv1.MustParseNillableDuration("30s") + + numPods := 15 + dep := coretest.Deployment(coretest.DeploymentOptions{ + Replicas: int32(numPods), + PodOptions: coretest.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"app": appLabel}, + }, + TerminationGracePeriodSeconds: lo.ToPtr(int64(0)), + TopologySpreadConstraints: []corev1.TopologySpreadConstraint{ + { + MaxSkew: 1, + TopologyKey: "topology.kubernetes.io/zone", + WhenUnsatisfiable: corev1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{"app": appLabel}, + }, + }, + }, + }, + }) + selector := labels.SelectorFromSet(dep.Spec.Selector.MatchLabels) + + // Create a deployment with multiple pods + env.ExpectCreated(nodeClass, nodePool, dep) + + // Wait for all pods to be running + env.EventuallyExpectHealthyPodCount(selector, numPods) + + // Start a context with a timeout for the chaos test + ctx, cancel := context.WithTimeout(env.Context, 15*time.Minute) + defer cancel() + + // Start node count monitor + startNodeCountMonitor(ctx, env.Client) + + // Get current nodes and group them by AZ + nodesByAZ := make(map[string][]*corev1.Node) + nodeList := &corev1.NodeList{} + Expect(env.Client.List(ctx, nodeList, client.HasLabels{coretest.DiscoveryLabel})).To(Succeed()) + + // Process nodes, find AZ with most nodes, and collect instances in a single pass + var targetAZ string + var maxNodes int + var instances []ec2types.Instance + + for i := range nodeList.Items { + node := &nodeList.Items[i] + az := node.Labels[corev1.LabelTopologyZone] + if az == "" { + continue + } + nodesByAZ[az] = append(nodesByAZ[az], node) + if len(nodesByAZ[az]) > maxNodes { + maxNodes = len(nodesByAZ[az]) + targetAZ = az + } + } + + // Ensure we have nodes in multiple AZs + Expect(len(nodesByAZ)).To(BeNumerically(">", 1), "Expected nodes in multiple AZs") + + By(fmt.Sprintf("Simulating %s with %d nodes (%s%% failure rate)", + description, maxNodes, failureRate)) + + // Get EC2 instance information for nodes in the target AZ + for _, node := range nodesByAZ[targetAZ] { + instance := env.GetInstance(node.Name) + instances = append(instances, instance) + } + + // Create FIS experiment to simulate AZ failure with specified failure rate + experiment := createAZFailureExperiment(ctx, env, targetAZ, instances, failureRate) + + // Wait for the experiment to complete + By(fmt.Sprintf("Waiting for the %s experiment to complete", description)) + Eventually(func(g Gomega) { + select { + case <-ctx.Done(): + By("Chaos test timeout reached, skipping experiment status check") + return + default: + exp, err := env.FISAPI.GetExperiment(ctx, &fis.GetExperimentInput{ + Id: experiment.Id, + }) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(exp.Experiment.State.Status).To(Or( + Equal(fistypes.ExperimentStatusCompleted), + Equal(fistypes.ExperimentStatusStopped), + Equal(fistypes.ExperimentStatusFailed), + )) + } + }, 10*time.Minute, 30*time.Second).Should(Succeed()) + + // Verify that the system recovered + By(fmt.Sprintf("Verifying system recovery from %s", description)) + env.EventuallyExpectHealthyPodCountWithTimeout(5*time.Minute, selector, numPods) + + // Clean up + env.ExpectDeleted(dep) + env.ExpectExperimentTemplateDeleted(*experiment.Id) + + Eventually(func(g Gomega) { + // First delete all nodes to trigger proper cleanup + nodeList := &corev1.NodeList{} + g.Expect(env.Client.List(env.Context, nodeList, client.HasLabels{coretest.DiscoveryLabel})).To(Succeed()) + + // Delete each node to trigger proper Kubernetes cleanup + for _, node := range nodeList.Items { + fmt.Printf("Deleting node %s\n", node.Name) + g.Expect(env.Client.Delete(env.Context, &node)).To(Succeed()) + } + + // Wait for nodes to be deleted from Kubernetes + g.Eventually(func() int { + tempNodeList := &corev1.NodeList{} + g.Expect(env.Client.List(env.Context, tempNodeList, client.HasLabels{coretest.DiscoveryLabel})).To(Succeed()) + return len(tempNodeList.Items) + }, 2*time.Minute, 10*time.Second).Should(BeZero()) + + // Now terminate any remaining EC2 instances to ensure complete cleanup + describeInstancesOutput, err := env.EC2API.DescribeInstances(env.Context, &ec2.DescribeInstancesInput{ + Filters: []ec2types.Filter{ + { + Name: aws.String(fmt.Sprintf("tag:%s", coretest.DiscoveryLabel)), + Values: []string{env.ClusterName}, + }, + }, + }) + g.Expect(err).NotTo(HaveOccurred()) + + // Collect all instance IDs + var instanceIDs []string + for _, reservation := range describeInstancesOutput.Reservations { + for _, instance := range reservation.Instances { + if instance.InstanceId != nil { + instanceIDs = append(instanceIDs, *instance.InstanceId) + } + } + } + + // Terminate instances in batches if needed + if len(instanceIDs) > 0 { + fmt.Printf("Terminating %d remaining EC2 instances\n", len(instanceIDs)) + _, err := env.EC2API.TerminateInstances(env.Context, &ec2.TerminateInstancesInput{ + InstanceIds: instanceIDs, + }) + g.Expect(awserrors.IgnoreNotFound(err)).NotTo(HaveOccurred()) + } + }, 5*time.Minute).Should(Succeed()) + }, + Entry("complete failure", "100", "complete-failure-app", "complete AZ failure"), + Entry("moderate failure", "50", "moderate-failure-app", "moderate AZ failures"), + Entry("minor failure", "25", "minor-failure-app", "minor AZ failures"), + ) +}) + +// createAZFailureExperiment creates an AWS FIS experiment with a specific failure percentage +func createAZFailureExperiment(ctx context.Context, env *awsenv.Environment, targetAZ string, instances []ec2types.Instance, failurePercentage string) *fistypes.Experiment { + // Filter instances to only include those in the target AZ + var targetInstances []string + for _, instance := range instances { + if instance.InstanceId != nil && instance.Placement != nil && instance.Placement.AvailabilityZone != nil { + targetInstances = append(targetInstances, *instance.InstanceId) + } + } + + // Create experiment template + template := &fis.CreateExperimentTemplateInput{ + Actions: map[string]fistypes.CreateExperimentTemplateActionInput{ + "stop-instances": { + ActionId: aws.String("aws:ec2:stop-instances"), + Parameters: map[string]string{ + "startInstancesAfterDuration": "PT5M", // Start instances after 5 minutes + }, + Targets: map[string]string{ + "Instances": "target-instances", + }, + }, + "ec2-capacity-error": { + ActionId: aws.String("aws:ec2:api-insufficient-instance-capacity-error"), + Parameters: map[string]string{ + "availabilityZoneIdentifiers": targetAZ, + "duration": "PT5M", // 5 minutes + "percentage": failurePercentage, // Percentage of API calls that will fail + }, + Targets: map[string]string{ + "Roles": "target-roles", + }, + }, + "disrupt-subnet": { + ActionId: aws.String("aws:network:disrupt-connectivity"), + Parameters: map[string]string{ + "duration": "PT5M", // 5 minutes + "scope": "all", + }, + Targets: map[string]string{ + "Subnets": "target-subnets", + }, + }, + }, + Targets: map[string]fistypes.CreateExperimentTemplateTargetInput{ + "target-instances": { + ResourceType: aws.String("aws:ec2:instance"), + SelectionMode: aws.String("all"), + ResourceArns: lo.Map(targetInstances, func(id string, _ int) string { + return fmt.Sprintf("arn:aws:ec2:%s:%s:instance/%s", env.Region, env.ExpectAccountID(), id) + }), + }, + "target-roles": { + ResourceType: aws.String("aws:iam:role"), + SelectionMode: aws.String("all"), + ResourceArns: []string{ + fmt.Sprintf("arn:aws:iam::%s:role/KarpenterNodeRole-%s", env.ExpectAccountID(), env.ClusterName), + }, + }, + "target-subnets": { + ResourceType: aws.String("aws:ec2:subnet"), + SelectionMode: aws.String("all"), + Filters: []fistypes.ExperimentTemplateTargetInputFilter{ + { + Path: aws.String("AvailabilityZone"), + Values: []string{ + targetAZ, + }, + }, + }, + }, + }, + StopConditions: []fistypes.CreateExperimentTemplateStopConditionInput{ + { + Source: aws.String("none"), + }, + }, + RoleArn: aws.String(fisRoleArn), + Description: aws.String(fmt.Sprintf("Simulate AZ failure in %s", targetAZ)), + } + + // Create experiment template + experimentTemplate, err := env.FISAPI.CreateExperimentTemplate(ctx, template) + Expect(err).NotTo(HaveOccurred()) + + // Start experiment + experiment, err := env.FISAPI.StartExperiment(ctx, &fis.StartExperimentInput{ + ExperimentTemplateId: experimentTemplate.ExperimentTemplate.Id, + }) + Expect(err).NotTo(HaveOccurred()) + + return experiment.Experiment +} + +// setupFISRole creates a role for AWS FIS with necessary permissions +func setupFISRole(env *awsenv.Environment) { + // Create a unique role name for this test run to avoid conflicts + uid, err := uuid.NewUUID() + Expect(err).NotTo(HaveOccurred()) + fisRoleName = fmt.Sprintf("KarpenterFISZonalFailureRole-%s", uid.String()) + + // Create the FIS role with necessary permissions + By("Creating FIS role for zonal failure testing") + assumeRolePolicy := `{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Service": "fis.amazonaws.com" + }, + "Action": "sts:AssumeRole" + } + ] + }` + + createRoleOutput, err := env.IAMAPI.CreateRole(env.Context, &awsiam.CreateRoleInput{ + RoleName: aws.String(fisRoleName), + AssumeRolePolicyDocument: aws.String(assumeRolePolicy), + Description: aws.String("Role for Karpenter zonal failure testing with AWS FIS"), + Tags: []iamtypes.Tag{ + { + Key: aws.String(coretest.DiscoveryLabel), + Value: aws.String(env.ClusterName), + }, + }, + }) + Expect(err).NotTo(HaveOccurred()) + fisRoleArn = *createRoleOutput.Role.Arn + + // Create policy with necessary permissions for FIS actions + uid, err = uuid.NewUUID() + Expect(err).NotTo(HaveOccurred()) + policyName := fmt.Sprintf("KarpenterFISZonalFailurePolicy-%s", uid.String()) + policyDocument := `{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "ec2:DescribeInstances", + "ec2:StopInstances", + "ec2:StartInstances" + ], + "Resource": "*" + }, + { + "Effect": "Allow", + "Action": [ + "ec2:CreateTags" + ], + "Resource": "arn:aws:ec2:*:*:instance/*", + "Condition": { + "StringEquals": { + "ec2:CreateAction": "StopInstances" + } + } + }, + { + "Effect": "Allow", + "Action": [ + "network-manager:*", + "ec2:*NetworkInsightsAccessScope*", + "ec2:*NetworkInsightsAnalysis*", + "ec2:*NetworkInsightsPath*" + ], + "Resource": "*" + }, + { + "Effect": "Allow", + "Action": [ + "iam:GetRole", + "iam:ListRoles" + ], + "Resource": "*" + } + ] + }` + + createPolicyOutput, err := env.IAMAPI.CreatePolicy(env.Context, &awsiam.CreatePolicyInput{ + PolicyName: aws.String(policyName), + PolicyDocument: aws.String(policyDocument), + Description: aws.String("Policy for Karpenter zonal failure testing with AWS FIS"), + Tags: []iamtypes.Tag{ + { + Key: aws.String(coretest.DiscoveryLabel), + Value: aws.String(env.ClusterName), + }, + }, + }) + Expect(err).NotTo(HaveOccurred()) + + // Attach policy to role + _, err = env.IAMAPI.AttachRolePolicy(env.Context, &awsiam.AttachRolePolicyInput{ + RoleName: aws.String(fisRoleName), + PolicyArn: createPolicyOutput.Policy.Arn, + }) + Expect(err).NotTo(HaveOccurred()) + + // Wait for role to propagate + time.Sleep(10 * time.Second) +} + +// cleanupFISRole removes the FIS role and associated policies +func cleanupFISRole(env *awsenv.Environment) { + // Clean up the FIS role and policy + By("Cleaning up FIS role and policy") + + // List attached policies + listPoliciesOutput, err := env.IAMAPI.ListAttachedRolePolicies(env.Context, &awsiam.ListAttachedRolePoliciesInput{ + RoleName: aws.String(fisRoleName), + }) + if err == nil { + // Detach and delete policies + for _, policy := range listPoliciesOutput.AttachedPolicies { + _, err = env.IAMAPI.DetachRolePolicy(env.Context, &awsiam.DetachRolePolicyInput{ + RoleName: aws.String(fisRoleName), + PolicyArn: policy.PolicyArn, + }) + if err == nil { + // Delete policy + _, _ = env.IAMAPI.DeletePolicy(env.Context, &awsiam.DeletePolicyInput{ + PolicyArn: policy.PolicyArn, + }) + } + } + } + + // Delete role + _, _ = env.IAMAPI.DeleteRole(env.Context, &awsiam.DeleteRoleInput{ + RoleName: aws.String(fisRoleName), + }) +} From a5aaa071b90dd2bc78295b427f200b4e2a7dd419 Mon Sep 17 00:00:00 2001 From: Reed Schalo Date: Mon, 3 Mar 2025 14:03:20 -0800 Subject: [PATCH 2/8] FIS test fixes --- test/suites/chaos/zonal_failure_test.go | 203 +++++++++++------------- 1 file changed, 95 insertions(+), 108 deletions(-) diff --git a/test/suites/chaos/zonal_failure_test.go b/test/suites/chaos/zonal_failure_test.go index f3bf98c8063f..de08fcb8948e 100644 --- a/test/suites/chaos/zonal_failure_test.go +++ b/test/suites/chaos/zonal_failure_test.go @@ -17,6 +17,7 @@ package chaos_test import ( "context" "fmt" + "strings" "time" "github.com/aws/aws-sdk-go-v2/aws" @@ -26,7 +27,6 @@ import ( fistypes "github.com/aws/aws-sdk-go-v2/service/fis/types" awsiam "github.com/aws/aws-sdk-go-v2/service/iam" iamtypes "github.com/aws/aws-sdk-go-v2/service/iam/types" - awserrors "github.com/aws/karpenter-provider-aws/pkg/errors" "github.com/google/uuid" "github.com/samber/lo" corev1 "k8s.io/api/core/v1" @@ -34,6 +34,8 @@ import ( "k8s.io/apimachinery/pkg/labels" "sigs.k8s.io/controller-runtime/pkg/client" + awserrors "github.com/aws/karpenter-provider-aws/pkg/errors" + karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1" coretest "sigs.k8s.io/karpenter/pkg/test" @@ -47,11 +49,10 @@ import ( var fisRoleName string var fisRoleArn string -var _ = Describe("ZonalFailure", func() { +var _ = FDescribe("ZonalFailure", func() { BeforeEach(func() { setupFISRole(env) }) - AfterEach(func() { cleanupFISRole(env) }) @@ -139,8 +140,13 @@ var _ = Describe("ZonalFailure", func() { instances = append(instances, instance) } - // Create FIS experiment to simulate AZ failure with specified failure rate - experiment := createAZFailureExperiment(ctx, env, targetAZ, instances, failureRate) + // Create the experiment template with the target AZ and instances + By(fmt.Sprintf("Creating experiment template for AZ %s", targetAZ)) + templateId := createExperimentTemplate(ctx, env, targetAZ, instances, failureRate) + + // Start the experiment + By("Starting the experiment") + experiment := startExperiment(ctx, env, templateId) // Wait for the experiment to complete By(fmt.Sprintf("Waiting for the %s experiment to complete", description)) @@ -150,6 +156,13 @@ var _ = Describe("ZonalFailure", func() { By("Chaos test timeout reached, skipping experiment status check") return default: + // Check if pods have already been rescheduled and are healthy + // If so, we can exit early without waiting for experiment completion + pods := env.Monitor.RunningPods(selector) + if len(pods) == numPods { + By("All pods have been successfully rescheduled and are healthy, continuing test") + return + } exp, err := env.FISAPI.GetExperiment(ctx, &fis.GetExperimentInput{ Id: experiment.Id, }) @@ -168,7 +181,7 @@ var _ = Describe("ZonalFailure", func() { // Clean up env.ExpectDeleted(dep) - env.ExpectExperimentTemplateDeleted(*experiment.Id) + env.ExpectExperimentTemplateDeleted(templateId) Eventually(func(g Gomega) { // First delete all nodes to trigger proper cleanup @@ -225,8 +238,8 @@ var _ = Describe("ZonalFailure", func() { ) }) -// createAZFailureExperiment creates an AWS FIS experiment with a specific failure percentage -func createAZFailureExperiment(ctx context.Context, env *awsenv.Environment, targetAZ string, instances []ec2types.Instance, failurePercentage string) *fistypes.Experiment { +// createExperimentTemplate creates an AWS FIS experiment template for AZ failure testing +func createExperimentTemplate(ctx context.Context, env *awsenv.Environment, targetAZ string, instances []ec2types.Instance, failurePercentage string) string { // Filter instances to only include those in the target AZ var targetInstances []string for _, instance := range instances { @@ -235,6 +248,10 @@ func createAZFailureExperiment(ctx context.Context, env *awsenv.Environment, tar } } + // Get subnets in the target AZ + subnetARNs := getSubnetsInAZ(ctx, env, targetAZ) + By(fmt.Sprintf("Found %d subnets in AZ %s", len(subnetARNs), targetAZ)) + // Create experiment template template := &fis.CreateExperimentTemplateInput{ Actions: map[string]fistypes.CreateExperimentTemplateActionInput{ @@ -251,8 +268,8 @@ func createAZFailureExperiment(ctx context.Context, env *awsenv.Environment, tar ActionId: aws.String("aws:ec2:api-insufficient-instance-capacity-error"), Parameters: map[string]string{ "availabilityZoneIdentifiers": targetAZ, - "duration": "PT5M", // 5 minutes - "percentage": failurePercentage, // Percentage of API calls that will fail + "duration": "PT5M", + "percentage": failurePercentage, }, Targets: map[string]string{ "Roles": "target-roles", @@ -261,7 +278,7 @@ func createAZFailureExperiment(ctx context.Context, env *awsenv.Environment, tar "disrupt-subnet": { ActionId: aws.String("aws:network:disrupt-connectivity"), Parameters: map[string]string{ - "duration": "PT5M", // 5 minutes + "duration": "PT5M", "scope": "all", }, Targets: map[string]string{ @@ -272,29 +289,22 @@ func createAZFailureExperiment(ctx context.Context, env *awsenv.Environment, tar Targets: map[string]fistypes.CreateExperimentTemplateTargetInput{ "target-instances": { ResourceType: aws.String("aws:ec2:instance"), - SelectionMode: aws.String("all"), + SelectionMode: aws.String("ALL"), ResourceArns: lo.Map(targetInstances, func(id string, _ int) string { return fmt.Sprintf("arn:aws:ec2:%s:%s:instance/%s", env.Region, env.ExpectAccountID(), id) }), }, "target-roles": { ResourceType: aws.String("aws:iam:role"), - SelectionMode: aws.String("all"), + SelectionMode: aws.String("ALL"), ResourceArns: []string{ fmt.Sprintf("arn:aws:iam::%s:role/KarpenterNodeRole-%s", env.ExpectAccountID(), env.ClusterName), }, }, "target-subnets": { ResourceType: aws.String("aws:ec2:subnet"), - SelectionMode: aws.String("all"), - Filters: []fistypes.ExperimentTemplateTargetInputFilter{ - { - Path: aws.String("AvailabilityZone"), - Values: []string{ - targetAZ, - }, - }, - }, + SelectionMode: aws.String("ALL"), + ResourceArns: subnetARNs, }, }, StopConditions: []fistypes.CreateExperimentTemplateStopConditionInput{ @@ -310,12 +320,15 @@ func createAZFailureExperiment(ctx context.Context, env *awsenv.Environment, tar experimentTemplate, err := env.FISAPI.CreateExperimentTemplate(ctx, template) Expect(err).NotTo(HaveOccurred()) - // Start experiment + return *experimentTemplate.ExperimentTemplate.Id +} + +// startExperiment starts an experiment from the given template and returns the experiment +func startExperiment(ctx context.Context, env *awsenv.Environment, templateId string) *fistypes.Experiment { experiment, err := env.FISAPI.StartExperiment(ctx, &fis.StartExperimentInput{ - ExperimentTemplateId: experimentTemplate.ExperimentTemplate.Id, + ExperimentTemplateId: aws.String(templateId), }) Expect(err).NotTo(HaveOccurred()) - return experiment.Experiment } @@ -324,13 +337,15 @@ func setupFISRole(env *awsenv.Environment) { // Create a unique role name for this test run to avoid conflicts uid, err := uuid.NewUUID() Expect(err).NotTo(HaveOccurred()) - fisRoleName = fmt.Sprintf("KarpenterFISZonalFailureRole-%s", uid.String()) + // Truncate UUID to ensure role name stays under 64 characters + shortUID := uid.String()[:8] + fisRoleName = fmt.Sprintf("Karp-FIS-Role-%s", shortUID) // Create the FIS role with necessary permissions - By("Creating FIS role for zonal failure testing") + assumeRolePolicy := `{ - "Version": "2012-10-17", - "Statement": [ + "Version": "2012-10-17", + "Statement": [ { "Effect": "Allow", "Principal": { @@ -355,72 +370,16 @@ func setupFISRole(env *awsenv.Environment) { Expect(err).NotTo(HaveOccurred()) fisRoleArn = *createRoleOutput.Role.Arn - // Create policy with necessary permissions for FIS actions - uid, err = uuid.NewUUID() - Expect(err).NotTo(HaveOccurred()) - policyName := fmt.Sprintf("KarpenterFISZonalFailurePolicy-%s", uid.String()) - policyDocument := `{ - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Action": [ - "ec2:DescribeInstances", - "ec2:StopInstances", - "ec2:StartInstances" - ], - "Resource": "*" - }, - { - "Effect": "Allow", - "Action": [ - "ec2:CreateTags" - ], - "Resource": "arn:aws:ec2:*:*:instance/*", - "Condition": { - "StringEquals": { - "ec2:CreateAction": "StopInstances" - } - } - }, - { - "Effect": "Allow", - "Action": [ - "network-manager:*", - "ec2:*NetworkInsightsAccessScope*", - "ec2:*NetworkInsightsAnalysis*", - "ec2:*NetworkInsightsPath*" - ], - "Resource": "*" - }, - { - "Effect": "Allow", - "Action": [ - "iam:GetRole", - "iam:ListRoles" - ], - "Resource": "*" - } - ] - }` - - createPolicyOutput, err := env.IAMAPI.CreatePolicy(env.Context, &awsiam.CreatePolicyInput{ - PolicyName: aws.String(policyName), - PolicyDocument: aws.String(policyDocument), - Description: aws.String("Policy for Karpenter zonal failure testing with AWS FIS"), - Tags: []iamtypes.Tag{ - { - Key: aws.String(coretest.DiscoveryLabel), - Value: aws.String(env.ClusterName), - }, - }, + // Attach AWS managed policies for FIS + By("Attaching AWS managed policy AWSFaultInjectionSimulatorEC2Access to FIS role") + _, err = env.IAMAPI.AttachRolePolicy(env.Context, &awsiam.AttachRolePolicyInput{ + RoleName: aws.String(fisRoleName), + PolicyArn: aws.String("arn:aws:iam::aws:policy/service-role/AWSFaultInjectionSimulatorEC2Access"), }) Expect(err).NotTo(HaveOccurred()) - - // Attach policy to role _, err = env.IAMAPI.AttachRolePolicy(env.Context, &awsiam.AttachRolePolicyInput{ RoleName: aws.String(fisRoleName), - PolicyArn: createPolicyOutput.Policy.Arn, + PolicyArn: aws.String("arn:aws:iam::aws:policy/service-role/AWSFaultInjectionSimulatorNetworkAccess"), }) Expect(err).NotTo(HaveOccurred()) @@ -430,31 +389,59 @@ func setupFISRole(env *awsenv.Environment) { // cleanupFISRole removes the FIS role and associated policies func cleanupFISRole(env *awsenv.Environment) { - // Clean up the FIS role and policy - By("Cleaning up FIS role and policy") - - // List attached policies listPoliciesOutput, err := env.IAMAPI.ListAttachedRolePolicies(env.Context, &awsiam.ListAttachedRolePoliciesInput{ RoleName: aws.String(fisRoleName), }) - if err == nil { - // Detach and delete policies - for _, policy := range listPoliciesOutput.AttachedPolicies { - _, err = env.IAMAPI.DetachRolePolicy(env.Context, &awsiam.DetachRolePolicyInput{ - RoleName: aws.String(fisRoleName), + Expect(err).NotTo(HaveOccurred()) + for _, policy := range listPoliciesOutput.AttachedPolicies { + // Detach all policies + _, err = env.IAMAPI.DetachRolePolicy(env.Context, &awsiam.DetachRolePolicyInput{ + RoleName: aws.String(fisRoleName), + PolicyArn: policy.PolicyArn, + }) + Expect(err).NotTo(HaveOccurred()) + + // Only delete custom policies (not AWS managed policies) + if !strings.HasPrefix(*policy.PolicyArn, "arn:aws:iam::aws:policy/") { + _, _ = env.IAMAPI.DeletePolicy(env.Context, &awsiam.DeletePolicyInput{ PolicyArn: policy.PolicyArn, }) - if err == nil { - // Delete policy - _, _ = env.IAMAPI.DeletePolicy(env.Context, &awsiam.DeletePolicyInput{ - PolicyArn: policy.PolicyArn, - }) - } } } - - // Delete role _, _ = env.IAMAPI.DeleteRole(env.Context, &awsiam.DeleteRoleInput{ RoleName: aws.String(fisRoleName), }) } + +// getSubnetsInAZ discovers all subnets in a specific availability zone +func getSubnetsInAZ(ctx context.Context, env *awsenv.Environment, targetAZ string) []string { + // Describe subnets in the target AZ + describeSubnetsOutput, err := env.EC2API.DescribeSubnets(ctx, &ec2.DescribeSubnetsInput{ + Filters: []ec2types.Filter{ + { + Name: aws.String("availability-zone"), + Values: []string{targetAZ}, + }, + { + Name: aws.String("tag:karpenter.sh/discovery"), + Values: []string{env.ClusterName}, + }, + }, + }) + Expect(err).NotTo(HaveOccurred()) + + // Extract subnet ARNs + var subnetARNs []string + for _, subnet := range describeSubnetsOutput.Subnets { + if subnet.SubnetId != nil { + subnetARN := fmt.Sprintf("arn:aws:ec2:%s:%s:subnet/%s", + env.Region, env.ExpectAccountID(), *subnet.SubnetId) + subnetARNs = append(subnetARNs, subnetARN) + } + } + + Expect(len(subnetARNs)).To(BeNumerically(">", 0), + fmt.Sprintf("No subnets found in AZ %s", targetAZ)) + + return subnetARNs +} From 591f25adc7da26f10f0e7977213392c33150e4f2 Mon Sep 17 00:00:00 2001 From: Reed Schalo Date: Tue, 4 Mar 2025 07:24:45 -0800 Subject: [PATCH 3/8] remove focus --- test/suites/chaos/zonal_failure_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/suites/chaos/zonal_failure_test.go b/test/suites/chaos/zonal_failure_test.go index de08fcb8948e..859b591af175 100644 --- a/test/suites/chaos/zonal_failure_test.go +++ b/test/suites/chaos/zonal_failure_test.go @@ -49,7 +49,7 @@ import ( var fisRoleName string var fisRoleArn string -var _ = FDescribe("ZonalFailure", func() { +var _ = Describe("ZonalFailure", func() { BeforeEach(func() { setupFISRole(env) }) From 6efb35310cb650070b3ec40555e4039d8d3a4174 Mon Sep 17 00:00:00 2001 From: Reed Schalo Date: Tue, 4 Mar 2025 07:35:51 -0800 Subject: [PATCH 4/8] prevent confused deputy --- test/suites/chaos/zonal_failure_test.go | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/test/suites/chaos/zonal_failure_test.go b/test/suites/chaos/zonal_failure_test.go index 859b591af175..20b2ae388f67 100644 --- a/test/suites/chaos/zonal_failure_test.go +++ b/test/suites/chaos/zonal_failure_test.go @@ -343,7 +343,7 @@ func setupFISRole(env *awsenv.Environment) { // Create the FIS role with necessary permissions - assumeRolePolicy := `{ + assumeRolePolicy := fmt.Sprintf(`{ "Version": "2012-10-17", "Statement": [ { @@ -351,10 +351,18 @@ func setupFISRole(env *awsenv.Environment) { "Principal": { "Service": "fis.amazonaws.com" }, - "Action": "sts:AssumeRole" + "Action": "sts:AssumeRole", + "Condition": { + "StringEquals": { + "aws:SourceAccount": "%s" + }, + "ArnLike": { + "aws:SourceArn": "arn:aws:fis:%s:%s:experiment/*" + } + } } ] - }` + }`, env.ExpectAccountID(), env.Region, env.ExpectAccountID()) createRoleOutput, err := env.IAMAPI.CreateRole(env.Context, &awsiam.CreateRoleInput{ RoleName: aws.String(fisRoleName), @@ -371,7 +379,6 @@ func setupFISRole(env *awsenv.Environment) { fisRoleArn = *createRoleOutput.Role.Arn // Attach AWS managed policies for FIS - By("Attaching AWS managed policy AWSFaultInjectionSimulatorEC2Access to FIS role") _, err = env.IAMAPI.AttachRolePolicy(env.Context, &awsiam.AttachRolePolicyInput{ RoleName: aws.String(fisRoleName), PolicyArn: aws.String("arn:aws:iam::aws:policy/service-role/AWSFaultInjectionSimulatorEC2Access"), From 1ec4955d771df954c6011807cce2cd08a52ac718 Mon Sep 17 00:00:00 2001 From: Reed Schalo Date: Tue, 4 Mar 2025 10:38:19 -0800 Subject: [PATCH 5/8] address test issues --- test/suites/chaos/suite_test.go | 10 +- test/suites/chaos/zonal_failure_test.go | 151 ++++++++++++++++-------- 2 files changed, 114 insertions(+), 47 deletions(-) diff --git a/test/suites/chaos/suite_test.go b/test/suites/chaos/suite_test.go index 56b1d23c1232..f8fcbbd521d5 100644 --- a/test/suites/chaos/suite_test.go +++ b/test/suites/chaos/suite_test.go @@ -41,6 +41,7 @@ import ( v1 "github.com/aws/karpenter-provider-aws/pkg/apis/v1" "github.com/aws/karpenter-provider-aws/test/pkg/debug" "github.com/aws/karpenter-provider-aws/test/pkg/environment/aws" + "github.com/aws/karpenter-provider-aws/test/pkg/environment/common" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" @@ -66,7 +67,7 @@ var _ = BeforeEach(func() { nodeClass = env.DefaultEC2NodeClass() nodePool = env.DefaultNodePool(nodeClass) }) -var _ = AfterEach(func() { env.Cleanup() }) +var _ = AfterEach(func() { ChaosCleanup(env) }) var _ = AfterEach(func() { env.AfterEach() }) var _ = Describe("Chaos", func() { @@ -227,3 +228,10 @@ func startNodeCountMonitor(ctx context.Context, kubeClient client.Client) { } }() } + +func ChaosCleanup(env *aws.Environment) { + env.CleanupObjects(common.CleanableObjects...) + env.EventuallyExpectNoLeakedKubeNodeLease() + env.ConsistentlyExpectNodeCount(">=", 0, time.Second*5) + env.ExpectActiveKarpenterPod() +} diff --git a/test/suites/chaos/zonal_failure_test.go b/test/suites/chaos/zonal_failure_test.go index 20b2ae388f67..e6b6544eeffd 100644 --- a/test/suites/chaos/zonal_failure_test.go +++ b/test/suites/chaos/zonal_failure_test.go @@ -17,7 +17,6 @@ package chaos_test import ( "context" "fmt" - "strings" "time" "github.com/aws/aws-sdk-go-v2/aws" @@ -48,13 +47,13 @@ import ( // FIS role variables are defined at package level to be accessible across tests var fisRoleName string var fisRoleArn string - +var customPolicyArn string var _ = Describe("ZonalFailure", func() { BeforeEach(func() { - setupFISRole(env) + setupRoles(env) }) AfterEach(func() { - cleanupFISRole(env) + cleanupRoles(env) }) DescribeTable("should recover from AZ failures with different failure rates", @@ -64,7 +63,7 @@ var _ = Describe("ZonalFailure", func() { NodeSelectorRequirement: corev1.NodeSelectorRequirement{ Key: karpv1.CapacityTypeLabelKey, Operator: corev1.NodeSelectorOpIn, - Values: []string{karpv1.CapacityTypeSpot}, + Values: []string{karpv1.CapacityTypeOnDemand}, }, }) nodePool.Spec.Disruption.ConsolidationPolicy = karpv1.ConsolidationPolicyWhenEmptyOrUnderutilized @@ -99,21 +98,19 @@ var _ = Describe("ZonalFailure", func() { env.EventuallyExpectHealthyPodCount(selector, numPods) // Start a context with a timeout for the chaos test - ctx, cancel := context.WithTimeout(env.Context, 15*time.Minute) + ctx, cancel := context.WithCancel(env.Context) defer cancel() // Start node count monitor startNodeCountMonitor(ctx, env.Client) - // Get current nodes and group them by AZ + // find AZ with most nodes nodesByAZ := make(map[string][]*corev1.Node) - nodeList := &corev1.NodeList{} - Expect(env.Client.List(ctx, nodeList, client.HasLabels{coretest.DiscoveryLabel})).To(Succeed()) - - // Process nodes, find AZ with most nodes, and collect instances in a single pass var targetAZ string var maxNodes int var instances []ec2types.Instance + nodeList := &corev1.NodeList{} + Expect(env.Client.List(ctx, nodeList, client.HasLabels{coretest.DiscoveryLabel})).To(Succeed()) for i := range nodeList.Items { node := &nodeList.Items[i] @@ -170,8 +167,8 @@ var _ = Describe("ZonalFailure", func() { g.Expect(exp.Experiment.State.Status).To(Or( Equal(fistypes.ExperimentStatusCompleted), Equal(fistypes.ExperimentStatusStopped), - Equal(fistypes.ExperimentStatusFailed), )) + g.Expect(exp.Experiment.State.Status).To(Not(Equal(fistypes.ExperimentActionStatusFailed))) } }, 10*time.Minute, 30*time.Second).Should(Succeed()) @@ -184,15 +181,16 @@ var _ = Describe("ZonalFailure", func() { env.ExpectExperimentTemplateDeleted(templateId) Eventually(func(g Gomega) { - // First delete all nodes to trigger proper cleanup nodeList := &corev1.NodeList{} g.Expect(env.Client.List(env.Context, nodeList, client.HasLabels{coretest.DiscoveryLabel})).To(Succeed()) - - // Delete each node to trigger proper Kubernetes cleanup for _, node := range nodeList.Items { - fmt.Printf("Deleting node %s\n", node.Name) g.Expect(env.Client.Delete(env.Context, &node)).To(Succeed()) } + nodeClaimList := &karpv1.NodeClaimList{} + g.Expect(env.Client.List(env.Context, nodeClaimList, client.HasLabels{coretest.DiscoveryLabel})).To(Succeed()) + for _, nodeClaim := range nodeClaimList.Items { + g.Expect(env.Client.Delete(env.Context, &nodeClaim)).To(Succeed()) + } // Wait for nodes to be deleted from Kubernetes g.Eventually(func() int { @@ -332,17 +330,42 @@ func startExperiment(ctx context.Context, env *awsenv.Environment, templateId st return experiment.Experiment } -// setupFISRole creates a role for AWS FIS with necessary permissions -func setupFISRole(env *awsenv.Environment) { - // Create a unique role name for this test run to avoid conflicts +// setupRoles creates a role for AWS FIS with necessary permissions +func setupRoles(env *awsenv.Environment) { + // First, check if the FIS service-linked role exists + serviceRoleName := "AWSServiceRoleForFIS" + _, err := env.IAMAPI.GetRole(env.Context, &awsiam.GetRoleInput{ + RoleName: aws.String(serviceRoleName), + }) + + if err != nil { + // If the role doesn't exist, create it + if awserrors.IsNotFound(err) { + By("Creating FIS service-linked role") + _, err = env.IAMAPI.CreateServiceLinkedRole(env.Context, &awsiam.CreateServiceLinkedRoleInput{ + AWSServiceName: aws.String("fis.amazonaws.com"), + Description: aws.String("Service-linked role for AWS Fault Injection Simulator"), + }) + Expect(err).NotTo(HaveOccurred()) + + // Wait for the role to be available + By("Waiting for FIS service-linked role to be available") + Eventually(func() error { + _, err := env.IAMAPI.GetRole(env.Context, &awsiam.GetRoleInput{ + RoleName: aws.String(serviceRoleName), + }) + return err + }, 30*time.Second, 2*time.Second).Should(Succeed()) + } else { + Expect(err).NotTo(HaveOccurred(), "Failed to check for FIS service-linked role") + } + } + + By("Creating FIS experiment role") uid, err := uuid.NewUUID() Expect(err).NotTo(HaveOccurred()) - // Truncate UUID to ensure role name stays under 64 characters shortUID := uid.String()[:8] fisRoleName = fmt.Sprintf("Karp-FIS-Role-%s", shortUID) - - // Create the FIS role with necessary permissions - assumeRolePolicy := fmt.Sprintf(`{ "Version": "2012-10-17", "Statement": [ @@ -390,31 +413,75 @@ func setupFISRole(env *awsenv.Environment) { }) Expect(err).NotTo(HaveOccurred()) + // Create custom policy for API error injection + By("Creating custom policy for API error injection") + policyName := fmt.Sprintf("Karp-FIS-APIError-Policy-%s", shortUID) + policyDocument := fmt.Sprintf(`{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "ec2:InjectApiError" + ], + "Resource": "*", + "Condition": { + "StringEquals": { + "ec2:FisActionId": "aws:ec2:api-insufficient-instance-capacity-error" + }, + "ArnLike": { + "ec2:FisTargetArns": [ + "arn:aws:iam::%s:role/KarpenterNodeRole-%s" + ] + } + } + } + ] + }`, env.ExpectAccountID(), env.ClusterName) + + createPolicyOutput, err := env.IAMAPI.CreatePolicy(env.Context, &awsiam.CreatePolicyInput{ + PolicyName: aws.String(policyName), + PolicyDocument: aws.String(policyDocument), + Description: aws.String("Policy for Karpenter zonal failure API error injection with AWS FIS"), + Tags: []iamtypes.Tag{ + { + Key: aws.String(coretest.DiscoveryLabel), + Value: aws.String(env.ClusterName), + }, + }, + }) + Expect(err).NotTo(HaveOccurred()) + customPolicyArn = *createPolicyOutput.Policy.Arn + // Attach custom API error policy to role + _, err = env.IAMAPI.AttachRolePolicy(env.Context, &awsiam.AttachRolePolicyInput{ + RoleName: aws.String(fisRoleName), + PolicyArn: createPolicyOutput.Policy.Arn, + }) + Expect(err).NotTo(HaveOccurred()) + // Wait for role to propagate time.Sleep(10 * time.Second) } -// cleanupFISRole removes the FIS role and associated policies -func cleanupFISRole(env *awsenv.Environment) { +// cleanupRoles removes the FIS role and associated policies +func cleanupRoles(env *awsenv.Environment) { listPoliciesOutput, err := env.IAMAPI.ListAttachedRolePolicies(env.Context, &awsiam.ListAttachedRolePoliciesInput{ RoleName: aws.String(fisRoleName), }) Expect(err).NotTo(HaveOccurred()) for _, policy := range listPoliciesOutput.AttachedPolicies { - // Detach all policies _, err = env.IAMAPI.DetachRolePolicy(env.Context, &awsiam.DetachRolePolicyInput{ RoleName: aws.String(fisRoleName), PolicyArn: policy.PolicyArn, }) Expect(err).NotTo(HaveOccurred()) - - // Only delete custom policies (not AWS managed policies) - if !strings.HasPrefix(*policy.PolicyArn, "arn:aws:iam::aws:policy/") { - _, _ = env.IAMAPI.DeletePolicy(env.Context, &awsiam.DeletePolicyInput{ - PolicyArn: policy.PolicyArn, - }) - } } + + _, err = env.IAMAPI.DeletePolicy(env.Context, &awsiam.DeletePolicyInput{ + PolicyArn: aws.String(customPolicyArn), + }) + Expect(err).NotTo(HaveOccurred()) + _, _ = env.IAMAPI.DeleteRole(env.Context, &awsiam.DeleteRoleInput{ RoleName: aws.String(fisRoleName), }) @@ -422,7 +489,6 @@ func cleanupFISRole(env *awsenv.Environment) { // getSubnetsInAZ discovers all subnets in a specific availability zone func getSubnetsInAZ(ctx context.Context, env *awsenv.Environment, targetAZ string) []string { - // Describe subnets in the target AZ describeSubnetsOutput, err := env.EC2API.DescribeSubnets(ctx, &ec2.DescribeSubnetsInput{ Filters: []ec2types.Filter{ { @@ -437,18 +503,11 @@ func getSubnetsInAZ(ctx context.Context, env *awsenv.Environment, targetAZ strin }) Expect(err).NotTo(HaveOccurred()) - // Extract subnet ARNs - var subnetARNs []string - for _, subnet := range describeSubnetsOutput.Subnets { - if subnet.SubnetId != nil { - subnetARN := fmt.Sprintf("arn:aws:ec2:%s:%s:subnet/%s", - env.Region, env.ExpectAccountID(), *subnet.SubnetId) - subnetARNs = append(subnetARNs, subnetARN) - } - } - - Expect(len(subnetARNs)).To(BeNumerically(">", 0), + arns := lo.Map(describeSubnetsOutput.Subnets, func(subnet ec2types.Subnet, _ int) string { + return *subnet.SubnetArn + }) + Expect(len(arns)).To(BeNumerically(">", 0), fmt.Sprintf("No subnets found in AZ %s", targetAZ)) - return subnetARNs + return arns } From 577dbfc72820bda63df8ebd84b818df98a18b82e Mon Sep 17 00:00:00 2001 From: Reed Schalo Date: Tue, 4 Mar 2025 10:41:53 -0800 Subject: [PATCH 6/8] reduce comments where unnecessary --- test/suites/chaos/zonal_failure_test.go | 24 +----------------------- 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/test/suites/chaos/zonal_failure_test.go b/test/suites/chaos/zonal_failure_test.go index e6b6544eeffd..fadd78b3d816 100644 --- a/test/suites/chaos/zonal_failure_test.go +++ b/test/suites/chaos/zonal_failure_test.go @@ -44,7 +44,6 @@ import ( . "github.com/onsi/gomega" ) -// FIS role variables are defined at package level to be accessible across tests var fisRoleName string var fisRoleArn string var customPolicyArn string @@ -91,17 +90,12 @@ var _ = Describe("ZonalFailure", func() { }) selector := labels.SelectorFromSet(dep.Spec.Selector.MatchLabels) - // Create a deployment with multiple pods env.ExpectCreated(nodeClass, nodePool, dep) - - // Wait for all pods to be running env.EventuallyExpectHealthyPodCount(selector, numPods) - // Start a context with a timeout for the chaos test ctx, cancel := context.WithCancel(env.Context) defer cancel() - // Start node count monitor startNodeCountMonitor(ctx, env.Client) // find AZ with most nodes @@ -125,27 +119,22 @@ var _ = Describe("ZonalFailure", func() { } } - // Ensure we have nodes in multiple AZs Expect(len(nodesByAZ)).To(BeNumerically(">", 1), "Expected nodes in multiple AZs") By(fmt.Sprintf("Simulating %s with %d nodes (%s%% failure rate)", description, maxNodes, failureRate)) - // Get EC2 instance information for nodes in the target AZ for _, node := range nodesByAZ[targetAZ] { instance := env.GetInstance(node.Name) instances = append(instances, instance) } - // Create the experiment template with the target AZ and instances By(fmt.Sprintf("Creating experiment template for AZ %s", targetAZ)) templateId := createExperimentTemplate(ctx, env, targetAZ, instances, failureRate) - // Start the experiment By("Starting the experiment") experiment := startExperiment(ctx, env, templateId) - // Wait for the experiment to complete By(fmt.Sprintf("Waiting for the %s experiment to complete", description)) Eventually(func(g Gomega) { select { @@ -172,11 +161,10 @@ var _ = Describe("ZonalFailure", func() { } }, 10*time.Minute, 30*time.Second).Should(Succeed()) - // Verify that the system recovered By(fmt.Sprintf("Verifying system recovery from %s", description)) env.EventuallyExpectHealthyPodCountWithTimeout(5*time.Minute, selector, numPods) - // Clean up + By("Cleaning up test resources") env.ExpectDeleted(dep) env.ExpectExperimentTemplateDeleted(templateId) @@ -192,14 +180,12 @@ var _ = Describe("ZonalFailure", func() { g.Expect(env.Client.Delete(env.Context, &nodeClaim)).To(Succeed()) } - // Wait for nodes to be deleted from Kubernetes g.Eventually(func() int { tempNodeList := &corev1.NodeList{} g.Expect(env.Client.List(env.Context, tempNodeList, client.HasLabels{coretest.DiscoveryLabel})).To(Succeed()) return len(tempNodeList.Items) }, 2*time.Minute, 10*time.Second).Should(BeZero()) - // Now terminate any remaining EC2 instances to ensure complete cleanup describeInstancesOutput, err := env.EC2API.DescribeInstances(env.Context, &ec2.DescribeInstancesInput{ Filters: []ec2types.Filter{ { @@ -210,7 +196,6 @@ var _ = Describe("ZonalFailure", func() { }) g.Expect(err).NotTo(HaveOccurred()) - // Collect all instance IDs var instanceIDs []string for _, reservation := range describeInstancesOutput.Reservations { for _, instance := range reservation.Instances { @@ -220,7 +205,6 @@ var _ = Describe("ZonalFailure", func() { } } - // Terminate instances in batches if needed if len(instanceIDs) > 0 { fmt.Printf("Terminating %d remaining EC2 instances\n", len(instanceIDs)) _, err := env.EC2API.TerminateInstances(env.Context, &ec2.TerminateInstancesInput{ @@ -236,7 +220,6 @@ var _ = Describe("ZonalFailure", func() { ) }) -// createExperimentTemplate creates an AWS FIS experiment template for AZ failure testing func createExperimentTemplate(ctx context.Context, env *awsenv.Environment, targetAZ string, instances []ec2types.Instance, failurePercentage string) string { // Filter instances to only include those in the target AZ var targetInstances []string @@ -321,7 +304,6 @@ func createExperimentTemplate(ctx context.Context, env *awsenv.Environment, targ return *experimentTemplate.ExperimentTemplate.Id } -// startExperiment starts an experiment from the given template and returns the experiment func startExperiment(ctx context.Context, env *awsenv.Environment, templateId string) *fistypes.Experiment { experiment, err := env.FISAPI.StartExperiment(ctx, &fis.StartExperimentInput{ ExperimentTemplateId: aws.String(templateId), @@ -330,7 +312,6 @@ func startExperiment(ctx context.Context, env *awsenv.Environment, templateId st return experiment.Experiment } -// setupRoles creates a role for AWS FIS with necessary permissions func setupRoles(env *awsenv.Environment) { // First, check if the FIS service-linked role exists serviceRoleName := "AWSServiceRoleForFIS" @@ -452,7 +433,6 @@ func setupRoles(env *awsenv.Environment) { }) Expect(err).NotTo(HaveOccurred()) customPolicyArn = *createPolicyOutput.Policy.Arn - // Attach custom API error policy to role _, err = env.IAMAPI.AttachRolePolicy(env.Context, &awsiam.AttachRolePolicyInput{ RoleName: aws.String(fisRoleName), PolicyArn: createPolicyOutput.Policy.Arn, @@ -463,7 +443,6 @@ func setupRoles(env *awsenv.Environment) { time.Sleep(10 * time.Second) } -// cleanupRoles removes the FIS role and associated policies func cleanupRoles(env *awsenv.Environment) { listPoliciesOutput, err := env.IAMAPI.ListAttachedRolePolicies(env.Context, &awsiam.ListAttachedRolePoliciesInput{ RoleName: aws.String(fisRoleName), @@ -487,7 +466,6 @@ func cleanupRoles(env *awsenv.Environment) { }) } -// getSubnetsInAZ discovers all subnets in a specific availability zone func getSubnetsInAZ(ctx context.Context, env *awsenv.Environment, targetAZ string) []string { describeSubnetsOutput, err := env.EC2API.DescribeSubnets(ctx, &ec2.DescribeSubnetsInput{ Filters: []ec2types.Filter{ From c7b18bc270d6b9a0a1864d06f4ca567b331dedef Mon Sep 17 00:00:00 2001 From: Reed Schalo Date: Tue, 4 Mar 2025 10:43:03 -0800 Subject: [PATCH 7/8] reduce additional comments --- test/suites/chaos/zonal_failure_test.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/suites/chaos/zonal_failure_test.go b/test/suites/chaos/zonal_failure_test.go index fadd78b3d816..da2712c85901 100644 --- a/test/suites/chaos/zonal_failure_test.go +++ b/test/suites/chaos/zonal_failure_test.go @@ -229,11 +229,9 @@ func createExperimentTemplate(ctx context.Context, env *awsenv.Environment, targ } } - // Get subnets in the target AZ subnetARNs := getSubnetsInAZ(ctx, env, targetAZ) By(fmt.Sprintf("Found %d subnets in AZ %s", len(subnetARNs), targetAZ)) - // Create experiment template template := &fis.CreateExperimentTemplateInput{ Actions: map[string]fistypes.CreateExperimentTemplateActionInput{ "stop-instances": { From a5298716918d3b2c5d35f31ae5d354cb632ba0db Mon Sep 17 00:00:00 2001 From: Reed Schalo Date: Tue, 4 Mar 2025 10:59:06 -0800 Subject: [PATCH 8/8] fix presubmit errors --- test/suites/chaos/zonal_failure_test.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/suites/chaos/zonal_failure_test.go b/test/suites/chaos/zonal_failure_test.go index da2712c85901..e23e5b1f0528 100644 --- a/test/suites/chaos/zonal_failure_test.go +++ b/test/suites/chaos/zonal_failure_test.go @@ -130,10 +130,10 @@ var _ = Describe("ZonalFailure", func() { } By(fmt.Sprintf("Creating experiment template for AZ %s", targetAZ)) - templateId := createExperimentTemplate(ctx, env, targetAZ, instances, failureRate) + templateID := createExperimentTemplate(ctx, env, targetAZ, instances, failureRate) By("Starting the experiment") - experiment := startExperiment(ctx, env, templateId) + experiment := startExperiment(ctx, env, templateID) By(fmt.Sprintf("Waiting for the %s experiment to complete", description)) Eventually(func(g Gomega) { @@ -166,7 +166,7 @@ var _ = Describe("ZonalFailure", func() { By("Cleaning up test resources") env.ExpectDeleted(dep) - env.ExpectExperimentTemplateDeleted(templateId) + env.ExpectExperimentTemplateDeleted(templateID) Eventually(func(g Gomega) { nodeList := &corev1.NodeList{} @@ -302,9 +302,9 @@ func createExperimentTemplate(ctx context.Context, env *awsenv.Environment, targ return *experimentTemplate.ExperimentTemplate.Id } -func startExperiment(ctx context.Context, env *awsenv.Environment, templateId string) *fistypes.Experiment { +func startExperiment(ctx context.Context, env *awsenv.Environment, templateID string) *fistypes.Experiment { experiment, err := env.FISAPI.StartExperiment(ctx, &fis.StartExperimentInput{ - ExperimentTemplateId: aws.String(templateId), + ExperimentTemplateId: aws.String(templateID), }) Expect(err).NotTo(HaveOccurred()) return experiment.Experiment