Merge pull request #36 from cultureamp/feat/add-configurable-timeout

liamstevens · web-flow · commit fe8b89113be6 · 2025-02-17T10:09:28.000+10:00
feat: add configurable timeout
diff --git a/README.md b/README.md
@@ -8,15 +8,23 @@ Add the following lines to your `pipeline.yml`:
 steps:
   - plugins:
       - cultureamp/ecs-task-runner#v0.0.0:
-          parameter-name: "/my-cool-service/farm-name"
-          command: "./run-my-fully-siq-migrations"
+          parameter-name: "test-parameter"
+          command: "/bin/migrate"
+          timeout: 900
 ```
 
 ## Configuration
 
-### `message` (Required, string)
+### `parameter-name` (Required, string)
+The name or ARN of the parameter in Parameter Store that contains the task definition.
 
-The message to annotate onto the build.
+### `command` (Optional, string)
+The command to run in the task. When omitted, the task runs the command specified in the Parameter Store parameter named by `parameter-name`.
+
+### `timeout` (Optional, integer)
+The number of seconds the plugin will wait for the task to complete. If the task does not complete within this time, the plugin will fail, but the task itself is not stopped and will continue to run in the background.
+
+Default: 2700
 
 ## Usage
 This plugin is based on an existing pattern in `murmur` where database migrations are run as a task on ECS. To provide additional context for how this plugin is expected to be used, this is the expected pattern:
@@ -35,7 +43,7 @@ This plugin comes with some assumed infrastructure that needs to be deployed bef
 - An IAM role for the BK agent to start the task
 - A Parameter Store parameter extending the task definition by providing entrypoint overrides and networking configuration
 - A log group for the task
-- A security group for your service (this can be the [base-infrastructure-for-services](https://github.com/cultureamp/base-infrastructure-for-services) source security group
+- A security group for your service (this can be the [base-infrastructure-for-services](https://github.com/cultureamp/base-infrastructure-for-services) source security group)
 
 This can be visualised below:
 ![The overall flow of this plugin and AWS resources](docs/images/diagram.svg)
diff --git a/plugin.yml b/plugin.yml
@@ -9,7 +9,9 @@ configuration:
       type: string
     command:
       type: string
+    timeout:
+      type: integer
   additionalProperties: false
   anyOf:
     - required:
-        - parameter-name
+        - parameter-name
diff --git a/src/aws/ecs.go b/src/aws/ecs.go
@@ -19,7 +19,7 @@ type EcsClientAPI interface {
 	DescribeTaskDefinition(ctx context.Context, params *ecs.DescribeTaskDefinitionInput, optFns ...func(*ecs.Options)) (*ecs.DescribeTaskDefinitionOutput, error)
 }
 
-type ecsWaiterAPI interface {
+type EcsWaiterAPI interface {
 	WaitForOutput(ctx context.Context, params *ecs.DescribeTasksInput, maxWaitDur time.Duration, optFns ...func(*ecs.TasksStoppedWaiterOptions)) (*ecs.DescribeTasksOutput, error)
 }
 
@@ -56,11 +56,10 @@ func SubmitTask(ctx context.Context, ecsAPI EcsClientAPI, input *TaskRunnerConfi
 	return *response.Tasks[0].TaskArn, nil
 }
 
-func WaitForCompletion(ctx context.Context, waiter ecsWaiterAPI, taskArn string) (*ecs.DescribeTasksOutput, error) {
+func WaitForCompletion(ctx context.Context, waiter EcsWaiterAPI, taskArn string, timeOut int) (*ecs.DescribeTasksOutput, error) {
 	cluster := ClusterFromTaskArn(taskArn)
 
-	// TODO: This magic number will be resolved in a future piece of work, not going to refactor this right now
-	maxWaitDuration := 15 * time.Minute //nolint:mnd
+	maxWaitDuration := time.Duration(timeOut) * time.Second
 	result, err := waiter.WaitForOutput(ctx, &ecs.DescribeTasksInput{
 		Cluster: aws.String(cluster),
 		Tasks:   []string{taskArn},
diff --git a/src/aws/ecs_test.go b/src/aws/ecs_test.go
@@ -388,16 +388,23 @@ func TestFindLogStreamFromTaskNegative(t *testing.T) {
 // to allow thing to finish in the background. The return value is used only for when a task fails, and we push
 // this to a log.
 func TestWaitForCompletion(t *testing.T) {
-	mockedWaiter := mockECSWaiter{
-		mockWaitForOutput: func(context.Context, *ecs.DescribeTasksInput, time.Duration, ...func(*ecs.TasksStoppedWaiterOptions)) (*ecs.DescribeTasksOutput, error) {
-			return &ecs.DescribeTasksOutput{
-				Failures: []types.Failure{
-					{
-						Arn:    aws.String("arn:aws:ecs:us-west-2:123456789012:task/test-cluster/07cc583696bd44e0be450bff7314ddaf"),
-						Detail: aws.String("task stopped"),
-						Reason: aws.String("computer is full of beanz"),
-					},
-				}}, errors.New("task stopped: computer is full of beanz")
+	mockedWaiter := map[string]mockECSWaiter{
+		"beans": {
+			mockWaitForOutput: func(context.Context, *ecs.DescribeTasksInput, time.Duration, ...func(*ecs.TasksStoppedWaiterOptions)) (*ecs.DescribeTasksOutput, error) {
+				return &ecs.DescribeTasksOutput{
+					Failures: []types.Failure{
+						{
+							Arn:    aws.String("arn:aws:ecs:us-west-2:123456789012:task/test-cluster/07cc583696bd44e0be450bff7314ddaf"),
+							Detail: aws.String("task stopped"),
+							Reason: aws.String("computer is full of beanz"),
+						},
+					}}, nil
+			},
+		},
+		"slowpoke": {
+			mockWaitForOutput: func(context.Context, *ecs.DescribeTasksInput, time.Duration, ...func(*ecs.TasksStoppedWaiterOptions)) (*ecs.DescribeTasksOutput, error) {
+				return nil, errors.New("task timed out: computer still thinking")
+			},
 		},
 	}
 
@@ -410,35 +417,41 @@ func TestWaitForCompletion(t *testing.T) {
 	tests := []struct {
 		name     string
 		input    string
-		waiter   ecsWaiterAPI
+		waiter   EcsWaiterAPI
 		expected expectedReturn
 	}{
 		{
 			name:   "given a task ARN, it should return the task details",
 			input:  "arn:aws:ecs:us-west-2:123456789012:task/test-cluster/07cc583696bd44e0be450bff7314ddaf",
-			waiter: mockedWaiter,
+			waiter: mockedWaiter["beans"],
 			expected: expectedReturn{&ecs.DescribeTasksOutput{
 				Failures: []types.Failure{
 					{
 						Arn:    aws.String("arn:aws:ecs:us-west-2:123456789012:task/test-cluster/07cc583696bd44e0be450bff7314ddaf"),
 						Detail: aws.String("task stopped"),
 						Reason: aws.String("computer is full of beanz"),
 					},
-				}}, errors.New("task stopped: computer is full of beanz"),
+				}}, nil,
 			},
 		},
+		{
+			name:     "given a task that times out, it should return an error",
+			input:    "arn:aws:ecs:us-west-2:123456789012:task/test-cluster/07cc583696bd44e0be450bff7314ddaf",
+			waiter:   mockedWaiter["slowpoke"],
+			expected: expectedReturn{nil, errors.New("task timed out: computer still thinking")},
+		},
 	}
 
 	for _, tc := range tests {
 		t.Run(tc.name, func(t *testing.T) {
-			result, err := WaitForCompletion(context.TODO(), tc.waiter, tc.input)
-			t.Logf("result: '%v'", err)
-			t.Logf("expected: detail: %v, reason: %v", *tc.expected.Failures[0].Detail, *tc.expected.Failures[0].Reason)
-
-			// The function is most-useful when the underlying task fails. i.e. no news is good news in a real-world scenario
-			// So, we will test the failure cases
-			require.Error(t, err)
-			assert.Equal(t, tc.expected.Failures[0], result.Failures[0])
+			result, err := WaitForCompletion(context.TODO(), tc.waiter, tc.input, 15)
+			t.Logf("name: %s result: '%v'", tc.name, err)
+			// Errors are only returned when the waiter times out
+			if err != nil {
+				require.Equal(t, tc.expected.Error(), err.Error())
+			} else {
+				require.Equal(t, tc.expected.Failures, result.Failures)
+			}
 		})
 	}
 }
diff --git a/src/buildkite/agent.go b/src/buildkite/agent.go
@@ -11,10 +11,14 @@ import (
 	osexec "golang.org/x/sys/execabs"
 )
 
+type AgentAPI interface { // AgentAPI is the subset of Buildkite agent behaviour that the plugin depends on.
+	Annotate(ctx context.Context, message string, style string, annotationContext string) error // Annotate adds message to the build as an annotation with the given style and annotation context.
+}
+
 type Agent struct {
 }
 
-func (a *Agent) Annotate(ctx context.Context, message string, style string, annotationContext string) error {
+func (a Agent) Annotate(ctx context.Context, message string, style string, annotationContext string) error {
 	return execCmd(ctx, "buildkite-agent", &message, "annotate", "--style", style, "--context", annotationContext)
 }
 
diff --git a/src/main.go b/src/main.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"os"
 
+	awsinternal "github.com/cultureamp/ecs-task-runner-buildkite-plugin/aws"
 	"github.com/cultureamp/ecs-task-runner-buildkite-plugin/buildkite"
 	"github.com/cultureamp/ecs-task-runner-buildkite-plugin/plugin"
 )
@@ -13,7 +14,7 @@ func main() {
 	fetcher := plugin.EnvironmentConfigFetcher{}
 	taskRunnerPlugin := plugin.TaskRunnerPlugin{}
 
-	err := taskRunnerPlugin.Run(ctx, fetcher)
+	err := taskRunnerPlugin.Run(ctx, fetcher, awsinternal.WaitForCompletion)
 
 	if err != nil {
 		buildkite.LogFailuref("plugin execution failed: %s\n", err.Error())
diff --git a/src/plugin/config.go b/src/plugin/config.go
@@ -7,6 +7,7 @@ import (
 type Config struct { // Config holds the plugin settings; it is populated through ConfigFetcher.Fetch.
 	ParameterName string `required:"true"  split_words:"true"`
 	Command       string `required:"false" split_words:"true"`
+	TimeOut       int    `default:"2700"   split_words:"true"` // Seconds to wait for the task. NOTE(review): the tests set this via ..._TIME_OUT, but the plugin key `timeout` presumably reaches the process as ..._TIMEOUT; confirm a configured value is actually picked up.
 }
 
 type EnvironmentConfigFetcher struct {
diff --git a/src/plugin/config_test.go b/src/plugin/config_test.go
@@ -25,6 +25,7 @@ func TestFailOnMissingRequiredEnvironment(t *testing.T) {
 			disabledEnvVars: []string{
 				"BUILDKITE_PLUGIN_ECS_TASK_RUNNER_PARAMETER_NAME",
 				"BUILDKITE_PLUGIN_ECS_TASK_RUNNER_COMMAND",
+				"BUILDKITE_PLUGIN_ECS_TASK_RUNNER_TIMEOUT",
 			},
 			enabledEnvVars: map[string]string{},
 			expectedErr:    "required key BUILDKITE_PLUGIN_ECS_TASK_RUNNER_PARAMETER_NAME missing value",
@@ -100,18 +101,27 @@ func TestSucceedOnMissingOptionalEnvironment(t *testing.T) {
 func TestFetchConfigFromEnvironment(t *testing.T) {
 	unsetEnv(t, "BUILDKITE_PLUGIN_ECS_TASK_RUNNER_PARAMETER_NAME")
 	unsetEnv(t, "BUILDKITE_PLUGIN_ECS_TASK_RUNNER_COMMAND")
+	unsetEnv(t, "BUILDKITE_PLUGIN_ECS_TASK_RUNNER_TIME_OUT")
 
 	var config plugin.Config
 	fetcher := plugin.EnvironmentConfigFetcher{}
 
 	t.Setenv("BUILDKITE_PLUGIN_ECS_TASK_RUNNER_PARAMETER_NAME", "test-parameter")
 	t.Setenv("BUILDKITE_PLUGIN_ECS_TASK_RUNNER_COMMAND", "hello-world")
+	t.Setenv("BUILDKITE_PLUGIN_ECS_TASK_RUNNER_TIME_OUT", "600")
 
 	err := fetcher.Fetch(&config)
 
 	require.NoError(t, err, "fetch should not error")
 	assert.Equal(t, "test-parameter", config.ParameterName, "fetched message should match environment")
-	assert.Equal(t, "hello-world", config.Command, "fetched message should match environment")
+	assert.Equal(t, "hello-world", config.Command, "fetched script should match environment")
+	assert.Equal(t, 600, config.TimeOut, "fetched timeout should match environment")
+
+	// test default value
+	unsetEnv(t, "BUILDKITE_PLUGIN_ECS_TASK_RUNNER_TIME_OUT")
+	err = fetcher.Fetch(&config)
+	require.NoError(t, err, "fetch should not error")
+	assert.Equal(t, 2700, config.TimeOut, "fetched timeout should match environment")
 }
 
 func unsetEnv(t *testing.T, key string) {
diff --git a/src/plugin/task-runner.go b/src/plugin/task-runner.go
@@ -2,6 +2,7 @@ package plugin
 
 import (
 	"context"
+	"errors"
 	"fmt"
 	"strings"
 	"time"
@@ -18,16 +19,19 @@ import (
 type TaskRunnerPlugin struct {
 }
 
+type WaitForCompletion func(ctx context.Context, waiter awsinternal.EcsWaiterAPI, taskArn string, timeOut int) (*ecs.DescribeTasksOutput, error) // WaitForCompletion is the signature of the wait step passed into Run; main supplies awsinternal.WaitForCompletion.
 type ConfigFetcher interface {
 	Fetch(config *Config) error
 }
 
-func (trp TaskRunnerPlugin) Run(ctx context.Context, fetcher ConfigFetcher) error {
+func (trp TaskRunnerPlugin) Run(ctx context.Context, fetcher ConfigFetcher, waiter WaitForCompletion) error {
 	var config Config
+
 	err := fetcher.Fetch(&config)
 	if err != nil {
 		return fmt.Errorf("plugin configuration error: %w", err)
 	}
+	buildKiteAgent := buildkite.Agent{}
 
 	buildkite.Log("Executing task-runner plugin\n")
 
@@ -61,10 +65,12 @@ func (trp TaskRunnerPlugin) Run(ctx context.Context, fetcher ConfigFetcher) erro
 		// TODO: This is currently a magic number. If we want this to be configurable, remove the nolint directive and fix it up
 		o.MaxDelay = 10 * time.Second //nolint:mnd
 	})
-	result, err := awsinternal.WaitForCompletion(ctx, waiterClient, taskArn)
+	result, err := waiter(ctx, waiterClient, taskArn, config.TimeOut)
+	err = trp.HandleResults(ctx, result, err, buildKiteAgent, config)
 	if err != nil {
-		return fmt.Errorf("failed to wait for task completion: %w\nFailure information: %v", err, result.Failures[0])
+		return fmt.Errorf("failed to handle task results: %w", err)
 	}
+
 	// In a successful scenario for task completion, we would have a `tasks` slice with a single element
 	task := result.Tasks[0]
 	taskLogDetails, err := awsinternal.FindLogStreamFromTask(ctx, ecsClient, task)
@@ -104,3 +110,33 @@ func (trp TaskRunnerPlugin) Run(ctx context.Context, fetcher ConfigFetcher) erro
 	buildkite.Log("done. \n")
 	return nil
 }
+
+// HandleResults inspects the outcome of waiting for the ECS task, annotates the
+// Buildkite build when something went wrong, and returns an error for every
+// unsuccessful outcome so the caller never carries on with an unusable result.
+//
+// output and err are the values returned by the waiter. bkAgent is used to
+// annotate the build, and config supplies the timeout quoted in the timeout
+// annotation. A nil return means the wait succeeded and reported no failures.
+func (trp TaskRunnerPlugin) HandleResults(ctx context.Context, output *ecs.DescribeTasksOutput, err error, bkAgent buildkite.AgentAPI, config Config) error {
+	if err != nil {
+		// This comparison is hacky, but matching on the message text is the only way found so far
+		// to recognise the timeout among the wrapped errors surfaced by the AWS library.
+		// It would be better if this was done using errors.As.
+		if strings.Contains(err.Error(), "exceeded max wait time for TasksStopped waiter") {
+			bkerr := bkAgent.Annotate(ctx, fmt.Sprintf("Task did not complete successfully within timeout (%d seconds)", config.TimeOut), "error", "ecs-task-runner")
+			if bkerr != nil {
+				return fmt.Errorf("failed to annotate buildkite with task timeout failure: %w", bkerr)
+			}
+			return errors.New("task did not complete within the time limit")
+		}
+		bkerr := bkAgent.Annotate(ctx, fmt.Sprintf("failed to wait for task completion: %v\n", err), "error", "ecs-task-runner")
+		if bkerr != nil {
+			return fmt.Errorf("failed to annotate buildkite with task wait failure: %w, annotation error: %w", err, bkerr)
+		}
+		// The wait itself failed, so the error must still be returned after a successful
+		// annotation; returning nil here would let the caller use a result that may be nil.
+		return fmt.Errorf("failed to wait for task completion: %w", err)
+	}
+
+	// Guard against a waiter that reports success without any output, which would
+	// otherwise cause a nil dereference below and in the caller.
+	if output == nil {
+		return errors.New("waiter returned no task details")
+	}
+
+	if len(output.Failures) > 0 {
+		// There is still a scenario where the task could return failures but this isn't handled by the waiter
+		// This is due to the waiter only returning errors in scenarios where there are issues querying the task
+		// or scheduling the task. For a list of the Failures that can be returned in this case, see:
+		// https://docs.aws.amazon.com/AmazonECS/latest/developerguide/api_failures_messages.html
+		// specifically, under the `DescribeTasks` API.
+		bkerr := bkAgent.Annotate(ctx, fmt.Sprintf("Task did not complete successfully: %v", output.Failures[0]), "error", "ecs-task-runner")
+		if bkerr != nil {
+			return fmt.Errorf("failed to annotate buildkite with task failure: %w", bkerr)
+		}
+		return fmt.Errorf("task did not complete successfully: %v", output.Failures[0])
+	}
+	return nil
+}
diff --git a/src/plugin/task-runner_test.go b/src/plugin/task-runner_test.go

Original file line number	Diff line number	Diff line change
`@@ -19,7 +19,7 @@ type EcsClientAPI interface {`
`19`	`19`	`DescribeTaskDefinition(ctx context.Context, params ecs.DescribeTaskDefinitionInput, optFns ...func(ecs.Options)) (*ecs.DescribeTaskDefinitionOutput, error)`
`20`	`20`	`}`
`21`	`21`
`22`		`-type ecsWaiterAPI interface {`
	`22`	`+type EcsWaiterAPI interface {`
`23`	`23`	`WaitForOutput(ctx context.Context, params ecs.DescribeTasksInput, maxWaitDur time.Duration, optFns ...func(ecs.TasksStoppedWaiterOptions)) (*ecs.DescribeTasksOutput, error)`
`24`	`24`	`}`
`25`	`25`
`@@ -56,11 +56,10 @@ func SubmitTask(ctx context.Context, ecsAPI EcsClientAPI, input *TaskRunnerConfi`
`56`	`56`	`return *response.Tasks[0].TaskArn, nil`
`57`	`57`	`}`
`58`	`58`
`59`		`-func WaitForCompletion(ctx context.Context, waiter ecsWaiterAPI, taskArn string) (*ecs.DescribeTasksOutput, error) {`
	`59`	`+func WaitForCompletion(ctx context.Context, waiter EcsWaiterAPI, taskArn string, timeOut int) (*ecs.DescribeTasksOutput, error) {`
`60`	`60`	`cluster := ClusterFromTaskArn(taskArn)`
`61`	`61`
`62`		`- // TODO: This magic number will be resolved in a future piece of work, not going to refactor this right now`
`63`		`- maxWaitDuration := 15 * time.Minute //nolint:mnd`
	`62`	`+ maxWaitDuration := time.Duration(timeOut) * time.Second`
`64`	`63`	`result, err := waiter.WaitForOutput(ctx, &ecs.DescribeTasksInput{`
`65`	`64`	`Cluster: aws.String(cluster),`
`66`	`65`	`Tasks: []string{taskArn},`
Original file line number	Diff line number	Diff line change
`@@ -11,10 +11,14 @@ import (`
`11`	`11`	`osexec "golang.org/x/sys/execabs"`
`12`	`12`	`)`
`13`	`13`
	`14`	`+type AgentAPI interface {`
	`15`	`+ Annotate(ctx context.Context, message string, style string, annotationContext string) error`
	`16`	`+}`
	`17`	`+`
`14`	`18`	`type Agent struct {`
`15`	`19`	`}`
`16`	`20`
`17`		`-func (a *Agent) Annotate(ctx context.Context, message string, style string, annotationContext string) error {`
	`21`	`+func (a Agent) Annotate(ctx context.Context, message string, style string, annotationContext string) error {`
`18`	`22`	`return execCmd(ctx, "buildkite-agent", &message, "annotate", "--style", style, "--context", annotationContext)`
`19`	`23`	`}`
`20`	`24`
Original file line number	Diff line number	Diff line change
`@@ -7,6 +7,7 @@ import (`
`7`	`7`	`type Config struct {`
`8`	`8`	ParameterName string `required:"true" split_words:"true"`
`9`	`9`	Command string `required:"false" split_words:"true"`
	`10`	+ TimeOut int `default:"2700" split_words:"true"`
`10`	`11`	`}`
`11`	`12`
`12`	`13`	`type EnvironmentConfigFetcher struct {`