cloudposse · aknysh · Mar 26, 2026 · Mar 4, 2026 · Mar 5, 2026 · Mar 24, 2026
@@ -110,24 +110,17 @@ func executeAuthExecCommandCore(cmd *cobra.Command, args []string) error {
 
 	// Prepare shell environment with file-based credentials.
 	// Start with current OS environment + global env from atmos.yaml and let PrepareShellEnvironment configure auth.
+	// PrepareShellEnvironment sanitizes the env (removes IRSA/credential vars) and adds auth vars.
 	baseEnv := envpkg.MergeGlobalEnv(os.Environ(), atmosConfig.Env)
 	envList, err := authManager.PrepareShellEnvironment(ctx, identityName, baseEnv)
 	if err != nil {
 		return fmt.Errorf("failed to prepare command environment: %w", err)
 	}
 
-	// Convert environment list to map for executeCommandWithEnv.
-	envMap := make(map[string]string)
-	for _, envVar := range envList {
-		if idx := strings.IndexByte(envVar, '='); idx >= 0 {
-			key := envVar[:idx]
-			value := envVar[idx+1:]
-			envMap[key] = value
-		}
-	}
-
-	// Execute the command with authentication environment.
-	err = executeCommandWithEnv(commandArgs, envMap)
+	// Execute the command with the sanitized environment directly.
+	// The envList already includes os.Environ() (sanitized) + auth vars,
+	// so we pass it as the complete subprocess environment.
+	err = executeCommandWithEnv(commandArgs, envList)
 	if err != nil {
 		// For any subprocess error, provide a tip about refreshing credentials.
 		// This helps users when AWS tokens are expired or invalid.
@@ -137,29 +130,25 @@ func executeAuthExecCommandCore(cmd *cobra.Command, args []string) error {
 	return nil
 }
 
-// executeCommandWithEnv executes a command with additional environment variables.
-func executeCommandWithEnv(args []string, envVars map[string]string) error {
+// executeCommandWithEnv executes a command with a complete environment.
+// The env parameter should be a fully prepared environment (e.g., from PrepareShellEnvironment).
+// It is used directly as the subprocess environment without re-reading os.Environ().
+func executeCommandWithEnv(args []string, env []string) error {
 	if len(args) == 0 {
 		return fmt.Errorf(errUtils.ErrWrapFormat, errUtils.ErrNoCommandSpecified, errUtils.ErrInvalidSubcommand)
 	}
 
-	// Prepare the command
+	// Prepare the command.
 	cmdName := args[0]
 	cmdArgs := args[1:]
 
-	// Look for the command in PATH
+	// Look for the command in PATH.
 	cmdPath, err := exec.LookPath(cmdName)
 	if err != nil {
 		return fmt.Errorf(errUtils.ErrWrapFormat, errUtils.ErrCommandNotFound, err)
 	}
 
-	// Prepare environment variables
-	env := os.Environ()
-	for key, value := range envVars {
-		env = append(env, fmt.Sprintf("%s=%s", key, value))
-	}
-
-	// Execute the command
+	// Execute the command with the provided environment directly.
 	execCmd := exec.Command(cmdPath, cmdArgs...)
 	execCmd.Env = env
 	execCmd.Stdin = os.Stdin

@@ -3,6 +3,7 @@ package cmd
 import (
 	"bytes"
 	"errors"
+	"os"
 	"runtime"
 	"testing"
 
@@ -218,52 +219,57 @@ func TestExtractIdentityFlag(t *testing.T) {
 }
 
 func TestExecuteCommandWithEnv(t *testing.T) {
+	// Use the test binary itself as a cross-platform subprocess helper.
+	// TestMain in testing_main_test.go handles _ATMOS_TEST_EXIT_ONE.
+	exePath, err := os.Executable()
+	require.NoError(t, err, "os.Executable() must succeed")
+
+	// Prerequisite: verify that env vars reach the child process.
+	t.Run("env propagation to subprocess", func(t *testing.T) {
+		// Running the test binary with -test.run=^$ matches no tests and exits 0,
+		// confirming the subprocess receives the provided environment.
+		err := executeCommandWithEnv(
+			[]string{exePath, "-test.run=^$"},
+			[]string{"TEST_VAR=test-value"},
+		)
+		assert.NoError(t, err)
+	})
+
 	// Test the command execution helper directly.
 	tests := []struct {
 		name          string
 		args          []string
-		envVars       map[string]string
-		skipOnWindows bool
+		envVars       []string
 		expectedError string
-		expectedCode  int // Expected exit code if error is ExitCodeError
+		expectedCode  int // Expected exit code if error is ExitCodeError.
 	}{
 		{
 			name:          "empty args",
 			args:          []string{},
-			envVars:       map[string]string{},
+			envVars:       []string{},
 			expectedError: "no command specified",
 		},
 		{
-			name: "simple echo command",
-			args: []string{"echo", "hello"},
-			envVars: map[string]string{
-				"TEST_VAR": "test-value",
-			},
-			skipOnWindows: true,
+			name:    "successful command",
+			args:    []string{exePath, "-test.run=^$"},
+			envVars: []string{"TEST_VAR=test-value"},
 		},
 		{
 			name:          "nonexistent command",
 			args:          []string{"nonexistent-command-xyz"},
-			envVars:       map[string]string{},
+			envVars:       []string{},
 			expectedError: "command not found",
 		},
 		{
-			name: "command with non-zero exit code",
-			args: []string{"sh", "-c", "exit 2"},
-			envVars: map[string]string{
-				"TEST_VAR": "test-value",
-			},
-			skipOnWindows: true,
-			expectedCode:  2,
+			name:         "command with non-zero exit code",
+			args:         []string{exePath, "-test.run=^$"},
+			envVars:      []string{"_ATMOS_TEST_EXIT_ONE=1"},
+			expectedCode: 1,
 		},
 	}
 
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
-			if tt.skipOnWindows && runtime.GOOS == "windows" {
-				t.Skipf("Skipping test on Windows: command behaves differently")
-			}
-
 			err := executeCommandWithEnv(tt.args, tt.envVars)
 
 			switch {
@@ -274,7 +280,7 @@ func TestExecuteCommandWithEnv(t *testing.T) {
 				}
 			case tt.expectedCode != 0:
 				assert.Error(t, err)
-				// Check that it's an ExitCodeError with the correct code
+				// Check that it's an ExitCodeError with the correct code.
 				var exitCodeErr errUtils.ExitCodeError
 				if assert.True(t, errors.As(err, &exitCodeErr), "error should be ExitCodeError") {
 					assert.Equal(t, tt.expectedCode, exitCodeErr.Code)

@@ -115,6 +115,7 @@ func executeAuthShellCommandCore(cmd *cobra.Command, args []string) error {
 
 	// Prepare shell environment with file-based credentials.
 	// Start with current OS environment and let PrepareShellEnvironment configure auth.
+	// PrepareShellEnvironment sanitizes the env (removes IRSA/credential vars) and adds auth vars.
 	envList, err := authManager.PrepareShellEnvironment(ctx, identityName, os.Environ())
 	if err != nil {
 		return fmt.Errorf("failed to prepare shell environment: %w", err)
@@ -129,18 +130,9 @@ func executeAuthShellCommandCore(cmd *cobra.Command, args []string) error {
 	// Get provider name from the identity to display in shell messages.
 	providerName := authManager.GetProviderForIdentity(identityName)
 
-	// Execute the shell with authentication environment.
-	// ExecAuthShellCommand expects env vars as a map, so convert the list.
-	envMap := make(map[string]string)
-	for _, envVar := range envList {
-		if idx := strings.IndexByte(envVar, '='); idx >= 0 {
-			key := envVar[:idx]
-			value := envVar[idx+1:]
-			envMap[key] = value
-		}
-	}
-
-	return exec.ExecAuthShellCommand(atmosConfigPtr, identityName, providerName, envMap, shell, shellArgs)
+	// Execute the shell with the sanitized environment directly.
+	// envList already includes os.Environ() (sanitized) + auth vars.
+	return exec.ExecAuthShellCommand(atmosConfigPtr, identityName, providerName, envList, shell, shellArgs)
 }
 
 // extractAuthShellFlags extracts --identity and --shell flags from args and returns the remaining shell args.

@@ -8,6 +8,12 @@ import (
 // TestMain provides package-level test setup and teardown.
 // It ensures RootCmd state is properly managed across all tests in the package.
 func TestMain(m *testing.M) {
+	// Cross-platform subprocess helper: exit with code 1 when env flag is set.
+	// This lets tests use the test binary itself as a cross-platform "exit 1" command.
+	if os.Getenv("_ATMOS_TEST_EXIT_ONE") == "1" {
+		os.Exit(1)
+	}
+
 	// Capture initial RootCmd state.
 	initialSnapshot := snapshotRootCmdState()
 

@@ -0,0 +1,75 @@
+# Fix: Auth credential chain skipping AssumeRole when target identity credentials are cached
+
+**Date:** 2026-03-23
+
+**Reported by:** Alexander Matveev (9spokes)
+
+## Problem
+
+On EKS ARC (Actions Runner Controller) runners with IRSA, `atmos auth` returns stale cached credentials instead of performing the actual `AssumeRole` API call. Terraform runs with the runner's pod credentials instead of the Atmos-authenticated planner role.
+
+This is the **second bug** affecting IRSA on EKS. The first handles env var scrubbing — removing `AWS_WEB_IDENTITY_TOKEN_FILE`, `AWS_ROLE_ARN`, and `AWS_ROLE_SESSION_NAME` from the subprocess environment so the AWS SDK doesn't prefer pod identity over file-based credentials. This second bug is in the credential chain cache lookup and causes the `AssumeRole` step to be skipped entirely.
+
+### Root Cause
+
+In `findFirstValidCachedCredentials()`, the bottom-up scan finds valid cached credentials at the **last** identity in the chain (the target). The caller `fetchCachedCredentials()` then advances `startIndex` past the end of the chain, causing `authenticateIdentityChain`'s loop to never execute.
+
+**Walkthrough with a 2-element chain `[github-oidc, core-artifacts/terraform]`:**
+
+1. `findFirstValidCachedCredentials()` scans bottom-up, finds valid cached credentials at index 1 (`core-artifacts/terraform`) → returns `1`
+2. `fetchCachedCredentials(1)` loads those cached creds and returns `startIndex = 1 + 1 = 2`
+3. `authenticateIdentityChain(ctx, 2, cachedCreds)` runs the loop `for i := 2; i < 2` → **loop never executes**
+4. The cached credentials are returned as-is, without ever calling `identity.Authenticate()` (the actual `AssumeRole` API call)
+
+The cached credentials at the last index represent the **output** of that step. `fetchCachedCredentials` is designed to return `startIndex + 1` so the **next** step can use them as input. But when there is no next step (last in chain), the index goes past the end and the identity loop is skipped entirely.
+
+This means if the credential file ever contains stale or incorrect credentials (e.g., written by a previous auth step in the same run, or from a provider-level auth that wrote base credentials under the identity's profile), they get returned without validation through the actual AWS STS call.
+
+### Why This Manifests on EKS ARC Runners
+
+On EKS runners with IRSA, the credential file can contain provider-level (OIDC) credentials that were cached during the provider authentication step. When `findFirstValidCachedCredentials` returns the last index, `fetchCachedCredentials` advances past the chain end, and these provider-level credentials are returned as if they were the assume-role credentials — without the actual `AssumeRole` call ever happening.
+
+## Fix
+
+In `findFirstValidCachedCredentials()`, skip cached credentials at the target (last) identity and continue scanning earlier in the chain. This ensures the identity's `Authenticate()` method is always called for the final step.
+
+**Changed file:** `pkg/auth/manager_chain.go`
+
+```go
+// After validating credentials are not expired:
+if i == len(m.chain)-1 {
+    log.Debug("Skipping cached target identity credentials to force re-authentication",
+        logKeyChainIndex, i, identityNameKey, identityName)
+    continue
+}
+return i
+```
+
+**Behavior with the fix for different chain configurations:**
+
+| Chain | Cache state | `findFirstValidCachedCredentials` returns | Result |
+|-------|-------------|------------------------------------------|--------|
+| `[provider, assume-role]` | Cache at index 1 (last) | Skip index 1 → check index 0 → if valid, return 0 | AssumeRole executes with provider creds as input |
+| `[provider, assume-role]` | Cache only at index 1, nothing at 0 | Skip index 1 → index 0 invalid → return -1 | Full re-auth from provider |
+| `[provider, assume-role]` | Cache at index 0 (provider) | Return 0 | Identity chain starts at 1, AssumeRole executes |
+| `[provider, permset, assume-role]` | Cache at index 1 (middle) | Return 1 | `fetchCachedCredentials(1)` returns startIndex=2, assume-role step executes |
+
+This aligns with the existing comment on lines 30-33 of `manager_chain.go`:
+
+> "CRITICAL: Always re-authenticate through the full chain, even if the target identity has cached credentials."
+
+## Two Independent Bugs Affecting EKS ARC Runners
+
+There are two independent bugs that both affect EKS ARC runners with IRSA. Both fixes are needed for correct behavior:
+
+1. **IRSA env var scrubbing:** Pod-injected IRSA env vars (`AWS_WEB_IDENTITY_TOKEN_FILE`, `AWS_ROLE_ARN`, `AWS_ROLE_SESSION_NAME`) leak into the subprocess, causing AWS SDK to prefer web identity token auth over the Atmos-managed credential files. Fix: scrub these vars from the subprocess environment via `PrepareShellEnvironment`.
+
+2. **Credential chain cache lookup (this fix):** `findFirstValidCachedCredentials` returns the last chain index, `fetchCachedCredentials` advances past the chain end, and the `AssumeRole` call is skipped entirely. Fix: skip the last index in cache lookup to force re-authentication.
+
+Alexander confirmed that both fixes together resolved the issue for 9spokes.
+
+## Test Coverage
+
+Updated `TestManager_findFirstValidCachedCredentials` in `pkg/auth/manager_test.go`:
+
+- When both `id1` and `id2` have valid credentials, the function now returns `id1` (second-to-last, index 1) instead of `id2` (last, index 2), forcing re-authentication of the target identity.
@@ -403,6 +403,7 @@ func ExecuteHelmfile(info schema.ConfigAndStacksInfo) error {
 		envVars,
 		info.DryRun,
 		info.RedirectStdErr,
+		WithEnvironment(info.SanitizedEnv),
 	)
 	if err != nil {
 		return err

@@ -289,5 +289,6 @@ func ExecutePacker(
 		envVars,
 		info.DryRun,
 		info.RedirectStdErr,
+		WithEnvironment(info.SanitizedEnv),
 	)
 }
@@ -41,6 +41,10 @@ type shellCommandConfig struct {
 	stdoutCapture  io.Writer
 	stderrCapture  io.Writer
 	stdoutOverride io.Writer
+	// processEnv replaces os.Environ() as the process environment.
+	// When set, ExecuteShellCommand uses this instead of re-reading os.Environ().
+	// This is used when auth has already sanitized the environment (e.g., removed IRSA vars).
+	processEnv []string
 }
 
 // WithStdoutCapture returns a ShellCommandOption that tees stdout to the provided writer.
@@ -68,6 +72,17 @@ func WithStdoutOverride(w io.Writer) ShellCommandOption {
 	}
 }
 
+// WithEnvironment provides a pre-sanitized process environment for subprocess execution.
+// When provided, ExecuteShellCommand uses this instead of re-reading os.Environ().
+// Pass nil to fall back to the default os.Environ() behavior.
+func WithEnvironment(env []string) ShellCommandOption {
+	defer perf.Track(nil, "exec.WithEnvironment")()
+
+	return func(c *shellCommandConfig) {
+		c.processEnv = env
+	}
+}
+
 // ExecuteShellCommand prints and executes the provided command with args and flags.
 func ExecuteShellCommand(
 	atmosConfig schema.AtmosConfiguration,
@@ -93,9 +108,14 @@ func ExecuteShellCommand(
 	}
 
 	cmd := exec.Command(command, args...)
-	// Build environment: os.Environ() + global env (atmos.yaml) + command-specific env.
-	// Global env has lowest priority after system env, command-specific env overrides both.
-	cmdEnv := envpkg.MergeGlobalEnv(os.Environ(), atmosConfig.Env)
+	// Build environment: process env + global env (atmos.yaml) + command-specific env.
+	// When auth has sanitized the environment, cfg.processEnv is used instead of
+	// os.Environ() to avoid reintroducing problematic vars (e.g., IRSA credentials).
+	baseEnv := os.Environ()
+	if cfg.processEnv != nil {
+		baseEnv = cfg.processEnv
+	}
+	cmdEnv := envpkg.MergeGlobalEnv(baseEnv, atmosConfig.Env)
 	cmdEnv = append(cmdEnv, env...)
 	cmdEnv = append(cmdEnv, fmt.Sprintf("ATMOS_SHLVL=%d", newShellLevel))
 
@@ -395,12 +415,16 @@ func execTerraformShellCommand(
 }
 
 // ExecAuthShellCommand starts a new interactive shell with the provided authentication environment variables.
-// It increments ATMOS_SHLVL for the session, sets ATMOS_IDENTITY plus the supplied auth env vars into the shell environment (merged with the host environment), prints enter/exit messages, and launches the resolved shell command; returns an error if no suitable shell is found or if the shell process fails.
+// It increments ATMOS_SHLVL for the session, sets ATMOS_IDENTITY plus the supplied auth env vars into the shell environment, prints enter/exit messages, and launches the resolved shell command; returns an error if no suitable shell is found or if the shell process fails.
+//
+// The sanitizedEnv parameter should be a complete, pre-sanitized environment from PrepareShellEnvironment.
+// It is used directly without re-reading os.Environ(), ensuring problematic vars (e.g., IRSA credentials)
+// that were removed during auth preparation are not reintroduced.
 func ExecAuthShellCommand(
 	atmosConfig *schema.AtmosConfiguration,
 	identityName string,
 	providerName string,
-	authEnvVars map[string]string,
+	sanitizedEnv []string,
 	shellOverride string,
 	shellArgs []string,
 ) error {
@@ -414,12 +438,18 @@ func ExecAuthShellCommand(
 	// Decrement the value after exiting the shell.
 	defer decrementAtmosShellLevel()
 
-	// Convert auth env vars map to slice format.
-	authEnvList := envpkg.ConvertMapToSlice(authEnvVars)
+	// Append shell-specific env vars to the sanitized environment.
+	// The sanitizedEnv already includes os.Environ() (sanitized) + auth vars.
+	// Use UpdateEnvVar to replace-or-append each key, because os.StartProcess
+	// does not deduplicate — the first occurrence wins if duplicates exist.
+	shellEnv := append([]string{}, sanitizedEnv...)
+	shellEnv = envpkg.UpdateEnvVar(shellEnv, "ATMOS_IDENTITY", identityName)
+	shellEnv = envpkg.UpdateEnvVar(shellEnv, atmosShellLevelEnvVar, strconv.Itoa(atmosShellVal))
 
-	// Set environment variables to indicate the details of the Atmos auth shell configuration.
-	authEnvList = append(authEnvList, fmt.Sprintf("ATMOS_IDENTITY=%s", identityName))
-	authEnvList = append(authEnvList, fmt.Sprintf("%s=%d", atmosShellLevelEnvVar, atmosShellVal))
+	// Append global env from atmos.yaml.
+	for k, v := range atmosConfig.Env {
+		shellEnv = envpkg.UpdateEnvVar(shellEnv, k, v)
+	}
 
 	log.Debug("Starting a new interactive shell with authentication environment variables (type 'exit' to go back)",
 		"identity", identityName)
@@ -435,9 +465,8 @@ func ExecAuthShellCommand(
 	// Print user-facing message about entering the shell.
 	printShellEnterMessage(identityName, providerName)
 
-	// Merge env vars, ensuring authEnvList takes precedence.
-	// Include global env from atmos.yaml (lowest priority after system env).
-	mergedEnv := envpkg.MergeSystemEnvSimpleWithGlobal(authEnvList, atmosConfig.Env)
+	// Use the sanitized environment directly (no re-reading os.Environ()).
+	mergedEnv := shellEnv
 
 	// Determine shell command and args.
 	shellCommand, shellCommandArgs := determineShell(shellOverride, shellArgs)