Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 12 additions & 23 deletions cmd/auth_exec.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,24 +110,17 @@ func executeAuthExecCommandCore(cmd *cobra.Command, args []string) error {

// Prepare shell environment with file-based credentials.
// Start with current OS environment + global env from atmos.yaml and let PrepareShellEnvironment configure auth.
// PrepareShellEnvironment sanitizes the env (removes IRSA/credential vars) and adds auth vars.
baseEnv := envpkg.MergeGlobalEnv(os.Environ(), atmosConfig.Env)
envList, err := authManager.PrepareShellEnvironment(ctx, identityName, baseEnv)
if err != nil {
return fmt.Errorf("failed to prepare command environment: %w", err)
}

// Convert environment list to map for executeCommandWithEnv.
envMap := make(map[string]string)
for _, envVar := range envList {
if idx := strings.IndexByte(envVar, '='); idx >= 0 {
key := envVar[:idx]
value := envVar[idx+1:]
envMap[key] = value
}
}

// Execute the command with authentication environment.
err = executeCommandWithEnv(commandArgs, envMap)
// Execute the command with the sanitized environment directly.
// The envList already includes os.Environ() (sanitized) + auth vars,
// so we pass it as the complete subprocess environment.
err = executeCommandWithEnv(commandArgs, envList)
if err != nil {
// For any subprocess error, provide a tip about refreshing credentials.
// This helps users when AWS tokens are expired or invalid.
Expand All @@ -137,29 +130,25 @@ func executeAuthExecCommandCore(cmd *cobra.Command, args []string) error {
return nil
}

// executeCommandWithEnv executes a command with additional environment variables.
func executeCommandWithEnv(args []string, envVars map[string]string) error {
// executeCommandWithEnv executes a command with a complete environment.
// The env parameter should be a fully prepared environment (e.g., from PrepareShellEnvironment).
// It is used directly as the subprocess environment without re-reading os.Environ().
func executeCommandWithEnv(args []string, env []string) error {
if len(args) == 0 {
return fmt.Errorf(errUtils.ErrWrapFormat, errUtils.ErrNoCommandSpecified, errUtils.ErrInvalidSubcommand)
}

// Prepare the command
// Prepare the command.
cmdName := args[0]
cmdArgs := args[1:]

// Look for the command in PATH
// Look for the command in PATH.
cmdPath, err := exec.LookPath(cmdName)
if err != nil {
return fmt.Errorf(errUtils.ErrWrapFormat, errUtils.ErrCommandNotFound, err)
}

// Prepare environment variables
env := os.Environ()
for key, value := range envVars {
env = append(env, fmt.Sprintf("%s=%s", key, value))
}

// Execute the command
// Execute the command with the provided environment directly.
execCmd := exec.Command(cmdPath, cmdArgs...)
execCmd.Env = env
execCmd.Stdin = os.Stdin
Expand Down
52 changes: 29 additions & 23 deletions cmd/auth_exec_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package cmd
import (
"bytes"
"errors"
"os"
"runtime"
"testing"

Expand Down Expand Up @@ -218,52 +219,57 @@ func TestExtractIdentityFlag(t *testing.T) {
}

func TestExecuteCommandWithEnv(t *testing.T) {
// Use the test binary itself as a cross-platform subprocess helper.
// TestMain in testing_main_test.go handles _ATMOS_TEST_EXIT_ONE.
exePath, err := os.Executable()
require.NoError(t, err, "os.Executable() must succeed")

// Prerequisite: verify that env vars reach the child process.
t.Run("env propagation to subprocess", func(t *testing.T) {
// Running the test binary with -test.run=^$ matches no tests and exits 0,
// confirming the subprocess receives the provided environment.
err := executeCommandWithEnv(
[]string{exePath, "-test.run=^$"},
[]string{"TEST_VAR=test-value"},
)
assert.NoError(t, err)
})

// Test the command execution helper directly.
tests := []struct {
name string
args []string
envVars map[string]string
skipOnWindows bool
envVars []string
expectedError string
expectedCode int // Expected exit code if error is ExitCodeError
expectedCode int // Expected exit code if error is ExitCodeError.
}{
{
name: "empty args",
args: []string{},
envVars: map[string]string{},
envVars: []string{},
expectedError: "no command specified",
},
{
name: "simple echo command",
args: []string{"echo", "hello"},
envVars: map[string]string{
"TEST_VAR": "test-value",
},
skipOnWindows: true,
name: "successful command",
args: []string{exePath, "-test.run=^$"},
envVars: []string{"TEST_VAR=test-value"},
},
{
name: "nonexistent command",
args: []string{"nonexistent-command-xyz"},
envVars: map[string]string{},
envVars: []string{},
expectedError: "command not found",
},
{
name: "command with non-zero exit code",
args: []string{"sh", "-c", "exit 2"},
envVars: map[string]string{
"TEST_VAR": "test-value",
},
skipOnWindows: true,
expectedCode: 2,
name: "command with non-zero exit code",
args: []string{exePath, "-test.run=^$"},
envVars: []string{"_ATMOS_TEST_EXIT_ONE=1"},
expectedCode: 1,
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if tt.skipOnWindows && runtime.GOOS == "windows" {
t.Skipf("Skipping test on Windows: command behaves differently")
}

err := executeCommandWithEnv(tt.args, tt.envVars)

switch {
Expand All @@ -274,7 +280,7 @@ func TestExecuteCommandWithEnv(t *testing.T) {
}
case tt.expectedCode != 0:
assert.Error(t, err)
// Check that it's an ExitCodeError with the correct code
// Check that it's an ExitCodeError with the correct code.
var exitCodeErr errUtils.ExitCodeError
if assert.True(t, errors.As(err, &exitCodeErr), "error should be ExitCodeError") {
assert.Equal(t, tt.expectedCode, exitCodeErr.Code)
Expand Down
16 changes: 4 additions & 12 deletions cmd/auth_shell.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ func executeAuthShellCommandCore(cmd *cobra.Command, args []string) error {

// Prepare shell environment with file-based credentials.
// Start with current OS environment and let PrepareShellEnvironment configure auth.
// PrepareShellEnvironment sanitizes the env (removes IRSA/credential vars) and adds auth vars.
envList, err := authManager.PrepareShellEnvironment(ctx, identityName, os.Environ())
if err != nil {
return fmt.Errorf("failed to prepare shell environment: %w", err)
Expand All @@ -129,18 +130,9 @@ func executeAuthShellCommandCore(cmd *cobra.Command, args []string) error {
// Get provider name from the identity to display in shell messages.
providerName := authManager.GetProviderForIdentity(identityName)

// Execute the shell with authentication environment.
// ExecAuthShellCommand expects env vars as a map, so convert the list.
envMap := make(map[string]string)
for _, envVar := range envList {
if idx := strings.IndexByte(envVar, '='); idx >= 0 {
key := envVar[:idx]
value := envVar[idx+1:]
envMap[key] = value
}
}

return exec.ExecAuthShellCommand(atmosConfigPtr, identityName, providerName, envMap, shell, shellArgs)
// Execute the shell with the sanitized environment directly.
// envList already includes os.Environ() (sanitized) + auth vars.
return exec.ExecAuthShellCommand(atmosConfigPtr, identityName, providerName, envList, shell, shellArgs)
}

// extractAuthShellFlags extracts --identity and --shell flags from args and returns the remaining shell args.
Expand Down
6 changes: 6 additions & 0 deletions cmd/testing_main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@ import (
// TestMain provides package-level test setup and teardown.
// It ensures RootCmd state is properly managed across all tests in the package.
func TestMain(m *testing.M) {
// Cross-platform subprocess helper: exit with code 1 when env flag is set.
// This lets tests use the test binary itself as a cross-platform "exit 1" command.
if os.Getenv("_ATMOS_TEST_EXIT_ONE") == "1" {
os.Exit(1)
}

// Capture initial RootCmd state.
initialSnapshot := snapshotRootCmdState()

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# Fix: Auth credential chain skipping AssumeRole when target identity credentials are cached

**Date:** 2026-03-23

**Reported by:** Alexander Matveev (9spokes)

## Problem

On EKS ARC (Actions Runner Controller) runners with IRSA, `atmos auth` returns stale cached credentials instead of performing the actual `AssumeRole` API call. Terraform runs with the runner's pod credentials instead of the Atmos-authenticated planner role.

This is the **second bug** affecting IRSA on EKS. The first handles env var scrubbing — removing `AWS_WEB_IDENTITY_TOKEN_FILE`, `AWS_ROLE_ARN`, and `AWS_ROLE_SESSION_NAME` from the subprocess environment so the AWS SDK doesn't prefer pod identity over file-based credentials. This second bug is in the credential chain cache lookup and causes the `AssumeRole` step to be skipped entirely.

### Root Cause

In `findFirstValidCachedCredentials()`, the bottom-up scan finds valid cached credentials at the **last** identity in the chain (the target). The caller `fetchCachedCredentials()` then advances `startIndex` past the end of the chain, causing `authenticateIdentityChain`'s loop to never execute.

**Walkthrough with a 2-element chain `[github-oidc, core-artifacts/terraform]`:**

1. `findFirstValidCachedCredentials()` scans bottom-up, finds valid cached credentials at index 1 (`core-artifacts/terraform`) → returns `1`
2. `fetchCachedCredentials(1)` loads those cached creds and returns `startIndex = 1 + 1 = 2`
3. `authenticateIdentityChain(ctx, 2, cachedCreds)` runs the loop `for i := 2; i < 2` → **loop never executes**
4. The cached credentials are returned as-is, without ever calling `identity.Authenticate()` (the actual `AssumeRole` API call)

The cached credentials at the last index represent the **output** of that step. `fetchCachedCredentials` is designed to return `startIndex + 1` so the **next** step can use them as input. But when there is no next step (last in chain), the index goes past the end and the identity loop is skipped entirely.

This means if the credential file ever contains stale or incorrect credentials (e.g., written by a previous auth step in the same run, or from a provider-level auth that wrote base credentials under the identity's profile), they get returned without validation through the actual AWS STS call.

### Why This Manifests on EKS ARC Runners

On EKS runners with IRSA, the credential file can contain provider-level (OIDC) credentials that were cached during the provider authentication step. When `findFirstValidCachedCredentials` returns the last index, `fetchCachedCredentials` advances past the chain end, and these provider-level credentials are returned as if they were the assume-role credentials — without the actual `AssumeRole` call ever happening.

## Fix

In `findFirstValidCachedCredentials()`, skip cached credentials at the target (last) identity and continue scanning earlier in the chain. This ensures the identity's `Authenticate()` method is always called for the final step.

**Changed file:** `pkg/auth/manager_chain.go`

```go
// After validating credentials are not expired:
if i == len(m.chain)-1 {
log.Debug("Skipping cached target identity credentials to force re-authentication",
logKeyChainIndex, i, identityNameKey, identityName)
continue
}
return i
```

**Behavior with the fix for different chain configurations:**

| Chain | Cache state | `findFirstValidCachedCredentials` returns | Result |
|-------|-------------|------------------------------------------|--------|
| `[provider, assume-role]` | Cache at index 1 (last) | Skip index 1 → check index 0 → if valid, return 0 | AssumeRole executes with provider creds as input |
| `[provider, assume-role]` | Cache only at index 1, nothing at 0 | Skip index 1 → index 0 invalid → return -1 | Full re-auth from provider |
| `[provider, assume-role]` | Cache at index 0 (provider) | Return 0 | Identity chain starts at 1, AssumeRole executes |
| `[provider, permset, assume-role]` | Cache at index 1 (middle) | Return 1 | `fetchCachedCredentials(1)` returns startIndex=2, assume-role step executes |

This aligns with the existing comment on lines 30-33 of `manager_chain.go`:

> "CRITICAL: Always re-authenticate through the full chain, even if the target identity has cached credentials."

## Two Independent Bugs Affecting EKS ARC Runners

There are two independent bugs that both affect EKS ARC runners with IRSA. Both fixes are needed for correct behavior:

1. **IRSA env var scrubbing:** Pod-injected IRSA env vars (`AWS_WEB_IDENTITY_TOKEN_FILE`, `AWS_ROLE_ARN`, `AWS_ROLE_SESSION_NAME`) leak into the subprocess, causing AWS SDK to prefer web identity token auth over the Atmos-managed credential files. Fix: scrub these vars from the subprocess environment via `PrepareShellEnvironment`.

2. **Credential chain cache lookup (this fix):** `findFirstValidCachedCredentials` returns the last chain index, `fetchCachedCredentials` advances past the chain end, and the `AssumeRole` call is skipped entirely. Fix: skip the last index in cache lookup to force re-authentication.

Alexander confirmed that both fixes together resolved the issue for 9spokes.

## Test Coverage

Updated `TestManager_findFirstValidCachedCredentials` in `pkg/auth/manager_test.go`:

- When both `id1` and `id2` have valid credentials, the function now returns `id1` (second-to-last, index 1) instead of `id2` (last, index 2), forcing re-authentication of the target identity.
1 change: 1 addition & 0 deletions internal/exec/helmfile.go
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,7 @@ func ExecuteHelmfile(info schema.ConfigAndStacksInfo) error {
envVars,
info.DryRun,
info.RedirectStdErr,
WithEnvironment(info.SanitizedEnv),
)
if err != nil {
return err
Expand Down
1 change: 1 addition & 0 deletions internal/exec/packer.go
Original file line number Diff line number Diff line change
Expand Up @@ -289,5 +289,6 @@ func ExecutePacker(
envVars,
info.DryRun,
info.RedirectStdErr,
WithEnvironment(info.SanitizedEnv),
)
}
55 changes: 42 additions & 13 deletions internal/exec/shell_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,10 @@ type shellCommandConfig struct {
stdoutCapture io.Writer
stderrCapture io.Writer
stdoutOverride io.Writer
// processEnv replaces os.Environ() as the process environment.
// When set, ExecuteShellCommand uses this instead of re-reading os.Environ().
// This is used when auth has already sanitized the environment (e.g., removed IRSA vars).
processEnv []string
}

// WithStdoutCapture returns a ShellCommandOption that tees stdout to the provided writer.
Expand Down Expand Up @@ -68,6 +72,17 @@ func WithStdoutOverride(w io.Writer) ShellCommandOption {
}
}

// WithEnvironment provides a pre-sanitized process environment for subprocess execution.
// When provided, ExecuteShellCommand uses this instead of re-reading os.Environ().
// Pass nil to fall back to the default os.Environ() behavior.
func WithEnvironment(env []string) ShellCommandOption {
defer perf.Track(nil, "exec.WithEnvironment")()

return func(c *shellCommandConfig) {
c.processEnv = env
}
}

// ExecuteShellCommand prints and executes the provided command with args and flags.
func ExecuteShellCommand(
atmosConfig schema.AtmosConfiguration,
Expand All @@ -93,9 +108,14 @@ func ExecuteShellCommand(
}

cmd := exec.Command(command, args...)
// Build environment: os.Environ() + global env (atmos.yaml) + command-specific env.
// Global env has lowest priority after system env, command-specific env overrides both.
cmdEnv := envpkg.MergeGlobalEnv(os.Environ(), atmosConfig.Env)
// Build environment: process env + global env (atmos.yaml) + command-specific env.
// When auth has sanitized the environment, cfg.processEnv is used instead of
// os.Environ() to avoid reintroducing problematic vars (e.g., IRSA credentials).
baseEnv := os.Environ()
if cfg.processEnv != nil {
baseEnv = cfg.processEnv
}
cmdEnv := envpkg.MergeGlobalEnv(baseEnv, atmosConfig.Env)
cmdEnv = append(cmdEnv, env...)
cmdEnv = append(cmdEnv, fmt.Sprintf("ATMOS_SHLVL=%d", newShellLevel))

Expand Down Expand Up @@ -395,12 +415,16 @@ func execTerraformShellCommand(
}

// ExecAuthShellCommand starts a new interactive shell with the provided authentication environment variables.
// It increments ATMOS_SHLVL for the session, sets ATMOS_IDENTITY plus the supplied auth env vars into the shell environment (merged with the host environment), prints enter/exit messages, and launches the resolved shell command; returns an error if no suitable shell is found or if the shell process fails.
// It increments ATMOS_SHLVL for the session, sets ATMOS_IDENTITY plus the supplied auth env vars into the shell environment, prints enter/exit messages, and launches the resolved shell command; returns an error if no suitable shell is found or if the shell process fails.
//
// The sanitizedEnv parameter should be a complete, pre-sanitized environment from PrepareShellEnvironment.
// It is used directly without re-reading os.Environ(), ensuring problematic vars (e.g., IRSA credentials)
// that were removed during auth preparation are not reintroduced.
func ExecAuthShellCommand(
atmosConfig *schema.AtmosConfiguration,
identityName string,
providerName string,
authEnvVars map[string]string,
sanitizedEnv []string,
shellOverride string,
shellArgs []string,
) error {
Expand All @@ -414,12 +438,18 @@ func ExecAuthShellCommand(
// Decrement the value after exiting the shell.
defer decrementAtmosShellLevel()

// Convert auth env vars map to slice format.
authEnvList := envpkg.ConvertMapToSlice(authEnvVars)
// Append shell-specific env vars to the sanitized environment.
// The sanitizedEnv already includes os.Environ() (sanitized) + auth vars.
// Use UpdateEnvVar to replace-or-append each key, because os.StartProcess
// does not deduplicate — the first occurrence wins if duplicates exist.
shellEnv := append([]string{}, sanitizedEnv...)
shellEnv = envpkg.UpdateEnvVar(shellEnv, "ATMOS_IDENTITY", identityName)
shellEnv = envpkg.UpdateEnvVar(shellEnv, atmosShellLevelEnvVar, strconv.Itoa(atmosShellVal))

// Set environment variables to indicate the details of the Atmos auth shell configuration.
authEnvList = append(authEnvList, fmt.Sprintf("ATMOS_IDENTITY=%s", identityName))
authEnvList = append(authEnvList, fmt.Sprintf("%s=%d", atmosShellLevelEnvVar, atmosShellVal))
// Append global env from atmos.yaml.
for k, v := range atmosConfig.Env {
shellEnv = envpkg.UpdateEnvVar(shellEnv, k, v)
}

log.Debug("Starting a new interactive shell with authentication environment variables (type 'exit' to go back)",
"identity", identityName)
Expand All @@ -435,9 +465,8 @@ func ExecAuthShellCommand(
// Print user-facing message about entering the shell.
printShellEnterMessage(identityName, providerName)

// Merge env vars, ensuring authEnvList takes precedence.
// Include global env from atmos.yaml (lowest priority after system env).
mergedEnv := envpkg.MergeSystemEnvSimpleWithGlobal(authEnvList, atmosConfig.Env)
// Use the sanitized environment directly (no re-reading os.Environ()).
mergedEnv := shellEnv

// Determine shell command and args.
shellCommand, shellCommandArgs := determineShell(shellOverride, shellArgs)
Expand Down
Loading
Loading