camunda-platform-helm/scripts/deploy-camunda/matrix/runner.go at ca30127f3978686d5eafabfaf5b55acae7e7587a · camunda/camunda-platform-helm · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
package matrix

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"os"
	"path/filepath"
	"regexp"
	"sort"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/jwalton/gchalk"

	"scripts/camunda-core/pkg/docker"
	"scripts/camunda-core/pkg/executil"
	"scripts/camunda-core/pkg/helm"
	"scripts/camunda-core/pkg/kube"
	"scripts/camunda-core/pkg/logging"
	"scripts/camunda-core/pkg/scenarios"
	"scripts/camunda-core/pkg/versionmatrix"
	"scripts/camunda-deployer/pkg/deployer"
	"scripts/deploy-camunda/auth0"
	"scripts/deploy-camunda/config"
	"scripts/deploy-camunda/deploy"
	"scripts/deploy-camunda/entra"
	"scripts/prepare-helm-values/pkg/env"
)

// Pool counts used when distributing matrix entries round-robin across the
// search-backend pools. Both values match the 4-cluster pool infra.
const (
	// numESPools is the number of Elasticsearch pools across which matrix
	// entries are distributed.
	numESPools = 4

	// numOSPools is the number of OpenSearch pools across which matrix
	// entries are distributed.
	numOSPools = 4
)

// RunOptions controls matrix execution. It is passed by value to Run and to
// the per-entry helpers in this package.
//
// Several settings come in pairs: a per-key map (keyed by platform or chart
// version) plus a scalar fallback used when the map has no matching key
// (KubeContexts/KubeContext, EnvFiles/EnvFile, IngressBaseDomains/IngressBaseDomain,
// VaultBackedSecrets/UseVaultBackedSecrets). In every pair the map entry
// takes priority over the fallback.
type RunOptions struct {
	// DryRun logs what would be done without executing.
	// Run checks DryRun before Coverage, so DryRun wins when both are set.
	DryRun bool
	// StopOnFailure stops the run on the first failure.
	// In parallel mode, this cancels in-flight entries and prevents new ones from starting.
	StopOnFailure bool
	// Cleanup deletes each entry's namespace immediately after its deployment
	// and tests complete (regardless of success or failure). This frees cluster
	// resources as early as possible rather than waiting for the entire run to finish.
	Cleanup bool
	// KubeContexts maps platform names to Kubernetes contexts, e.g.,
	// {"gke": "gke_my-project_us-east1_cluster", "eks": "arn:aws:eks:..."}
	// When an entry's platform matches a key, that context is used for deployment and cleanup.
	KubeContexts map[string]string
	// KubeContext is a fallback Kubernetes context used when no platform-specific
	// context is configured. If both KubeContexts and KubeContext are set, the
	// platform-specific context takes priority.
	KubeContext string
	// NamespacePrefix is prepended to generated namespaces.
	NamespacePrefix string
	// Platform overrides the platform for all entries.
	Platform string
	// MaxParallel controls how many entries run concurrently.
	// 0 or 1 means sequential execution (default). Values > 1 enable parallel execution
	// with at most MaxParallel entries running simultaneously.
	MaxParallel int
	// TestIT runs integration tests after each deployment.
	TestIT bool
	// TestE2E runs e2e tests after each deployment.
	TestE2E bool
	// TestAll runs both integration and e2e tests after each deployment.
	TestAll bool
	// RepoRoot is the repository root path.
	RepoRoot string
	// EnvFiles maps chart versions to .env file paths, e.g.,
	// {"8.9": ".env.89", "8.8": ".env.88"}
	// When an entry's version matches a key, that .env file is loaded before deployment.
	EnvFiles map[string]string
	// EnvFile is a fallback .env file used when no version-specific file is configured.
	// If both EnvFiles and EnvFile are set, the version-specific file takes priority.
	EnvFile string
	// KeycloakHost is the external Keycloak hostname.
	// Defaults to config.DefaultKeycloakHost when empty.
	KeycloakHost string
	// KeycloakProtocol is the protocol for the external Keycloak (e.g., "https").
	// Defaults to config.DefaultKeycloakProtocol when empty.
	KeycloakProtocol string
	// IngressBaseDomains maps platform names to ingress base domains, e.g.,
	// {"gke": "ci.distro.ultrawombat.com", "eks": "distribution.aws.camunda.cloud"}
	// When an entry's platform matches a key, that domain is used for ingress hostname construction.
	IngressBaseDomains map[string]string
	// IngressBaseDomain is a fallback base domain for ingress hosts used when no
	// platform-specific domain is configured. If both IngressBaseDomains and
	// IngressBaseDomain are set, the platform-specific domain takes priority.
	// Valid values: ci.distro.ultrawombat.com, distribution.aws.camunda.cloud
	IngressBaseDomain string
	// LogLevel controls the log verbosity for each entry's deployment.
	// Valid values: debug, info, warn, error. Defaults to "info" if empty.
	LogLevel string
	// SkipDependencyUpdate skips running "helm dependency update" before deploying.
	// Default is false, meaning dependency update runs for every entry.
	SkipDependencyUpdate bool
	// VaultBackedSecrets maps platform names to whether vault-backed secrets should be used, e.g.,
	// {"eks": true, "gke": false}
	// When an entry's platform matches a key, the corresponding value controls whether
	// the vault-backend ClusterSecretStore and -vault.yaml manifest variants are selected.
	VaultBackedSecrets map[string]bool
	// UseVaultBackedSecrets is a fallback for platforms not in VaultBackedSecrets.
	// If both VaultBackedSecrets and UseVaultBackedSecrets are set, the platform-specific
	// value takes priority.
	UseVaultBackedSecrets bool
	// DeleteNamespaceFirst deletes the namespace before deploying each matrix entry.
	// This ensures a clean-slate deployment by removing any existing resources in the namespace.
	DeleteNamespaceFirst bool
	// Coverage produces a layer-breakdown report showing what IS tested in the matrix.
	// Behaves like DryRun (no deployment), but outputs a focused table showing each
	// scenario's resolved layers (identity, persistence, platform, infra-type, features, flow).
	// Has no effect when DryRun is also set, because Run handles DryRun first.
	Coverage bool
	// UpgradeFromVersion overrides the auto-resolved "from" chart version for upgrade flows.
	// When set, this version is used instead of resolving from version-matrix JSON files.
	// Only applies to entries with upgrade flows (upgrade-patch, upgrade-minor, modular-upgrade-minor).
	UpgradeFromVersion string
	// HelmTimeout is the timeout in minutes for each Helm deployment.
	// Applies uniformly to all matrix entries (install, upgrade Step 1, upgrade Step 2).
	// When <= 0, deploy.Execute defaults to 5 minutes.
	HelmTimeout int
	// DockerUsername is the Harbor registry username for pulling images.
	// When empty, the deployer falls back to HARBOR_USERNAME, TEST_DOCKER_USERNAME_CAMUNDA_CLOUD, or NEXUS_USERNAME env vars.
	DockerUsername string
	// DockerPassword is the Harbor registry password for pulling images.
	// When empty, the deployer falls back to HARBOR_PASSWORD, TEST_DOCKER_PASSWORD_CAMUNDA_CLOUD, or NEXUS_PASSWORD env vars.
	DockerPassword string
	// EnsureDockerRegistry creates a Harbor registry secret in each entry's namespace.
	// When true, the deployer performs docker login and creates a registry-camunda-cloud
	// Kubernetes secret of type kubernetes.io/dockerconfigjson.
	// In a real (non-dry-run, non-coverage) run, Run additionally performs the
	// Harbor login once up front, before any entry is dispatched.
	EnsureDockerRegistry bool
	// DockerHubUsername is the Docker Hub registry username.
	// When empty, the deployer falls back to DOCKERHUB_USERNAME or TEST_DOCKER_USERNAME env vars.
	DockerHubUsername string
	// DockerHubPassword is the Docker Hub registry password.
	// When empty, the deployer falls back to DOCKERHUB_PASSWORD or TEST_DOCKER_PASSWORD env vars.
	DockerHubPassword string
	// EnsureDockerHub creates a Docker Hub pull secret (index-docker-io) in each entry's namespace.
	// When true, the deployer performs docker login and creates an index-docker-io
	// Kubernetes secret of type kubernetes.io/dockerconfigjson.
	// In a real (non-dry-run, non-coverage) run, Run additionally performs the
	// Docker Hub login once up front, before any entry is dispatched.
	EnsureDockerHub bool
	// UseLatest applies values-latest.yaml from each chart root instead of values-digest.yaml.
	// This overrides the default digest-based image pinning with the latest available tags.
	UseLatest bool
	// DisableImageTags disables SNAPSHOT image tag overrides from env vars for all entries,
	// regardless of per-scenario image-tags config in ci-test-config.yaml.
	// Runtime override that trumps entry.ImageTags — same pattern as UseQA.
	// Use when deploying an OCI release artifact whose values.yaml image versions are authoritative.
	DisableImageTags bool
	// UseQA forces the base-qa layer to be included for all entries, regardless of each
	// entry's per-scenario qa setting in ci-test-config.yaml.
	UseQA bool
	// OnEntryStart is called when a matrix entry begins execution.
	// The callback receives the entry and its resolved namespace.
	// Nil disables the callback (zero overhead for existing CLI behavior).
	OnEntryStart func(entry Entry, namespace string)
	// OnEntryComplete is called when a matrix entry finishes execution.
	// The callback receives the entry and its full result (including error and duration).
	// Nil disables the callback (zero overhead for existing CLI behavior).
	OnEntryComplete func(entry Entry, result RunResult)
	// OnPhaseChange is called when a matrix entry transitions to a new phase
	// (e.g., "preparing", "deploying", "step-1", "step-2", "testing", "cleanup").
	// Nil disables the callback.
	OnPhaseChange func(entry Entry, phase string)
	// LogDir is the directory for per-entry log files. When set, test script
	// output (IT/e2e) is redirected to per-entry files instead of the terminal.
	LogDir string
	// ExtraHelmArgs are appended to every helm command for every entry. CI uses
	// this to inject license-key --set-file flags whose values would otherwise be
	// shell-escaped incorrectly via --set.
	ExtraHelmArgs []string
	// ExtraHelmSets are key=value strings applied as extra --set pairs for every
	// entry. CI uses this for invariant flags like
	// orchestration.upgrade.allowPreReleaseImages=true.
	ExtraHelmSets []string
	// NamespaceOverride, when non-empty, replaces the computed namespace for
	// every entry. Used by per-scenario CI workflows that pre-create the
	// namespace (with vault secrets, TLS certs, docker pull-secrets) before
	// invoking matrix run, so the install lands in the same namespace.
	// Only meaningful when filters narrow the run to a single entry.
	NamespaceOverride string
	// ChartRef, when non-empty, overrides the chart source for helm install/upgrade.
	// This can be an OCI reference (e.g., "oci://registry.camunda.cloud/team-distribution/camunda-platform")
	// or a path to a local .tgz file. When set, the matrix runner uses this as the
	// chart argument for `helm upgrade --install` instead of the local chart directory.
	// The local chart directory (entry.ChartPath) is still used for values file resolution
	// (scenario layers, chart-root overlays). SkipDependencyUpdate is forced to true.
	ChartRef string
	// ChartRefVersion is the chart version to install from ChartRef (e.g., "13-rc-latest").
	// Only meaningful when ChartRef is set. Passed as --version to helm.
	ChartRefVersion string
}

// applyChartRefOverride redirects chart to the external chart source named by
// opts.ChartRef (an OCI reference or a local .tgz path).
//
// When opts.ChartRef is empty the function does nothing and reports false.
// Otherwise it sets chart.Chart and chart.ChartVersion from opts, forces
// chart.SkipDependencyUpdate so helm does not attempt `dependency update`
// against an external/packaged chart, logs the override, and reports true.
// chart.ChartPath is deliberately not modified, so values-file resolution
// (scenario directory, chart-root overlays) keeps using the local repo.
// Keeping the logic in one place lets executeEntry and tests share it.
func applyChartRefOverride(chart *config.ChartFlags, opts RunOptions) bool {
	ref := opts.ChartRef
	if ref == "" {
		// No external chart requested; leave chart untouched.
		return false
	}
	version := opts.ChartRefVersion

	chart.Chart, chart.ChartVersion = ref, version
	chart.SkipDependencyUpdate = true

	logging.Logger.Info().
		Str("chartRef", ref).
		Str("chartVersion", version).
		Msg("Using external chart reference (OCI/tgz) instead of local chart directory")
	return true
}

// RunResult holds the result of a single matrix entry execution.
//
// Entry is the matrix entry the result belongs to; Namespace and KubeContext
// record where it was (or would be) deployed. Error is nil on success; Run
// counts every result with a non-nil Error as a failed entry when building
// its summary error. In dry-run mode only Entry, Namespace and KubeContext
// are populated for resolvable entries, and only Entry and Error for entries
// whose deployment config fails validation.
type RunResult struct {
	Entry       Entry
	Namespace   string
	KubeContext string
	Error       error
	Duration    time.Duration // Wall-clock time for this entry's execution.
	Diagnostics string        // Post-failure diagnostics run directory path

	// venomOpts stores the Entra options used to provision a venom app for OIDC entries.
	// Populated only when the entry uses OIDC authentication. Used during cleanup to
	// delete the corresponding Entra app registration.
	venomOpts *entra.Options

	// auth0Opts stores the Auth0 options used to provision per-component clients
	// for Auth0 entries. Populated only when entry.Identity == "auth0". Used
	// during cleanup to delete the corresponding Auth0 clients.
	auth0Opts *auth0.Options
}

// Run executes the matrix entries, building RuntimeFlags for each and calling deploy.Execute().
//
// Execution mode:
//   - An empty entries slice is rejected with an error.
//   - opts.DryRun short-circuits into dryRun; otherwise opts.Coverage
//     short-circuits into coverageReport. Neither mode deploys anything.
//   - opts.MaxParallel > 1 runs up to MaxParallel entries concurrently;
//     any other value processes entries sequentially.
//
// If opts.Cleanup is enabled, each entry's namespace is deleted immediately
// after that entry's deployment and tests complete (regardless of success or
// failure), freeing cluster resources as early as possible.
//
// The returned error is the runner's own error when it reports one;
// otherwise it is the summary produced by synthesizeRunError, so callers
// (and CI steps) see a non-nil error whenever any entry failed or the run
// was cancelled before all entries started.
func Run(ctx context.Context, entries []Entry, opts RunOptions) ([]RunResult, error) {
	if len(entries) == 0 {
		return nil, fmt.Errorf("no matrix entries to run")
	}

	// Report-only modes never touch the cluster. Dry-run takes precedence
	// over coverage and is always sequential.
	switch {
	case opts.DryRun:
		return dryRun(entries, opts), nil
	case opts.Coverage:
		return coverageReport(entries, opts), nil
	}

	// Log in to the registries exactly ONCE before any entry is dispatched.
	// Concurrent `docker login` calls cause keychain conflicts on macOS
	// ("item already exists" -25299). Each entry's deployer then runs with
	// SkipDockerLogin=true and only creates the per-namespace K8s pull secrets.
	if opts.EnsureDockerHub {
		if err := docker.EnsureDockerHubLogin(ctx, opts.DockerHubUsername, opts.DockerHubPassword); err != nil {
			return nil, fmt.Errorf("failed to ensure Docker Hub login: %w", err)
		}
	}
	if opts.EnsureDockerRegistry {
		if err := docker.EnsureHarborLogin(ctx, opts.DockerUsername, opts.DockerPassword); err != nil {
			return nil, fmt.Errorf("failed to ensure Harbor login: %w", err)
		}
	}

	// Touch each unique kube context once, sequentially. For Teleport-managed
	// clusters (EKS) the first API call may open an interactive browser login;
	// warming up here yields one prompt per context instead of N goroutines
	// racing for it.
	if err := warmUpKubeContexts(ctx, entries, opts); err != nil {
		return nil, err
	}

	var (
		results []RunResult
		runErr  error
	)
	if opts.MaxParallel > 1 {
		logging.Logger.Info().
			Int("maxParallel", opts.MaxParallel).
			Int("totalEntries", len(entries)).
			Msg("Starting parallel matrix run")
		results, runErr = runParallel(ctx, entries, opts)
	} else {
		results, runErr = runSequential(ctx, entries, opts)
	}
	if runErr != nil {
		// Early-termination error (e.g. StopOnFailure) — report it as is.
		return results, runErr
	}

	// No explicit error from the runner: derive one from the results so that
	// failed entries, or a context cancelled before dispatch (runParallel
	// returns nil, nil in that case), still produce a non-zero exit code.
	return results, synthesizeRunError(ctx, results, len(entries))
}

// synthesizeRunError derives a summary error from a finished run.
//
// It returns, in order of precedence:
//   - a "run cancelled" error wrapping ctx.Err() when the context is done and
//     fewer results exist than totalEntries (entries that never started,
//     e.g. the parent ctx was already done when runParallel began);
//   - an "N of M matrix entries failed" error when at least one result
//     carries a non-nil Error (M is the number of results, not totalEntries);
//   - nil otherwise.
//
// It is kept as an unexported helper so tests exercise the exact production
// logic.
func synthesizeRunError(ctx context.Context, results []RunResult, totalEntries int) error {
	produced := len(results)

	// Cancellation with missing results means some entries were never dispatched.
	if cause := ctx.Err(); cause != nil && produced < totalEntries {
		return fmt.Errorf("run cancelled: %d of %d entries never started: %w",
			totalEntries-produced, totalEntries, cause)
	}

	failed := 0
	for i := range results {
		if results[i].Error != nil {
			failed++
		}
	}
	if failed == 0 {
		return nil
	}
	return fmt.Errorf("%d of %d matrix entries failed", failed, produced)
}

// dryRunEntry holds resolved details for one matrix entry in dry-run mode.
// dryRun builds one value per entry whose deployment config validates, and
// formatDryRunOutput renders them. The leading fields mirror what the
// resolve* helpers and RunOptions produce for the entry (namespace, kube
// context, platform, ingress host, env file, vault/registry toggles).
type dryRunEntry struct {
	entry                Entry
	namespace            string
	kubeCtx              string
	platform             string
	infraType            string
	ingressHost          string
	envFile              string
	useVault             bool
	deleteNS             bool
	ensureDockerRegistry bool
	ensureDockerHub      bool
	// Resolved layer config (derived from scenario name + explicit overrides),
	// taken from the deployment config built for the entry.
	identity    string
	persistence string
	features    []string
	layerFiles  []string // short relative paths, e.g., "values/identity/keycloak.yaml"
	// Upgrade flow fields (populated only for upgrade flows), plus the
	// chart-root overlays found on disk for the entry.
	upgradeFromVersion string   // The "from" chart version for upgrade flows (e.g., "13.5.0").
	preUpgradeScript   string   // Path to the pre-upgrade script (e.g., "charts/.../pre-upgrade-patch.sh"), or empty.
	upgradeOnly        bool     // True for modular-upgrade-minor (single-step upgrade, no install).
	step1ValuesFrom    string   // For upgrade-minor Step 1: the previous version whose values are used (e.g., "8.7"), or empty.
	chartRootOverlays  []string // Chart-root overlay files that will be applied (e.g., ["enterprise", "digest"]).
}

// dryRun resolves what would be deployed for every entry and prints a clean
// summary to stdout; nothing is deployed.
//
// Entries are visited version by version, in the order given by VersionOrder.
// Each entry yields exactly one RunResult: entries whose deployment config
// fails validation get a result carrying the error (and are left out of the
// printed summary data), all others get a result with the resolved namespace
// and kube context.
func dryRun(entries []Entry, opts RunOptions) []RunResult {
	var (
		results  []RunResult
		resolved []dryRunEntry
	)
	versions := VersionOrder(entries)
	byVersion := GroupByVersion(entries)

	for _, version := range versions {
		for _, entry := range byVersion[version] {
			namespace := resolveNamespace(opts, entry)
			platform := resolvePlatform(opts, entry)
			kubeCtx := resolveKubeContext(opts, platform)
			envFile := resolveEnvFile(opts, entry.Version)
			useVault := resolveUseVaultBackedSecrets(opts, platform)

			// The ingress host is only meaningful when a base domain is configured.
			var ingressHost string
			if baseDomain := resolveIngressBaseDomain(opts, platform); baseDomain != "" {
				ingressHost = namespace + "." + baseDomain
			}

			// Resolve deployment layers via the canonical builder (same logic as deploy.go prepareScenarioValues).
			scenarioDir := filepath.Join(entry.ChartPath, "test/integration/scenarios/chart-full-setup")
			overrides := scenarios.BuilderOverrides{
				Identity:    entry.Identity,
				Persistence: entry.Persistence,
				Platform:    platform,
				Features:    entry.Features,
				InfraType:   entry.InfraType,
				Flow:        entry.Flow,
				QA:          entry.QA || opts.UseQA,
				ImageTags:   effectiveImageTags(entry.ImageTags, opts),
				Upgrade:     entry.Upgrade,
			}
			deployConfig, buildErr := scenarios.BuildDeploymentConfig(entry.Scenario, overrides)
			if buildErr != nil {
				results = append(results, RunResult{
					Entry: entry,
					Error: fmt.Errorf("deployment config validation failed: %w", buildErr),
				})
				continue
			}

			// Best-effort: list layer files relative to the scenario directory,
			// falling back to the bare file name when no relative path exists.
			// A path-resolution error simply leaves the list empty.
			var layerFiles []string
			paths, pathErr := deployConfig.ResolvePaths(scenarioDir)
			if pathErr == nil {
				for _, p := range paths {
					short, relErr := filepath.Rel(scenarioDir, p)
					if relErr != nil {
						short = filepath.Base(p)
					}
					layerFiles = append(layerFiles, short)
				}
			}

			// Upgrade-flow details and chart-root overlays (all best-effort).
			fromVersion := resolveUpgradeFromVersionQuiet(opts.RepoRoot, entry, opts.UpgradeFromVersion)
			preScript := resolvePreUpgradeScriptQuiet(opts.RepoRoot, entry)
			upgradeOnly := versionmatrix.IsUpgradeOnlyFlow(entry.Flow)
			step1From := resolveStep1ValuesFromQuiet(entry)
			overlays := resolveChartRootOverlaysQuiet(entry.ChartPath, entry, opts)

			resolved = append(resolved, dryRunEntry{
				entry:                entry,
				namespace:            namespace,
				kubeCtx:              kubeCtx,
				platform:             platform,
				infraType:            entry.InfraType,
				ingressHost:          ingressHost,
				envFile:              envFile,
				useVault:             useVault,
				deleteNS:             opts.DeleteNamespaceFirst,
				ensureDockerRegistry: opts.EnsureDockerRegistry,
				ensureDockerHub:      opts.EnsureDockerHub,
				identity:             deployConfig.Identity,
				persistence:          deployConfig.Persistence,
				features:             deployConfig.Features,
				layerFiles:           layerFiles,
				upgradeFromVersion:   fromVersion,
				preUpgradeScript:     preScript,
				upgradeOnly:          upgradeOnly,
				step1ValuesFrom:      step1From,
				chartRootOverlays:    overlays,
			})
			results = append(results, RunResult{
				Entry:       entry,
				Namespace:   namespace,
				KubeContext: kubeCtx,
			})
		}
	}

	// Print clean dry-run output.
	fmt.Fprintln(os.Stdout, formatDryRunOutput(resolved, versions, opts))
	return results
}

// Style helpers for dry-run output. Each one routes through
// logging.Emphasize so colors are automatically disabled in CI/non-TTY
// environments. They are package-level function variables.
var (
	// dryHead renders section and entry headings in bold.
	dryHead = func(s string) string {
		return logging.Emphasize(s, gchalk.Bold)
	}
	// dryKey renders labels/keys in cyan.
	dryKey = func(s string) string {
		return logging.Emphasize(s, gchalk.Cyan)
	}
	// dryVal renders values in magenta.
	dryVal = func(s string) string {
		return logging.Emphasize(s, gchalk.Magenta)
	}
	// dryOk renders positive/neutral attributes in green.
	dryOk = func(s string) string {
		return logging.Emphasize(s, gchalk.Green)
	}
	// dryWarn renders noteworthy values in yellow.
	dryWarn = func(s string) string {
		return logging.Emphasize(s, gchalk.Yellow)
	}
	// dryFail renders failures in red.
	dryFail = func(s string) string {
		return logging.Emphasize(s, gchalk.Red)
	}
	// dryDim renders secondary text in bright-black italics.
	dryDim = func(s string) string {
		return logging.Emphasize(s, gchalk.WithBrightBlack().Italic)
	}
)

// resolveUpgradeFromVersionQuiet resolves the "from" chart version shown for
// upgrade flows in the dry-run summary.
//
// Results:
//   - ""              when entry.Flow is not an upgrade flow;
//   - overrideVersion when it is non-empty (no lookup is performed);
//   - the version resolved by versionmatrix.ResolveUpgradeFromVersion;
//   - "???"           when that resolution fails. The failure is logged as a
//     warning and never propagated, because dry-run is best-effort.
func resolveUpgradeFromVersionQuiet(repoRoot string, entry Entry, overrideVersion string) string {
	switch {
	case !versionmatrix.IsUpgradeFlow(entry.Flow):
		return ""
	case overrideVersion != "":
		return overrideVersion
	}

	resolved, err := versionmatrix.ResolveUpgradeFromVersion(repoRoot, entry.Version, entry.Flow)
	if err == nil {
		return resolved
	}
	logging.Logger.Warn().Err(err).Str("version", entry.Version).Str("flow", entry.Flow).
		Msg("dry-run: could not resolve upgrade-from version")
	return "???"
}

// resolvePreUpgradeScriptQuiet reports the path of the pre-upgrade script
// declared on the entry's PreUpgrade hook, for display in the dry-run summary.
//
// It returns the empty string when the entry is not an upgrade flow, when no
// hook or no script name is declared on it, or when
// versionmatrix.HasPreSetupScript reports that the script is not present for
// the entry's version.
func resolvePreUpgradeScriptQuiet(repoRoot string, entry Entry) string {
	hook := entry.PreUpgrade
	if !versionmatrix.IsUpgradeFlow(entry.Flow) || hook == nil || hook.Script == "" {
		return ""
	}

	script := hook.Script
	if versionmatrix.HasPreSetupScript(repoRoot, entry.Version, script) {
		return versionmatrix.PreSetupScriptPath(repoRoot, entry.Version, script)
	}
	return ""
}

// resolveStep1ValuesFromQuiet names the previous app version whose values
// files feed Step 1 of an upgrade-minor flow (e.g., "8.7" for an "8.8" entry).
//
// Only the "upgrade-minor" flow has such a version; every other flow
// (including upgrade-patch, which uses the current version's values) yields
// the empty string. If the previous version cannot be determined, a warning
// is logged and the placeholder "???" is returned — dry-run is best-effort.
func resolveStep1ValuesFromQuiet(entry Entry) string {
	if entry.Flow != "upgrade-minor" {
		return ""
	}

	previous, err := versionmatrix.PreviousAppVersion(entry.Version)
	if err == nil {
		return previous
	}
	logging.Logger.Warn().Err(err).Str("version", entry.Version).
		Msg("dry-run: could not resolve previous app version for Step 1 values")
	return "???"
}

// effectiveImageTags reports whether SNAPSHOT tag overrides from env vars
// should be applied for an entry. The per-scenario flag (entryImageTags) is
// honored unless opts.DisableImageTags is set, which is a runtime override
// that always wins.
func effectiveImageTags(entryImageTags bool, opts RunOptions) bool {
	return entryImageTags && !opts.DisableImageTags
}

// chartRootOverlays returns the chart-root overlay names for an entry.
//
// "enterprise" is composable with any image source and is added whenever
// entry.Enterprise is set. The image-version source is then chosen from
// mutually exclusive options, first match wins:
//   - effectiveImageTags=true  → no overlay (SNAPSHOT path; caller supplies base-image-tags.yaml via --env-file)
//   - OCI (ChartRef set)       → no overlay; chart's values.yaml is authoritative
//   - UseLatest                → "latest" (values-latest.yaml, pinned release tags from local repo)
//   - default                  → "digest" (values-digest.yaml, SNAPSHOT pinned by sha256)
func chartRootOverlays(entry Entry, opts RunOptions) []string {
	var overlays []string
	if entry.Enterprise {
		overlays = append(overlays, "enterprise")
	}

	switch {
	case effectiveImageTags(entry.ImageTags, opts):
		// Image tags come from env vars; no image overlay.
	case opts.ChartRef != "":
		// OCI artifact: the chart ships its own image versions; no overlay needed.
	case opts.UseLatest:
		overlays = append(overlays, "latest")
	default:
		overlays = append(overlays, "digest")
	}
	return overlays
}

// resolveChartRootOverlaysQuiet narrows the overlays from chartRootOverlays
// down to those whose "values-<name>.yaml" file can be stat'ed under
// chartPath. It is a best-effort dry-run helper: overlays whose file is
// missing (or otherwise not stat-able) are dropped without any report, and
// an empty chartPath yields nil.
func resolveChartRootOverlaysQuiet(chartPath string, entry Entry, opts RunOptions) []string {
	if chartPath == "" {
		return nil
	}

	var present []string
	for _, overlay := range chartRootOverlays(entry, opts) {
		_, statErr := os.Stat(filepath.Join(chartPath, "values-"+overlay+".yaml"))
		if statErr != nil {
			continue
		}
		present = append(present, overlay)
	}
	return present
}

// formatDryRunOutput renders the dry-run plan as human-readable text: one
// section per version (in the order given by versions), one numbered block per
// entry within that version, and a closing footer with the totals.
func formatDryRunOutput(entries []dryRunEntry, versions []string, opts RunOptions) string {
	var out strings.Builder

	// Bucket the resolved entries by chart version.
	byVersion := make(map[string][]dryRunEntry)
	for _, d := range entries {
		ver := d.entry.Version
		byVersion[ver] = append(byVersion[ver], d)
	}

	// The upgrade target depends only on opts, so it is resolved once up front:
	// the local chart by default, or the chart ref (optionally "@version").
	upgradeTarget := "local chart"
	if opts.ChartRef != "" {
		upgradeTarget = opts.ChartRef
		if opts.ChartRefVersion != "" {
			upgradeTarget += "@" + opts.ChartRefVersion
		}
	}

	for vi, ver := range versions {
		bucket := byVersion[ver]
		if vi > 0 {
			out.WriteString("\n")
		}
		heading := fmt.Sprintf("=== Version %s (%d entries) ===", ver, len(bucket))
		fmt.Fprintf(&out, "%s\n", dryHead(heading))

		for n, d := range bucket {
			out.WriteString("\n")

			// Header line: number, scenario, shortname, flow, platform, infra-type, auth.
			label := dryKey(d.entry.Scenario)
			if d.entry.Shortname != "" {
				label += " " + dryDim("("+d.entry.Shortname+")")
			}
			fmt.Fprintf(&out, "  %s %s | %s | %s (%s) | %s\n",
				dryHead(fmt.Sprintf("[%d]", n+1)),
				label,
				dryOk(d.entry.Flow),
				dryOk(d.platform),
				dryOk(d.infraType),
				dryOk(d.entry.Auth))

			// Upgrade plan — only printed when a source version is known.
			// Upgrade-only flows expect that version to be running already;
			// all other upgrade flows install it first (Step 1) and then upgrade (Step 2).
			if d.upgradeFromVersion != "" {
				if d.upgradeOnly {
					fmt.Fprintf(&out, "      %s %s %s → %s %s\n",
						dryKey("upgrade:"),
						dryDim("upgrade-only (no install step), expects"),
						dryWarn(versionmatrix.DefaultHelmChartRef+"@"+d.upgradeFromVersion),
						dryDim("already running, upgrading to"),
						dryWarn(upgradeTarget))
				} else {
					fmt.Fprintf(&out, "      %s %s %s → %s %s\n",
						dryKey("upgrade:"),
						dryDim("Step 1: install"),
						dryWarn(versionmatrix.DefaultHelmChartRef+"@"+d.upgradeFromVersion),
						dryDim("Step 2: upgrade to"),
						dryWarn(upgradeTarget))
					// Step 1 may take its values files from another chart directory;
					// call that out so operators are not surprised.
					if d.step1ValuesFrom != "" {
						fmt.Fprintf(&out, "      %s %s %s\n",
							dryKey("step1-values:"),
							dryDim("from"),
							dryWarn("camunda-platform-"+d.step1ValuesFrom))
					}
				}
			}

			// Pre-upgrade script — runs between Step 1 and Step 2.
			if d.preUpgradeScript != "" {
				// Prefer a path relative to the repo root; fall back to the raw path.
				shown := d.preUpgradeScript
				if opts.RepoRoot != "" {
					if rel, err := filepath.Rel(opts.RepoRoot, d.preUpgradeScript); err == nil {
						shown = rel
					}
				}
				fmt.Fprintf(&out, "      %s %s\n",
					dryKey("pre-upgrade:"),
					dryWarn(shown))
			}

			// Chart-root overlays — listed only when at least one applies.
			if len(d.chartRootOverlays) > 0 {
				fmt.Fprintf(&out, "      %s %s\n",
					dryKey("overlays:"),
					dryWarn(strings.Join(d.chartRootOverlays, ", ")))
			}

			// Layers — the most important info.
			var featureList string
			if len(d.features) > 0 {
				featureList = dryWarn(strings.Join(d.features, ", "))
			} else {
				featureList = dryDim("-")
			}
			fmt.Fprintf(&out, "      %s %s + %s + %s  %s %s\n",
				dryKey("layers:"),
				dryVal(d.identity), dryVal(d.persistence), dryVal(d.platform),
				dryKey("features:"), featureList)

			// Namespace is always printed.
			fmt.Fprintf(&out, "      %s %s\n", dryKey("namespace:"), d.namespace)

			// Optional fields — each line appears only when its value is set.
			if d.kubeCtx != "" {
				fmt.Fprintf(&out, "      %s   %s\n", dryKey("context:"), d.kubeCtx)
			}
			if d.ingressHost != "" {
				fmt.Fprintf(&out, "      %s   %s\n", dryKey("ingress:"), d.ingressHost)
			}
			if d.envFile != "" {
				fmt.Fprintf(&out, "      %s   %s\n", dryKey("envFile:"), d.envFile)
			}
			if d.useVault {
				fmt.Fprintf(&out, "      %s     %s\n", dryKey("vault:"), dryWarn("true"))
			}
			if d.deleteNS {
				fmt.Fprintf(&out, "      %s %s\n", dryKey("delete-ns:"), dryWarn("true"))
			}
			if d.ensureDockerRegistry {
				fmt.Fprintf(&out, "      %s    %s\n", dryKey("docker:"), dryWarn("true"))
			}
			if d.ensureDockerHub {
				fmt.Fprintf(&out, "      %s %s\n", dryKey("dockerhub:"), dryWarn("true"))
			}
			if len(d.entry.Exclude) > 0 {
				fmt.Fprintf(&out, "      %s   %s\n", dryKey("exclude:"), dryWarn(strings.Join(d.entry.Exclude, ", ")))
			}

			// Resolved values files, one per line.
			if len(d.layerFiles) > 0 {
				fmt.Fprintf(&out, "      %s\n", dryKey("files:"))
				for _, file := range d.layerFiles {
					fmt.Fprintf(&out, "        %s %s\n", dryDim("-"), file)
				}
			}
		}
	}

	// Footer with totals.
	footer := fmt.Sprintf("--- %d entries across %d versions (dry-run, nothing deployed) ---", len(entries), len(versions))
	fmt.Fprintf(&out, "\n%s\n", dryHead(footer))

	return out.String()
}

// coverageEntry holds resolved layer information for one matrix entry in coverage mode.
// It is populated by coverageReport and consumed by formatCoverageOutput.
type coverageEntry struct {
	entry       Entry    // matrix entry this row was resolved from
	platform    string   // platform returned by resolvePlatform for the entry
	identity    string   // identity layer taken from the built deployment config
	persistence string   // persistence layer taken from the built deployment config
	infraType   string   // copied from entry.InfraType
	features    []string // feature layers taken from the built deployment config
	flow        string   // copied from entry.Flow; rendered as "install" when empty
}

// coverageReport resolves the deployment layers of every entry and prints a
// layer-breakdown table to stdout. Nothing is deployed. It returns one
// RunResult per entry: entries whose deployment config cannot be built carry
// the wrapped build error and are left out of the table; all others carry
// their resolved namespace.
func coverageReport(entries []Entry, opts RunOptions) []RunResult {
	versions := VersionOrder(entries)
	byVersion := GroupByVersion(entries)

	var (
		results []RunResult
		rows    []coverageEntry
	)
	for _, ver := range versions {
		for _, entry := range byVersion[ver] {
			platform := resolvePlatform(opts, entry)

			// Resolve deployment layers via the canonical builder.
			overrides := scenarios.BuilderOverrides{
				Identity:    entry.Identity,
				Persistence: entry.Persistence,
				Platform:    platform,
				Features:    entry.Features,
				InfraType:   entry.InfraType,
				Flow:        entry.Flow,
				QA:          entry.QA || opts.UseQA,
				ImageTags:   effectiveImageTags(entry.ImageTags, opts),
				Upgrade:     entry.Upgrade,
			}
			cfg, err := scenarios.BuildDeploymentConfig(entry.Scenario, overrides)
			if err != nil {
				// Record the failure and keep going with the remaining entries.
				results = append(results, RunResult{
					Entry: entry,
					Error: fmt.Errorf("deployment config validation failed: %w", err),
				})
				continue
			}

			rows = append(rows, coverageEntry{
				entry:       entry,
				platform:    platform,
				identity:    cfg.Identity,
				persistence: cfg.Persistence,
				infraType:   entry.InfraType,
				features:    cfg.Features,
				flow:        entry.Flow,
			})
			results = append(results, RunResult{
				Entry:     entry,
				Namespace: resolveNamespace(opts, entry),
			})
		}
	}

	fmt.Fprintln(os.Stdout, formatCoverageOutput(rows, versions))
	return results
}

// formatCoverageOutput renders the coverage report: a fixed-width table with one
// row per entry (grouped in the order given by versions), an enabled/disabled
// summary line, and a list of the distinct values seen for each layer dimension.
// Columns: VER | SCENARIO | ENABLED | FLOW | PLATFORM | INFRA-TYPE | IDENTITY | PERSISTENCE | FEATURES
func formatCoverageOutput(entries []coverageEntry, versions []string) string {
	var out strings.Builder

	// orDefault substitutes a placeholder for an empty cell.
	orDefault := func(value, placeholder string) string {
		if value == "" {
			return placeholder
		}
		return value
	}

	// Bucket rows by version so the table follows the order of versions.
	byVersion := make(map[string][]coverageEntry)
	for _, row := range entries {
		ver := row.entry.Version
		byVersion[ver] = append(byVersion[ver], row)
	}

	// Table header — pad text first, then apply style (ANSI codes break %-Ns padding).
	fmt.Fprintf(&out, "%s\n\n", dryHead("=== Coverage: Layer Breakdown ==="))
	fmt.Fprintf(&out, "%s %s %s %s %s %s %s %s %s\n",
		dryHead(fmt.Sprintf("%-6s", "VER")),
		dryHead(fmt.Sprintf("%-25s", "SCENARIO")),
		dryHead(fmt.Sprintf("%-8s", "ENABLED")),
		dryHead(fmt.Sprintf("%-16s", "FLOW")),
		dryHead(fmt.Sprintf("%-10s", "PLATFORM")),
		dryHead(fmt.Sprintf("%-14s", "INFRA-TYPE")),
		dryHead(fmt.Sprintf("%-20s", "IDENTITY")),
		dryHead(fmt.Sprintf("%-22s", "PERSISTENCE")),
		dryHead("FEATURES"))
	fmt.Fprintf(&out, "%-6s %-25s %-8s %-16s %-10s %-14s %-20s %-22s %s\n",
		"---", "--------", "-------", "----", "--------", "----------", "--------", "-----------", "--------")

	for _, ver := range versions {
		for _, row := range byVersion[ver] {
			// Pad the enabled text before colouring it so the column width stays consistent.
			var enabledCell string
			if row.entry.Enabled {
				enabledCell = dryOk(fmt.Sprintf("%-8s", "yes"))
			} else {
				enabledCell = dryWarn(fmt.Sprintf("%-8s", "no"))
			}

			fmt.Fprintf(&out, "%-6s %-25s %s %-16s %-10s %-14s %-20s %-22s %s\n",
				row.entry.Version,
				row.entry.Scenario,
				enabledCell,
				orDefault(row.flow, "install"),
				orDefault(row.platform, "-"),
				orDefault(row.infraType, "-"),
				orDefault(row.identity, "(derived)"),
				orDefault(row.persistence, "(derived)"),
				orDefault(strings.Join(row.features, ","), "-"))
		}
	}

	// Summary: counts cover every entry passed in.
	enabledCount := 0
	for _, row := range entries {
		if row.entry.Enabled {
			enabledCount++
		}
	}
	disabledCount := len(entries) - enabledCount
	fmt.Fprintf(&out, "\n%s\n",
		dryHead(fmt.Sprintf("--- %d entries (%d enabled, %d disabled) across %d versions ---",
			len(entries), enabledCount, disabledCount, len(versions))))

	// Layer summary: unique values per dimension, in first-seen order.
	// Each row carries its own label and gap so the emitted spacing is fixed per line.
	layerRows := []struct {
		label, gap string
		values     []string
	}{
		{"identities: ", "  ", uniqueStrings(entries, func(e coverageEntry) string { return e.identity })},
		{"persistence:", "  ", uniqueStrings(entries, func(e coverageEntry) string { return e.persistence })},
		{"platforms:  ", "   ", uniqueStrings(entries, func(e coverageEntry) string { return e.platform })},
		{"infra-types:", "  ", uniqueStrings(entries, func(e coverageEntry) string { return e.infraType })},
		{"features:  ", "    ", uniqueFeatures(entries)},
		{"flows:  ", "       ", uniqueStrings(entries, func(e coverageEntry) string { return e.flow })},
	}

	fmt.Fprintf(&out, "\n%s\n", dryHead("Layer Coverage:"))
	for _, lr := range layerRows {
		fmt.Fprintf(&out, "  %s%s%s\n", dryKey(lr.label), lr.gap, strings.Join(lr.values, ", "))
	}

	return out.String()
}

// uniqueStrings applies extract to every entry and returns the distinct
// non-empty results in the order they were first encountered. The result is
// nil when no entry yields a non-empty value.
func uniqueStrings(entries []coverageEntry, extract func(coverageEntry) string) []string {
	var ordered []string
	known := make(map[string]struct{})
	for i := range entries {
		value := extract(entries[i])
		if value == "" {
			continue
		}
		if _, dup := known[value]; dup {
			continue
		}
		known[value] = struct{}{}
		ordered = append(ordered, value)
	}
	return ordered
}

// uniqueFeatures collects the distinct feature names across all entries in
// first-seen order. When no entry has any feature it returns the single
// placeholder "-" instead of an empty slice.
func uniqueFeatures(entries []coverageEntry) []string {
	known := make(map[string]struct{})
	var ordered []string
	for i := range entries {
		for _, name := range entries[i].features {
			if _, dup := known[name]; dup {
				continue
			}
			known[name] = struct{}{}
			ordered = append(ordered, name)
		}
	}
	if len(ordered) > 0 {
		return ordered
	}
	return []string{"-"}
}

// runSequential executes the entries one after another, version by version in
// the order given by VersionOrder. Every entry's result is collected. A failed
// entry is logged (with the short Helm command when the error is a
// deployer.HelmError) and, if opts.StopOnFailure is set, ends the run
// immediately with the results gathered so far and a wrapped error.
func runSequential(ctx context.Context, entries []Entry, opts RunOptions) ([]RunResult, error) {
	versions := VersionOrder(entries)
	byVersion := GroupByVersion(entries)

	var results []RunResult
	for _, ver := range versions {
		batch := byVersion[ver]

		logging.Logger.Info().
			Str("version", ver).
			Int("entries", len(batch)).
			Msg("Processing version")

		for _, entry := range batch {
			// Exactly one result is appended per entry, so the number of
			// results collected so far is the entry's global index.
			res := executeEntry(ctx, entry, opts, len(results))
			results = append(results, res)

			if res.Error == nil {
				logging.Logger.Info().
					Str("version", entry.Version).
					Str("scenario", entry.Scenario).
					Str("flow", entry.Flow).
					Msg("Matrix entry completed successfully")
				continue
			}

			failure := logging.Logger.Error().
				Err(res.Error).
				Str("version", entry.Version).
				Str("scenario", entry.Scenario).
				Str("flow", entry.Flow)
			var helmErr *deployer.HelmError
			if errors.As(res.Error, &helmErr) {
				failure = failure.Str("command", helmErr.ShortCommand())
			}
			failure.Msg("Matrix entry failed")

			if opts.StopOnFailure {
				return results, fmt.Errorf("stopping on failure: %w", res.Error)
			}
		}
	}

	return results, nil
}

// runParallel processes entries concurrently with a bounded semaphore.
// Results are collected in entry order. If StopOnFailure is set, the context
// is cancelled on the first failure, which prevents new entries from starting
// and signals in-flight deploy.Execute() calls to abort.
func runParallel(ctx context.Context, entries []Entry, opts RunOptions) ([]RunResult, error) {
	// Pre-allocate results slice so each goroutine writes to its own index (no mutex needed for slots).
	results := make([]RunResult, len(entries))

	// Use a cancellable context for stop-on-failure.
	runCtx, cancel := context.WithCancel(ctx)
	defer cancel()

	// Semaphore to limit concurrency.
	sem := make(chan struct{}, opts.MaxParallel)

	var wg sync.WaitGroup

	// Track first failure for stop-on-failure.
	var (
		firstErr error
		errOnce  sync.Once
	)

	for i, entry := range entries {
		// Check if context is already cancelled (stop-on-failure triggered).
		if runCtx.Err() != nil {
			break
		}

		wg.Add(1)
		sem <- struct{}{} // Acquire semaphore slot.

		go func(idx int, e Entry) {
			defer wg.Done()
			defer func() { <-sem }() // Release semaphore slot.

			// Check again after acquiring semaphore slot.
			if runCtx.Err() != nil {
				results[idx] = RunResult{
					Entry:     e,
					Namespace: resolveNamespace(opts, e),
					Error:     fmt.Errorf("skipped: run cancelled"),
				}
				if opts.OnEntryComplete != nil {