Changes from all commits (22 commits)
71a4102 Add initial design spec for type-safe DAG execution library (Mar 20, 2026)
9efee3e Rename taskflow to tasks, use named Deps structs throughout (Mar 20, 2026)
1e5f8bf tasks: add core types — Task interface, Config, DAGError, TaskStatus (Mar 20, 2026)
39b21c3 tasks: add graph discovery via reflection (Mar 20, 2026)
b493f46 tasks: add cycle detection via topological sort (Mar 20, 2026)
b3a2857 tasks: add concurrent scheduler with error strategies (Mar 20, 2026)
2e5d06e tasks: add integration tests and clean up vet warnings (Mar 20, 2026)
0c3f1d4 tasks: simplify scheduler and address review findings (Mar 20, 2026)
6ce7e7a refactor(e2e): replace sequential prepareCluster with concurrent DAG (Mar 20, 2026)
da1d4b2 refactor(e2e): simplify prepareCluster, remove clusterSetup struct (Mar 20, 2026)
4f0e867 refactor(e2e): wire cluster as DAG task, pass functions directly (Mar 20, 2026)
c58c727 refactor(e2e): eliminate all anonymous functions from prepareCluster (Mar 20, 2026)
c9d0792 refactor(e2e): eliminate wrapper helpers, absorb args into functions (Mar 20, 2026)
6efb867 refactor(e2e): use bind helpers, eliminate newClusterTask (Mar 20, 2026)
d75c164 polish: improve naming, comments, and error wrapping in dag/tasks pac… (Mar 21, 2026)
5fea3e4 refactor(e2e): remove unused tasks package in favor of dag package (Mar 21, 2026)
4ad379a chore: remove design spec docs, revert config.go changes (Mar 21, 2026)
388a96a refactor(e2e): remove bind/bindRun helpers, use inline closures (Mar 21, 2026)
8718b71 docs(e2e): add note to keep prepareCluster minimal (Mar 21, 2026)
31cd747 fix: address PR review comments on dag package and cluster.go (Mar 21, 2026)
8ed5067 test(dag): expand coverage to 95.1% with 37 tests (Mar 22, 2026)
8dbaa70 dag: recover panics in task goroutines and improve docs (Mar 22, 2026)
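
The commit messages describe the dag package only at a high level. As orientation for the diff below, here is a hypothetical reconstruction of the surface its call sites imply (NewGroup, Go, Go1, Run, Dep, MustGet, Wait). All names and signatures are inferred from usage, not taken from e2e/dag, and the reflection-based discovery, cycle detection, error strategies, and panic recovery named in the commits are deliberately left out:

```go
// Package dag — a hypothetical sketch of the API implied by the call sites
// in the cluster.go diff below. Not the real e2e/dag implementation.
package dag

import (
	"context"
	"sync"
)

// Dep is an opaque handle that later tasks can use for explicit ordering.
type Dep interface{ await() error }

// Result is a future for a value-producing task.
type Result[T any] struct {
	val  T
	err  error
	done chan struct{}
}

func (r *Result[T]) await() error { <-r.done; return r.err }

// MustGet returns the task's value; call it only after Group.Wait succeeds.
func (r *Result[T]) MustGet() T {
	<-r.done
	if r.err != nil {
		panic(r.err)
	}
	return r.val
}

// Group runs tasks concurrently and aggregates their errors.
type Group struct {
	ctx  context.Context
	wg   sync.WaitGroup
	mu   sync.Mutex
	errs []error
}

func NewGroup(ctx context.Context) *Group { return &Group{ctx: ctx} }

// Go schedules a value-producing task with no explicit dependencies.
func Go[T any](g *Group, fn func(context.Context) (T, error)) *Result[T] {
	r := &Result[T]{done: make(chan struct{})}
	g.wg.Add(1)
	go func() {
		defer g.wg.Done()
		defer close(r.done)
		r.val, r.err = fn(g.ctx)
		if r.err != nil {
			g.mu.Lock()
			g.errs = append(g.errs, r.err)
			g.mu.Unlock()
		}
	}()
	return r
}

// Go1 waits for one upstream result before feeding it to fn. Run1/Run2 in
// the diff presumably follow the same shape for effect-only tasks with one
// or two upstream inputs, plus variadic Dep arguments for extra ordering.
func Go1[A, T any](g *Group, a *Result[A], fn func(context.Context, A) (T, error)) *Result[T] {
	return Go(g, func(ctx context.Context) (T, error) {
		var zero T
		if err := a.await(); err != nil {
			return zero, err // upstream failed; skip this task
		}
		return fn(ctx, a.val)
	})
}

// Run schedules an effect-only task, gated on deps, and returns its handle.
func Run(g *Group, fn func(context.Context) error, deps ...Dep) Dep {
	return Go(g, func(ctx context.Context) (struct{}, error) {
		for _, d := range deps {
			if err := d.await(); err != nil {
				return struct{}{}, err
			}
		}
		return struct{}{}, fn(ctx)
	})
}

// Wait blocks until all tasks finish and returns the first recorded error.
func (g *Group) Wait() error {
	g.wg.Wait()
	if len(g.errs) > 0 {
		return g.errs[0]
	}
	return nil
}
```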
e2e/aks_model.go (8 changes: 5 additions & 3 deletions)
@@ -299,8 +299,8 @@ func getFirewall(ctx context.Context, location, firewallSubnetID, publicIPID str

func addFirewallRules(
ctx context.Context, clusterModel *armcontainerservice.ManagedCluster,
location string,
) error {
location := *clusterModel.Location
defer toolkit.LogStepCtx(ctx, "adding firewall rules")()
routeTableName := "abe2e-fw-rt"
rtGetResp, err := config.Azure.RouteTables.Get(
@@ -486,10 +486,11 @@ func addFirewallRules(
return nil
}

func addPrivateAzureContainerRegistry(ctx context.Context, cluster *armcontainerservice.ManagedCluster, kube *Kubeclient, resourceGroupName string, kubeletIdentity *armcontainerservice.UserAssignedIdentity, isNonAnonymousPull bool) error {
func addPrivateAzureContainerRegistry(ctx context.Context, cluster *armcontainerservice.ManagedCluster, kube *Kubeclient, kubeletIdentity *armcontainerservice.UserAssignedIdentity, isNonAnonymousPull bool) error {
if cluster == nil || kube == nil || kubeletIdentity == nil {
return errors.New("cluster, kubeclient, and kubeletIdentity cannot be nil when adding Private Azure Container Registry")
}
resourceGroupName := config.ResourceGroupName(*cluster.Location)
if err := createPrivateAzureContainerRegistry(ctx, cluster, resourceGroupName, isNonAnonymousPull); err != nil {
return fmt.Errorf("failed to create private acr: %w", err)
}
@@ -514,7 +515,8 @@ func addPrivateAzureContainerRegistry(ctx context.Context, cluster *armcontainer
return nil
}

func addNetworkIsolatedSettings(ctx context.Context, clusterModel *armcontainerservice.ManagedCluster, location string) error {
func addNetworkIsolatedSettings(ctx context.Context, clusterModel *armcontainerservice.ManagedCluster) error {
location := *clusterModel.Location
defer toolkit.LogStepCtx(ctx, fmt.Sprintf("Adding network settings for network isolated cluster %s in rg %s", *clusterModel.Name, *clusterModel.Properties.NodeResourceGroup))

vnet, err := getClusterVNet(ctx, *clusterModel.Properties.NodeResourceGroup)
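
The signature changes in this file share one pattern: parameters that are derivable from the cluster model (location, resource group name) are dropped and recomputed inside the callee. A hypothetical before/after illustrating the trade, with Cluster and resourceGroupFor standing in for armcontainerservice.ManagedCluster and config.ResourceGroupName:

```go
package main

import "fmt"

// Stand-ins for the real armcontainerservice model and config helper.
type Cluster struct{ Location string }

func resourceGroupFor(location string) string { return "abe2e-" + location }

// Before: every caller threads location (and often the resource group)
// through the call chain alongside the model that already carries it.
func addRulesBefore(c *Cluster, location, resourceGroup string) error {
	fmt.Printf("adding rules in %s/%s\n", resourceGroup, location)
	return nil
}

// After: the callee derives both values from the model, shrinking call
// sites and removing any chance of a location that disagrees with it.
func addRulesAfter(c *Cluster) error {
	location := c.Location
	resourceGroup := resourceGroupFor(location)
	fmt.Printf("adding rules in %s/%s\n", resourceGroup, location)
	return nil
}

func main() {
	c := &Cluster{Location: "eastus"}
	_ = addRulesBefore(c, c.Location, resourceGroupFor(c.Location))
	_ = addRulesAfter(c)
}
```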
e2e/cluster.go (134 changes: 66 additions & 68 deletions)
@@ -13,6 +13,7 @@ import (
"time"

"github.com/Azure/agentbaker/e2e/config"
"github.com/Azure/agentbaker/e2e/dag"
"github.com/Azure/agentbaker/e2e/toolkit"
"github.com/Azure/azure-sdk-for-go/sdk/azcore"
"github.com/Azure/azure-sdk-for-go/sdk/azcore/to"
@@ -59,89 +60,82 @@ func (c *Cluster) MaxPodsPerNode() (int, error) {
return 0, fmt.Errorf("cluster agentpool profiles were nil or empty: %+v", c.Model)
}

func prepareCluster(ctx context.Context, cluster *armcontainerservice.ManagedCluster, isNetworkIsolated, attachPrivateAcr bool) (*Cluster, error) {
// prepareCluster runs all cluster preparation steps as a concurrent DAG.
// This function contains complex concurrent orchestration — keep it as
// minimal as possible and push all non-trivial logic into the individual
// task functions it calls.
func prepareCluster(ctx context.Context, clusterModel *armcontainerservice.ManagedCluster, isNetworkIsolated, attachPrivateAcr bool) (*Cluster, error) {
defer toolkit.LogStepCtx(ctx, "preparing cluster")()
ctx, cancel := context.WithTimeout(ctx, config.Config.TestTimeoutCluster)
defer cancel()
cluster.Name = to.Ptr(fmt.Sprintf("%s-%s", *cluster.Name, hash(cluster)))
cluster, err := getOrCreateCluster(ctx, cluster)
if err != nil {
return nil, fmt.Errorf("get or create cluster: %w", err)
}

bastion, err := getOrCreateBastion(ctx, cluster)
if err != nil {
return nil, fmt.Errorf("get or create bastion: %w", err)
}

_, err = getOrCreateMaintenanceConfiguration(ctx, cluster)
if err != nil {
return nil, fmt.Errorf("get or create maintenance configuration: %w", err)
}

subnetID, err := getClusterSubnetID(ctx, *cluster.Properties.NodeResourceGroup)
if err != nil {
return nil, fmt.Errorf("get cluster subnet: %w", err)
}

resourceGroupName := config.ResourceGroupName(*cluster.Location)
clusterModel.Name = to.Ptr(fmt.Sprintf("%s-%s", *clusterModel.Name, hash(clusterModel)))

kube, err := getClusterKubeClient(ctx, resourceGroupName, *cluster.Name)
cluster, err := getOrCreateCluster(ctx, clusterModel)
if err != nil {
return nil, fmt.Errorf("get kube client using cluster %q: %w", *cluster.Name, err)
return nil, fmt.Errorf("get or create cluster: %w", err)
}

kubeletIdentity, err := getClusterKubeletIdentity(cluster)
if err != nil {
return nil, fmt.Errorf("getting cluster kubelet identity: %w", err)
}
g := dag.NewGroup(ctx)

if isNetworkIsolated || attachPrivateAcr {
// private acr must be created before we add the debug daemonsets
if err := addPrivateAzureContainerRegistry(ctx, cluster, kube, resourceGroupName, kubeletIdentity, true); err != nil {
return nil, fmt.Errorf("add private azure container registry (true): %w", err)
}
if err := addPrivateAzureContainerRegistry(ctx, cluster, kube, resourceGroupName, kubeletIdentity, false); err != nil {
return nil, fmt.Errorf("add private azure container registry (false): %w", err)
}
bastion := dag.Go(g, func(ctx context.Context) (*Bastion, error) {
return getOrCreateBastion(ctx, cluster)
})
dag.Run(g, func(ctx context.Context) error { return ensureMaintenanceConfiguration(ctx, cluster) })
subnet := dag.Go(g, func(ctx context.Context) (string, error) { return getClusterSubnetID(ctx, cluster) })
kube := dag.Go(g, func(ctx context.Context) (*Kubeclient, error) { return getClusterKubeClient(ctx, cluster) })
identity := dag.Go(g, func(ctx context.Context) (*armcontainerservice.UserAssignedIdentity, error) {
return getClusterKubeletIdentity(ctx, cluster)
})
dag.Run(g, func(ctx context.Context) error { return collectGarbageVMSS(ctx, cluster) })
var networkDeps []dag.Dep
if !isNetworkIsolated {
networkDeps = append(networkDeps, dag.Run(g, func(ctx context.Context) error { return addFirewallRules(ctx, cluster) }))
}
if isNetworkIsolated {
if err := addNetworkIsolatedSettings(ctx, cluster, *cluster.Location); err != nil {
return nil, fmt.Errorf("add network isolated settings: %w", err)
}
}
if !isNetworkIsolated { // network isolated cluster blocks all egress via NSG
if err := addFirewallRules(ctx, cluster, *cluster.Location); err != nil {
return nil, fmt.Errorf("add firewall rules: %w", err)
}
networkDeps = append(networkDeps, dag.Run(g, func(ctx context.Context) error { return addNetworkIsolatedSettings(ctx, cluster) }))
}
needACR := isNetworkIsolated || attachPrivateAcr
acrNonAnon := dag.Run2(g, kube, identity, addACR(cluster, needACR, true))
acrAnon := dag.Run2(g, kube, identity, addACR(cluster, needACR, false))
dag.Run1(g, kube, ensureDebugDaemonsets(cluster, isNetworkIsolated), append([]dag.Dep{acrNonAnon, acrAnon}, networkDeps...)...)
extract := dag.Go1(g, kube, extractClusterParams(cluster))
Comment on lines +92 to +102 (Copilot AI, Mar 21, 2026):

prepareCluster no longer enforces the previous ordering where network changes (firewall rules / network-isolated subnet+NSG updates) completed before EnsureDebugDaemonsets runs. Because the daemonset creation triggers pod scheduling/image pulls, running it concurrently with route table/NSG updates can introduce e2e flakiness. Consider capturing the firewall/NSG task(s) as Effect values and adding them as explicit dependencies to the Run1(... ensureDebugDaemonsets ...) task (and possibly any other k8s API tasks that assume steady node egress).
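
For reference, the ordering the comment asks for falls out of keeping the Dep handles rather than discarding them, which the revision shown above already appears to do by appending the firewall/NSG tasks to networkDeps and passing them to the ensureDebugDaemonsets task. A fragment sketching the pattern in the context of prepareCluster, against the hypothetical API sketched earlier:

```go
// Keep the firewall task's handle instead of discarding it...
fw := dag.Run(g, func(ctx context.Context) error { return addFirewallRules(ctx, cluster) })

// ...and gate daemonset creation on it (plus the ACR tasks), so pod
// scheduling and image pulls only start once node egress is steady.
dag.Run1(g, kube, ensureDebugDaemonsets(cluster, isNetworkIsolated), acrNonAnon, acrAnon, fw)
```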

if err := kube.EnsureDebugDaemonsets(ctx, isNetworkIsolated, config.GetPrivateACRName(true, *cluster.Location)); err != nil {
return nil, fmt.Errorf("ensure debug daemonsets for %q: %w", *cluster.Name, err)
if err := g.Wait(); err != nil {
return nil, fmt.Errorf("prepare cluster tasks: %w", err)
}
return &Cluster{
Model: cluster,
Kube: kube.MustGet(),
KubeletIdentity: identity.MustGet(),
SubnetID: subnet.MustGet(),
ClusterParams: extract.MustGet(),
Bastion: bastion.MustGet(),
}, nil
}

// sometimes tests can be interrupted and vmss are left behind
// don't waste resource and delete them
if err := collectGarbageVMSS(ctx, cluster); err != nil {
return nil, fmt.Errorf("collect garbage vmss: %w", err)
func addACR(cluster *armcontainerservice.ManagedCluster, needACR, isNonAnonymousPull bool) func(context.Context, *Kubeclient, *armcontainerservice.UserAssignedIdentity) error {
return func(ctx context.Context, k *Kubeclient, id *armcontainerservice.UserAssignedIdentity) error {
if !needACR {
return nil
}
return addPrivateAzureContainerRegistry(ctx, cluster, k, id, isNonAnonymousPull)
}
}

clusterParams, err := extractClusterParameters(ctx, kube, cluster)
if err != nil {
return nil, fmt.Errorf("extracting cluster parameters: %w", err)
func ensureDebugDaemonsets(cluster *armcontainerservice.ManagedCluster, isNetworkIsolated bool) func(context.Context, *Kubeclient) error {
return func(ctx context.Context, k *Kubeclient) error {
return k.EnsureDebugDaemonsets(ctx, isNetworkIsolated, config.GetPrivateACRName(true, *cluster.Location))
}
}

return &Cluster{
Model: cluster,
Kube: kube,
KubeletIdentity: kubeletIdentity,
SubnetID: subnetID,
ClusterParams: clusterParams,
Bastion: bastion,
}, nil
func extractClusterParams(cluster *armcontainerservice.ManagedCluster) func(context.Context, *Kubeclient) (*ClusterParams, error) {
return func(ctx context.Context, k *Kubeclient) (*ClusterParams, error) {
return extractClusterParameters(ctx, cluster, k)
}
}

func getClusterKubeletIdentity(cluster *armcontainerservice.ManagedCluster) (*armcontainerservice.UserAssignedIdentity, error) {
func getClusterKubeletIdentity(ctx context.Context, cluster *armcontainerservice.ManagedCluster) (*armcontainerservice.UserAssignedIdentity, error) {
if cluster == nil || cluster.Properties == nil || cluster.Properties.IdentityProfile == nil {
return nil, fmt.Errorf("cannot dereference cluster identity profile to extract kubelet identity ID")
}
@@ -152,7 +146,7 @@ func getClusterKubeletIdentity(cluster *armcontainerservice.ManagedCluster) (*ar
return kubeletIdentity, nil
Comment on lines 138 to 146 (Copilot AI, Mar 22, 2026):

getClusterKubeletIdentity takes a ctx parameter but does not use it, which will fail compilation in Go (unused parameter). Either remove the ctx parameter from the signature or use it (e.g., for logging/telemetry) consistently across callers.
}

func extractClusterParameters(ctx context.Context, kube *Kubeclient, cluster *armcontainerservice.ManagedCluster) (*ClusterParams, error) {
func extractClusterParameters(ctx context.Context, cluster *armcontainerservice.ManagedCluster, kube *Kubeclient) (*ClusterParams, error) {
kubeconfig, err := clientcmd.Load(kube.KubeConfig)
if err != nil {
return nil, fmt.Errorf("loading cluster kubeconfig: %w", err)
@@ -423,16 +417,20 @@ func createNewAKSClusterWithRetry(ctx context.Context, cluster *armcontainerserv
return nil, fmt.Errorf("failed to create cluster after %d attempts due to persistent 409 Conflict: %w", maxRetries, lastErr)
}

func getOrCreateMaintenanceConfiguration(ctx context.Context, cluster *armcontainerservice.ManagedCluster) (*armcontainerservice.MaintenanceConfiguration, error) {
existingMaintenance, err := config.Azure.Maintenance.Get(ctx, config.ResourceGroupName(*cluster.Location), *cluster.Name, "default", nil)
func ensureMaintenanceConfiguration(ctx context.Context, cluster *armcontainerservice.ManagedCluster) error {
_, err := config.Azure.Maintenance.Get(ctx, config.ResourceGroupName(*cluster.Location), *cluster.Name, "default", nil)
var azErr *azcore.ResponseError
if errors.As(err, &azErr) && azErr.StatusCode == 404 {
return createNewMaintenanceConfiguration(ctx, cluster)
_, err = createNewMaintenanceConfiguration(ctx, cluster)
if err != nil {
return fmt.Errorf("creating maintenance configuration for cluster %q: %w", *cluster.Name, err)
}
return nil
}
if err != nil {
return nil, fmt.Errorf("failed to get maintenance configuration 'default' for cluster %q: %w", *cluster.Name, err)
return fmt.Errorf("failed to get maintenance configuration 'default' for cluster %q: %w", *cluster.Name, err)
}
return &existingMaintenance.MaintenanceConfiguration, nil
return nil
}

func createNewMaintenanceConfiguration(ctx context.Context, cluster *armcontainerservice.ManagedCluster) (*armcontainerservice.MaintenanceConfiguration, error) {
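
The change from getOrCreateMaintenanceConfiguration to ensureMaintenanceConfiguration above also shows the get-or-create idiom the suite leans on: only an HTTP 404 from the GET triggers creation, and every other error surfaces unchanged. A self-contained sketch of that idiom, assuming only the public azcore.ResponseError type; the get/create callbacks are placeholders:

```go
package e2e

import (
	"context"
	"errors"
	"fmt"
	"net/http"

	"github.com/Azure/azure-sdk-for-go/sdk/azcore"
)

// ensureResource fetches a resource and creates it only when the GET comes
// back 404; any other error is surfaced unchanged via wrapping.
func ensureResource(ctx context.Context,
	get func(context.Context) error,
	create func(context.Context) error,
) error {
	err := get(ctx)
	var azErr *azcore.ResponseError
	if errors.As(err, &azErr) && azErr.StatusCode == http.StatusNotFound {
		if err := create(ctx); err != nil {
			return fmt.Errorf("creating resource: %w", err)
		}
		return nil
	}
	if err != nil {
		return fmt.Errorf("getting resource: %w", err)
	}
	return nil
}
```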