fix: use one big vnet and attach AKS clusters to it to avoid creating bastion multiple times#8646
Open
awesomenix wants to merge 1 commit into
Open
fix: use one big vnet and attach AKS clusters to it to avoid creating bastion multiple times#8646awesomenix wants to merge 1 commit into
awesomenix wants to merge 1 commit into
Conversation
Contributor
awesomenix
commented
Jun 5, 2026
- Make everyone's life simple
- README has all the details, but idea is to use shared VNET and stop doing unnecessary activities multiple times.
Contributor
There was a problem hiding this comment.
Pull request overview
This PR refactors the E2E harness to use a single shared per-location VNet and Bastion (in abe2e-{location}) and attaches all AKS clusters to per-cluster subnets inside that VNet, reducing repeated Bastion provisioning and centralizing shared infrastructure creation.
Changes:
- Add shared-infrastructure provisioning (
shared_infra.go) to create/ensure a shared VNet, Bastion, Firewall, and managed identity, plus auto-allocation of per-cluster subnets. - Update cluster creation and networking helpers to rely on
VnetSubnetID(BYO VNet/subnet) rather than discovering VNets in the MC_ resource group. - Update SSH-over-Bastion flow and documentation to reflect shared Bastion/VNet usage.
Reviewed changes
Copilot reviewed 8 out of 8 changed files in this pull request and generated 6 comments.
Show a summary per file
| File | Description |
|---|---|
| e2e/vmss.go | Switch Bastion SSH command to use shared Bastion name + shared RG. |
| e2e/shared_infra.go | Introduce shared VNet/Bastion/Firewall/identity creation and per-cluster subnet allocation/cleanup. |
| e2e/README.md | Document the new shared-infra architecture and updated scenario authoring pointers. |
| e2e/node_config.go | Adjust tenant ID derivation to handle user-assigned identity clusters. |
| e2e/kube.go | Read cluster subnet ID from agent pool VnetSubnetID instead of listing VNets. |
| e2e/cluster.go | Create per-cluster subnet after name hashing; use shared Bastion; derive VNet from subnet ID. |
| e2e/cache.go | Ensure all cached cluster creators configure the shared VNet before preparing clusters. |
| e2e/aks_model.go | Avoid hardcoded subnet CIDRs; use parsed VNet/subnet info when configuring firewall/isolated settings. |
Comment on lines
475
to
479
| subnetAddressPrefix := vnet.addressPrefix | ||
|
|
||
| subnetParameters := armnetwork.Subnet{ | ||
| ID: to.Ptr(subnetId), | ||
| Properties: &armnetwork.SubnetPropertiesFormat{ |
Comment on lines
+126
to
+134
| // Skip subnets with associated route tables (cluster may still exist) | ||
| if subnet.Properties != nil && subnet.Properties.RouteTable != nil { | ||
| // Check if the cluster still exists by extracting cluster name | ||
| clusterName := strings.TrimPrefix(name, "aks-subnet-") | ||
| _, err := config.Azure.AKS.Get(ctx, rg, clusterName, nil) | ||
| if err == nil { | ||
| continue // cluster still exists | ||
| } | ||
| } |
Comment on lines
+310
to
+312
| // ensureClusterIdentity creates a user-assigned managed identity for AKS clusters | ||
| // and grants it Network Contributor on the subscription so it can manage route tables | ||
| // in both the shared VNet and the MC_ resource groups. |
Comment on lines
+56
to
+59
| firewallIP, err := ensureSharedFirewall(ctx, rg, location) | ||
| if err != nil { | ||
| return nil, fmt.Errorf("ensuring shared firewall: %w", err) | ||
| } |
Comment on lines
+139
to
+160
| nbc.TenantID = clusterTenantID(cluster.Model) | ||
| nbc.ContainerService.Properties.CertificateProfile.CaCertificate = string(cluster.ClusterParams.CACert) | ||
| nbc.ContainerService.Properties.HostedMasterProfile.FQDN = cluster.ClusterParams.FQDN | ||
| nbc.ContainerService.Properties.AgentPoolProfiles[0].Distro = vhd.Distro | ||
| nbc.AgentPoolProfile.Distro = vhd.Distro | ||
| return nbc, nil | ||
| } | ||
|
|
||
| // clusterTenantID extracts the tenant ID from the cluster's identity. | ||
| // For system-assigned identity, TenantID is set directly on the Identity struct. | ||
| // For user-assigned identity, we look it up from the managed identity resource. | ||
| func clusterTenantID(cluster *armcontainerservice.ManagedCluster) string { | ||
| if cluster.Identity != nil && cluster.Identity.TenantID != nil { | ||
| return *cluster.Identity.TenantID | ||
| } | ||
| // For user-assigned identity, get tenant from the shared cluster identity | ||
| rg := config.ResourceGroupName(*cluster.Location) | ||
| identity, err := config.Azure.UserAssignedIdentities.Get(context.Background(), rg, SharedClusterIdentity, nil) | ||
| if err == nil && identity.Properties != nil && identity.Properties.TenantID != nil { | ||
| return *identity.Properties.TenantID | ||
| } | ||
| return "" |
Comment on lines
+90
to
+96
| | Resource | Name | Details | | ||
| |----------|------|---------| | ||
| | VNet | `abe2e-shared-vnet` | `10.0.0.0/8` — supports ~4096 `/20` cluster subnets | | ||
| | Bastion | `abe2e-shared-bastion` | Standard SKU with tunneling enabled for native SSH | | ||
| | Bastion Subnet | `AzureBastionSubnet` | `10.0.0.0/26` (required by Azure Bastion) | | ||
| | Firewall Subnet | `AzureFirewallSubnet` | `10.0.1.0/24` (created by shared infra, firewall on-demand) | | ||
|
|
2579b61 to
738f4ae
Compare
Comment on lines
484
to
490
| subnetAddressPrefix := vnet.addressPrefix | ||
|
|
||
| subnetParameters := armnetwork.Subnet{ | ||
| ID: to.Ptr(subnetId), | ||
| Properties: &armnetwork.SubnetPropertiesFormat{ | ||
| AddressPrefix: to.Ptr("10.224.0.0/16"), | ||
| AddressPrefix: to.Ptr(subnetAddressPrefix), | ||
| NetworkSecurityGroup: &armnetwork.SecurityGroup{ |
Comment on lines
+379
to
+384
| // Check if this subnet already exists (idempotent) | ||
| existing, err := config.Azure.Subnet.Get(ctx, rg, SharedVNetName, subnetName, nil) | ||
| if err == nil { | ||
| return *existing.ID, nil | ||
| } | ||
|
|
Comment on lines
+472
to
+489
| vnetResp, err := config.Azure.VNet.Get(ctx, rg, vnetName, nil) | ||
| if err != nil { | ||
| return VNet{}, fmt.Errorf("getting VNet %s in RG %s: %w", vnetName, rg, err) | ||
| } | ||
|
|
||
| var addressPrefix string | ||
| if vnetResp.Properties.AddressSpace != nil && len(vnetResp.Properties.AddressSpace.AddressPrefixes) > 0 { | ||
| addressPrefix = *vnetResp.Properties.AddressSpace.AddressPrefixes[0] | ||
| } | ||
|
|
||
| return VNet{ | ||
| name: vnetName, | ||
| resourceGroup: rg, | ||
| subnetName: subnetName, | ||
| subnetId: subnetID, | ||
| resourceGUID: *vnetResp.Properties.ResourceGUID, | ||
| addressPrefix: addressPrefix, | ||
| }, nil |
Comment on lines
+139
to
141
| nbc.TenantID = clusterTenantID(cluster.Model) | ||
| nbc.ContainerService.Properties.CertificateProfile.CaCertificate = string(cluster.ClusterParams.CACert) | ||
| nbc.ContainerService.Properties.HostedMasterProfile.FQDN = cluster.ClusterParams.FQDN |
Comment on lines
+185
to
+188
| existing, err := config.Azure.BastionHosts.Get(ctx, rg, SharedBastionName, nil) | ||
| if err == nil { | ||
| return *existing.Properties.DNSName, nil | ||
| } |
Comment on lines
+90
to
+93
| | Resource | Name | Details | | ||
| |----------|------|---------| | ||
| | VNet | `abe2e-shared-vnet` | `10.0.0.0/8` — supports ~4096 `/20` cluster subnets | | ||
| | Bastion | `abe2e-shared-bastion` | Standard SKU with tunneling enabled for native SSH | |
738f4ae to
69c05f8
Compare
69c05f8 to
f928610
Compare
Comment on lines
+379
to
+384
| // Check if this subnet already exists (idempotent) | ||
| existing, err := config.Azure.Subnet.Get(ctx, rg, SharedVNetName, subnetName, nil) | ||
| if err == nil { | ||
| return *existing.ID, nil | ||
| } | ||
|
|
Comment on lines
+321
to
+323
| // ensureClusterIdentity creates a user-assigned managed identity for AKS clusters | ||
| // and grants it Network Contributor on the subscription so it can manage route tables | ||
| // in both the shared VNet and the MC_ resource groups. |
Comment on lines
+531
to
+537
| sharedRG := config.ResourceGroupName(location) | ||
| sharedBastion, err := config.Azure.BastionHosts.Get(ctx, sharedRG, SharedBastionName, nil) | ||
| if err != nil { | ||
| return nil, fmt.Errorf("failed to create bastion: %w", err) | ||
| } | ||
|
|
||
| bastion := NewBastion(config.Azure.Credential, config.Config.SubscriptionID, nodeRG, *resp.BastionHost.Properties.DNSName) | ||
|
|
||
| if err := verifyBastion(ctx, cluster, bastion); err != nil { | ||
| return nil, fmt.Errorf("failed to verify bastion: %w", err) | ||
| return nil, fmt.Errorf("shared bastion %s not found in %s: %w", SharedBastionName, sharedRG, err) | ||
| } | ||
| return bastion, nil | ||
| } | ||
|
|
||
| func verifyBastion(ctx context.Context, cluster *armcontainerservice.ManagedCluster, bastion *Bastion) error { | ||
| nodeRG := *cluster.Properties.NodeResourceGroup | ||
| vmssName, err := getSystemPoolVMSSName(ctx, cluster) | ||
| if err != nil { | ||
| return err | ||
| } | ||
|
|
||
| var vmssVM *armcompute.VirtualMachineScaleSetVM | ||
| pager := config.Azure.VMSSVM.NewListPager(nodeRG, vmssName, nil) | ||
| if pager.More() { | ||
| page, err := pager.NextPage(ctx) | ||
| if err != nil { | ||
| return fmt.Errorf("list vmss vms for %q in rg %q: %w", vmssName, nodeRG, err) | ||
| } | ||
| if len(page.Value) > 0 { | ||
| vmssVM = page.Value[0] | ||
| } | ||
| } | ||
|
|
||
| vmPrivateIP, err := getPrivateIPFromVMSSVM(ctx, nodeRG, vmssName, *vmssVM.InstanceID) | ||
|
|
||
| ctx, cancel := context.WithCancel(ctx) | ||
| defer cancel() | ||
|
|
||
| sshClient, err := DialSSHOverBastion(ctx, bastion, vmPrivateIP, config.SysSSHPrivateKey) | ||
| if err != nil { | ||
| return err | ||
| } | ||
|
|
||
| defer sshClient.Close() | ||
|
|
||
| result, err := runSSHCommandWithPrivateKeyFile(ctx, sshClient, "uname -a", false) | ||
| if err != nil { | ||
| return err | ||
| } | ||
| if strings.Contains(result.stdout, vmssName) { | ||
| return nil | ||
| } | ||
| return fmt.Errorf("Executed ssh on wrong VM, Expected %s: %s", vmssName, result.stdout) | ||
| } | ||
|
|
||
| func getSystemPoolVMSSName(ctx context.Context, cluster *armcontainerservice.ManagedCluster) (string, error) { | ||
| nodeRG := *cluster.Properties.NodeResourceGroup | ||
| var systemPoolName string | ||
| for _, pool := range cluster.Properties.AgentPoolProfiles { | ||
| if strings.EqualFold(string(*pool.Mode), "System") { | ||
| systemPoolName = *pool.Name | ||
| } | ||
| } | ||
| pager := config.Azure.VMSS.NewListPager(nodeRG, nil) | ||
| if pager.More() { | ||
| page, err := pager.NextPage(ctx) | ||
| if err != nil { | ||
| return "", fmt.Errorf("list vmss in rg %q: %w", nodeRG, err) | ||
| } | ||
| for _, vmss := range page.Value { | ||
| if strings.Contains(strings.ToLower(*vmss.Name), strings.ToLower(systemPoolName)) { | ||
| return *vmss.Name, nil | ||
| } | ||
| } | ||
| } | ||
| return "", fmt.Errorf("no matching VMSS found for system pool %q in rg %q", systemPoolName, nodeRG) | ||
| } | ||
|
|
||
| func sanitizeAzureResourceName(name string) string { | ||
| // Azure resource name restrictions vary by type. For our usage here (Public IP name) we just | ||
| // keep it simple and strip problematic characters. | ||
| replacer := strings.NewReplacer("/", "-", "\\", "-", ":", "-", "_", "-", " ", "-") | ||
| name = replacer.Replace(name) | ||
| name = strings.Trim(name, "-") | ||
| if len(name) > 80 { | ||
| name = name[:80] | ||
| } | ||
| return name | ||
| toolkit.Logf(ctx, "using shared bastion %s in %s", SharedBastionName, sharedRG) | ||
| return NewBastion(config.Azure.Credential, config.Config.SubscriptionID, sharedRG, *sharedBastion.Properties.DNSName), nil |
Comment on lines
+150
to
+161
| func clusterTenantID(cluster *armcontainerservice.ManagedCluster) string { | ||
| if cluster.Identity != nil && cluster.Identity.TenantID != nil { | ||
| return *cluster.Identity.TenantID | ||
| } | ||
| // For user-assigned identity, get tenant from the shared cluster identity | ||
| rg := config.ResourceGroupName(*cluster.Location) | ||
| identity, err := config.Azure.UserAssignedIdentities.Get(context.Background(), rg, SharedClusterIdentity, nil) | ||
| if err == nil && identity.Properties != nil && identity.Properties.TenantID != nil { | ||
| return *identity.Properties.TenantID | ||
| } | ||
| return "" | ||
| } |
Comment on lines
174
to
178
| NetworkProfile: &armcontainerservice.NetworkProfile{ | ||
| NetworkPlugin: to.Ptr(armcontainerservice.NetworkPluginKubenet), | ||
| ServiceCidr: to.Ptr("172.16.0.0/16"), | ||
| DNSServiceIP: to.Ptr("172.16.0.10"), | ||
| }, |
Comment on lines
+414
to
+419
| routeTableResp, err := poller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions) | ||
| if err != nil { | ||
| return fmt.Errorf("failed to create firewall route table %q: %w", routeTableName, err) | ||
| } | ||
| routeTableID = routeTableResp.ID | ||
| return nil |
Comment on lines
+92
to
+95
| | VNet | `abe2e-shared-vnet` | `10.0.0.0/8` — supports ~4096 `/20` cluster subnets | | ||
| | Bastion | `abe2e-shared-bastion` | Standard SKU with tunneling enabled for native SSH | | ||
| | Bastion Subnet | `AzureBastionSubnet` | `10.0.0.0/26` (required by Azure Bastion) | | ||
| | Firewall Subnet | `AzureFirewallSubnet` | `10.0.1.0/24` (created by shared infra, firewall on-demand) | |
… bastion multiple times
f928610 to
be31cd4
Compare
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit.This suggestion is invalid because no changes were made to the code.Suggestions cannot be applied while the pull request is closed.Suggestions cannot be applied while viewing a subset of changes.Only one suggestion per line can be applied in a batch.Add this suggestion to a batch that can be applied as a single commit.Applying suggestions on deleted lines is not supported.You must change the existing code in this line in order to create a valid suggestion.Outdated suggestions cannot be applied.This suggestion has been applied or marked resolved.Suggestions cannot be applied from pending reviews.Suggestions cannot be applied on multi-line comments.Suggestions cannot be applied while the pull request is queued to merge.Suggestion cannot be applied right now. Please check back later.