From 92d011a6ff68008201a2924815860af24f239a7c Mon Sep 17 00:00:00 2001 From: Adrian Riobo Date: Wed, 1 Jul 2026 10:39:00 +0000 Subject: [PATCH 1/2] feat(aws): add --vpc-id flag to deploy into existing VPCs When --vpc-id is set, mapt reuses an existing VPC instead of creating a new one. The flag is available on all standard AWS hosts (rhel, windows, fedora, rhel-ai) and exposed as a param in all Tekton tasks. Key behaviours: - Spot AZ search is restricted to AZs that have subnets in the given VPC - On-demand AZ selection is restricted to the VPC's subnet AZs - Airgap is rejected when --vpc-id is set (mutually exclusive) - existingVPCNetwork() resolves a public subnet in the chosen AZ and reads the VPC/subnet as pulumi read-only resources (ec2.GetVpc / ec2.GetSubnet), so no new networking infrastructure is created Also fix spot AZ resolution: GetSpotPlacementScores can return AZ IDs for zones not visible to the account via the default DescribeAvailabilityZones call. describeAvailabilityZonesByRegions now uses AllAvailabilityZones: true so all AZ IDs can be resolved to names during spot search, preventing spurious "skipping AZ: az id not found" drops that led to no spot option being found in accounts with SCP-restricted regions. Closes #849 Co-Authored-By: Claude Sonnet 4.6 --- cmd/mapt/cmd/aws/hosts/fedora.go | 3 +- cmd/mapt/cmd/aws/hosts/rhel.go | 1 + cmd/mapt/cmd/aws/hosts/rhelai.go | 1 + cmd/mapt/cmd/aws/hosts/windows.go | 1 + cmd/mapt/cmd/params/params.go | 13 +++- pkg/provider/aws/action/fedora/fedora.go | 40 +++++++----- pkg/provider/aws/action/rhel-ai/rhelai.go | 4 ++ pkg/provider/aws/action/rhel/rhel.go | 10 ++- pkg/provider/aws/action/windows/windows.go | 26 +++++--- pkg/provider/aws/data/azs.go | 28 +++++--- pkg/provider/aws/data/network.go | 65 ++++++++++++++++++- pkg/provider/aws/data/spot.go | 15 +++++ .../aws/modules/allocation/allocation.go | 36 +++++++--- pkg/provider/aws/modules/network/network.go | 46 +++++++++++-- pkg/provider/aws/modules/spot/stack.go | 4 ++ pkg/target/host/rhelai/api.go | 9 +-- tkn/infra-aws-fedora.yaml | 8 +++ tkn/infra-aws-rhel-ai.yaml | 8 +++ tkn/infra-aws-rhel.yaml | 8 +++ tkn/infra-aws-windows-server.yaml | 6 ++ tkn/template/infra-aws-fedora.yaml | 8 +++ tkn/template/infra-aws-rhel-ai.yaml | 8 +++ tkn/template/infra-aws-rhel.yaml | 8 +++ tkn/template/infra-aws-windows-server.yaml | 6 ++ 24 files changed, 301 insertions(+), 61 deletions(-) diff --git a/cmd/mapt/cmd/aws/hosts/fedora.go b/cmd/mapt/cmd/aws/hosts/fedora.go index 69c55f2c0..91795c60a 100644 --- a/cmd/mapt/cmd/aws/hosts/fedora.go +++ b/cmd/mapt/cmd/aws/hosts/fedora.go @@ -68,7 +68,8 @@ func getFedoraCreate() *cobra.Command { Spot: params.SpotArgs(), Timeout: viper.GetString(params.Timeout), Airgap: viper.IsSet(airgap), - ServiceEndpoints: params.NetworkServiceEndpoints()}) + ServiceEndpoints: params.NetworkServiceEndpoints(), + VpcID: params.NetworkVpcID()}) }, } flagSet := pflag.NewFlagSet(params.CreateCmdName, pflag.ExitOnError) diff --git a/cmd/mapt/cmd/aws/hosts/rhel.go b/cmd/mapt/cmd/aws/hosts/rhel.go index 6308913c7..dc81f21c4 100644 --- a/cmd/mapt/cmd/aws/hosts/rhel.go +++ b/cmd/mapt/cmd/aws/hosts/rhel.go @@ -68,6 +68,7 @@ func getRHELCreate() *cobra.Command { Timeout: viper.GetString(params.Timeout), Airgap: viper.IsSet(airgap), ServiceEndpoints: params.NetworkServiceEndpoints(), + VpcID: params.NetworkVpcID(), }) }, } diff --git a/cmd/mapt/cmd/aws/hosts/rhelai.go b/cmd/mapt/cmd/aws/hosts/rhelai.go index a52921697..8ef94fcda 100644 --- a/cmd/mapt/cmd/aws/hosts/rhelai.go +++ b/cmd/mapt/cmd/aws/hosts/rhelai.go @@ -74,6 +74,7 @@ func getRHELAICreate() *cobra.Command { AutoStart: viper.IsSet(params.RhelAIAutoStart), VLLMExtraArgs: viper.GetString(params.RhelAIVLLMExtraArgs), ExposePorts: viper.GetIntSlice(params.RhelAIExposePorts), + VpcID: params.NetworkVpcID(), }) }, } diff --git a/cmd/mapt/cmd/aws/hosts/windows.go b/cmd/mapt/cmd/aws/hosts/windows.go index 00da29e9e..4207f6a85 100644 --- a/cmd/mapt/cmd/aws/hosts/windows.go +++ b/cmd/mapt/cmd/aws/hosts/windows.go @@ -81,6 +81,7 @@ func getWindowsCreate() *cobra.Command { Airgap: viper.IsSet(airgap), Timeout: viper.GetString(params.Timeout), ServiceEndpoints: params.NetworkServiceEndpoints(), + VpcID: params.NetworkVpcID(), }) }, } diff --git a/cmd/mapt/cmd/params/params.go b/cmd/mapt/cmd/params/params.go index 25f0c24e1..8a0eab52b 100644 --- a/cmd/mapt/cmd/params/params.go +++ b/cmd/mapt/cmd/params/params.go @@ -205,7 +205,9 @@ const ( KindExtraPortMappingsDesc = "Additional port mappings for the Kind cluster. Value should be a JSON array of objects with containerPort, hostPort, and protocol properties. Example: '[{\"containerPort\": 8080, \"hostPort\": 8080, \"protocol\": \"TCP\"}]'" // Network - ServiceEndpoints = "service-endpoints" + ServiceEndpoints = "service-endpoints" + VpcID = "vpc-id" + VpcIDDesc = "ID of an existing VPC to deploy the instance into. When set, airgap is not supported and spot search is restricted to AZs with subnets in that VPC." // Spot spot = "spot" @@ -222,12 +224,21 @@ const ( func AddNetworkFlags(fs *pflag.FlagSet, desc string) { fs.StringSliceP(ServiceEndpoints, "", []string{}, desc) + fs.StringP(VpcID, "", "", VpcIDDesc) } func NetworkServiceEndpoints() []string { return viper.GetStringSlice(ServiceEndpoints) } +func NetworkVpcID() *string { + if viper.IsSet(VpcID) { + v := viper.GetString(VpcID) + return &v + } + return nil +} + func AddSpotFlags(fs *pflag.FlagSet) { fs.Bool(spot, false, spotDesc) fs.StringP(spotTolerance, "", spotToleranceDefault, spotToleranceDesc) diff --git a/pkg/provider/aws/action/fedora/fedora.go b/pkg/provider/aws/action/fedora/fedora.go index be6924f45..be8b6e101 100644 --- a/pkg/provider/aws/action/fedora/fedora.go +++ b/pkg/provider/aws/action/fedora/fedora.go @@ -33,28 +33,30 @@ import ( ) type FedoraArgs struct { - Prefix string - Version string - Arch string - ComputeRequest *cr.ComputeRequestArgs - Spot *spotTypes.SpotArgs - Airgap bool + Prefix string + Version string + Arch string + ComputeRequest *cr.ComputeRequestArgs + Spot *spotTypes.SpotArgs + Airgap bool ServiceEndpoints []string + VpcID *string // If timeout is set a severless scheduled task will be created to self destroy the resources Timeout string } type fedoraRequest struct { - mCtx *mc.Context - prefix *string - version *string - arch *string - spot bool - timeout *string + mCtx *mc.Context + prefix *string + version *string + arch *string + spot bool + timeout *string serviceEndpoints []string - allocationData *allocation.AllocationResult - airgap *bool - diskSize *int + vpcID *string + allocationData *allocation.AllocationResult + airgap *bool + diskSize *int // internal management // For airgap scenario there is an orchestation of // a phase with connectivity on the machine (allowing bootstraping) @@ -81,6 +83,9 @@ func Create(mCtxArgs *mc.ContextArgs, args *FedoraArgs) (err error) { return err } // Compose request + if args.VpcID != nil && args.Airgap { + return fmt.Errorf("--vpc-id and --airgap are mutually exclusive") + } prefix := util.If(len(args.Prefix) > 0, args.Prefix, "main") r := fedoraRequest{ mCtx: mCtx, @@ -89,6 +94,7 @@ func Create(mCtxArgs *mc.ContextArgs, args *FedoraArgs) (err error) { arch: &args.Arch, timeout: &args.Timeout, serviceEndpoints: args.ServiceEndpoints, + vpcID: args.VpcID, airgap: &args.Airgap, diskSize: args.ComputeRequest.DiskSize} if args.Spot != nil { @@ -100,6 +106,7 @@ func Create(mCtxArgs *mc.ContextArgs, args *FedoraArgs) (err error) { ComputeRequest: args.ComputeRequest, AMIProductDescription: &amiProduct, Spot: args.Spot, + VpcID: args.VpcID, }) if err != nil { return err @@ -201,7 +208,8 @@ func (r *fedoraRequest) deploy(ctx *pulumi.Context) error { CreateLoadBalancer: r.spot, Airgap: *r.airgap, AirgapPhaseConnectivity: r.airgapPhaseConnectivity, - ServiceEndpoints: r.serviceEndpoints, + ServiceEndpoints: r.serviceEndpoints, + VpcID: r.vpcID, }) if err != nil { return err diff --git a/pkg/provider/aws/action/rhel-ai/rhelai.go b/pkg/provider/aws/action/rhel-ai/rhelai.go index f458b7e5c..eeac7e03e 100644 --- a/pkg/provider/aws/action/rhel-ai/rhelai.go +++ b/pkg/provider/aws/action/rhel-ai/rhelai.go @@ -40,6 +40,7 @@ type rhelAIRequest struct { spot bool timeout *string serviceEndpoints []string + vpcID *string allocationData *allocation.AllocationResult diskSize *int model *string @@ -81,6 +82,7 @@ func Create(mCtxArgs *mc.ContextArgs, args *apiRHELAI.RHELAIArgs) (err error) { arch: &args.Arch, timeout: &args.Timeout, serviceEndpoints: args.ServiceEndpoints, + vpcID: args.VpcID, diskSize: args.ComputeRequest.DiskSize, model: &args.Model, hfToken: &args.HFToken, @@ -97,6 +99,7 @@ func Create(mCtxArgs *mc.ContextArgs, args *apiRHELAI.RHELAIArgs) (err error) { ComputeRequest: args.ComputeRequest, AMIProductDescription: &amiProduct, Spot: args.Spot, + VpcID: args.VpcID, }) if err != nil { return err @@ -228,6 +231,7 @@ func (r *rhelAIRequest) deploy(ctx *pulumi.Context) error { AZ: *r.allocationData.AZ, CreateLoadBalancer: r.allocationData.SpotPrice != nil, ServiceEndpoints: r.serviceEndpoints, + VpcID: r.vpcID, }) if err != nil { return err diff --git a/pkg/provider/aws/action/rhel/rhel.go b/pkg/provider/aws/action/rhel/rhel.go index 76240a9d8..cb2c9bccb 100644 --- a/pkg/provider/aws/action/rhel/rhel.go +++ b/pkg/provider/aws/action/rhel/rhel.go @@ -42,6 +42,7 @@ type RHELArgs struct { Spot *spotTypes.SpotArgs Airgap bool ServiceEndpoints []string + VpcID *string // If timeout is set a severless scheduled task will be created to self destroy the resources Timeout string } @@ -57,6 +58,7 @@ type rhelRequest struct { profileSNC *bool timeout *string serviceEndpoints []string + vpcID *string allocationData *allocation.AllocationResult airgap *bool diskSize *int @@ -86,6 +88,9 @@ func Create(mCtxArgs *mc.ContextArgs, args *RHELArgs) (err error) { return err } // Compose request + if args.VpcID != nil && args.Airgap { + return fmt.Errorf("--vpc-id and --airgap are mutually exclusive") + } prefix := util.If(len(args.Prefix) > 0, args.Prefix, "main") r := rhelRequest{ mCtx: mCtx, @@ -97,6 +102,7 @@ func Create(mCtxArgs *mc.ContextArgs, args *RHELArgs) (err error) { subsUserpass: &args.SubsUserpass, profileSNC: &args.ProfileSNC, serviceEndpoints: args.ServiceEndpoints, + vpcID: args.VpcID, airgap: &args.Airgap, diskSize: args.ComputeRequest.DiskSize} if args.Spot != nil { @@ -108,6 +114,7 @@ func Create(mCtxArgs *mc.ContextArgs, args *RHELArgs) (err error) { ComputeRequest: args.ComputeRequest, AMIProductDescription: &amiProduct, Spot: args.Spot, + VpcID: args.VpcID, }) if err != nil { return err @@ -206,7 +213,8 @@ func (r *rhelRequest) deploy(ctx *pulumi.Context) error { CreateLoadBalancer: r.allocationData.SpotPrice != nil, Airgap: *r.airgap, AirgapPhaseConnectivity: r.airgapPhaseConnectivity, - ServiceEndpoints: r.serviceEndpoints, + ServiceEndpoints: r.serviceEndpoints, + VpcID: r.vpcID, }) if err != nil { return err diff --git a/pkg/provider/aws/action/windows/windows.go b/pkg/provider/aws/action/windows/windows.go index 385922bbf..621d6f5e0 100644 --- a/pkg/provider/aws/action/windows/windows.go +++ b/pkg/provider/aws/action/windows/windows.go @@ -46,10 +46,11 @@ type WindowsServerArgs struct { AMILang string AMIKeepCopy bool // Machine params - ComputeRequest *cr.ComputeRequestArgs - Spot *spotTypes.SpotArgs - Airgap bool + ComputeRequest *cr.ComputeRequestArgs + Spot *spotTypes.SpotArgs + Airgap bool ServiceEndpoints []string + VpcID *string // If timeout is set a severless scheduled task will be created to self destroy the resources Timeout string } @@ -64,12 +65,13 @@ type windowsServerRequest struct { amiLang *string amiKeepCopy *bool - spot bool - timeout *string + spot bool + timeout *string serviceEndpoints []string - allocationData *allocation.AllocationResult - airgap *bool - diskSize *int + vpcID *string + allocationData *allocation.AllocationResult + airgap *bool + diskSize *int // internal management // For airgap scenario there is an orchestation of // a phase with connectivity on the machine (allowing bootstraping) @@ -104,6 +106,9 @@ func Create(mCtxArgs *mc.ContextArgs, args *WindowsServerArgs) (err error) { args.AMIName = amiNonEngNameDefault } // Compose request + if args.VpcID != nil && args.Airgap { + return fmt.Errorf("--vpc-id and --airgap are mutually exclusive") + } prefix := util.If(len(args.Prefix) > 0, args.Prefix, "main") r := windowsServerRequest{ mCtx: mCtx, @@ -115,6 +120,7 @@ func Create(mCtxArgs *mc.ContextArgs, args *WindowsServerArgs) (err error) { amiLang: &args.AMILang, timeout: &args.Timeout, serviceEndpoints: args.ServiceEndpoints, + vpcID: args.VpcID, airgap: &args.Airgap, } if args.ComputeRequest != nil { @@ -129,6 +135,7 @@ func Create(mCtxArgs *mc.ContextArgs, args *WindowsServerArgs) (err error) { ComputeRequest: args.ComputeRequest, AMIProductDescription: &amiProduct, Spot: args.Spot, + VpcID: args.VpcID, }) if err != nil { return err @@ -259,7 +266,8 @@ func (r *windowsServerRequest) deploy(ctx *pulumi.Context) error { CreateLoadBalancer: r.spot, Airgap: *r.airgap, AirgapPhaseConnectivity: r.airgapPhaseConnectivity, - ServiceEndpoints: r.serviceEndpoints, + ServiceEndpoints: r.serviceEndpoints, + VpcID: r.vpcID, }) if err != nil { return err diff --git a/pkg/provider/aws/data/azs.go b/pkg/provider/aws/data/azs.go index 8e9a95291..f5b7c63d2 100644 --- a/pkg/provider/aws/data/azs.go +++ b/pkg/provider/aws/data/azs.go @@ -42,12 +42,9 @@ type AvailabilityZonesResult struct { Err error } -func describeAvailabilityZonesAsync(ctx context.Context, regionName string, c chan AvailabilityZonesResult) { - data, err := DescribeAvailabilityZones(ctx, regionName) - c <- AvailabilityZonesResult{ - AvailabilityZones: data, - Err: err} - +func describeAvailabilityZonesAllAsync(ctx context.Context, regionName string, c chan AvailabilityZonesResult) { + data, err := describeAvailabilityZonesAll(ctx, regionName) + c <- AvailabilityZonesResult{AvailabilityZones: data, Err: err} } func DescribeAvailabilityZones(ctx context.Context, regionName string) ([]ec2Types.AvailabilityZone, error) { @@ -55,6 +52,18 @@ func DescribeAvailabilityZones(ctx context.Context, regionName string) ([]ec2Typ } func describeAvailabilityZones(ctx context.Context, regionName string, excludedZoneIDs []string) ([]ec2Types.AvailabilityZone, error) { + return describeAvailabilityZonesOpts(ctx, regionName, excludedZoneIDs, false) +} + +// describeAvailabilityZonesAll is like describeAvailabilityZones but includes AZs not +// normally visible to the account (AllAvailabilityZones: true). Used only for AZ ID→name +// resolution during spot placement score lookups, where the scores API can return AZ IDs +// for zones not yet opted-in to by the account. +func describeAvailabilityZonesAll(ctx context.Context, regionName string) ([]ec2Types.AvailabilityZone, error) { + return describeAvailabilityZonesOpts(ctx, regionName, nil, true) +} + +func describeAvailabilityZonesOpts(ctx context.Context, regionName string, excludedZoneIDs []string, allZones bool) ([]ec2Types.AvailabilityZone, error) { var cfgOpts config.LoadOptionsFunc if len(regionName) > 0 { cfgOpts = config.WithRegion(regionName) @@ -64,9 +73,8 @@ func describeAvailabilityZones(ctx context.Context, regionName string, excludedZ return nil, err } client := ec2.NewFromConfig(cfg) - // TODO check what happen when true and region name input := ec2.DescribeAvailabilityZonesInput{ - // AllAvailabilityZones: aws.Bool(true), + AllAvailabilityZones: aws.Bool(allZones), } input.Filters = []ec2Types.Filter{ { @@ -112,12 +120,14 @@ func getZoneName(azID string, azDescriptions []ec2Types.AvailabilityZone) (strin // user 1 Name: us-west-1a ID: us-west-11, Name: us-west-1b ID: us-west-12 // user 2 Name: us-west-1a ID: us-west-12, Name: us-west-1b ID: us-west-11 // This allowsa a better distribution among users +// describeAvailabilityZonesByRegions fetches all AZs (including non-opted-in ones) so +// that AZ IDs returned by GetSpotPlacementScores can always be resolved to names. func describeAvailabilityZonesByRegions(ctx context.Context, regions []string) map[string][]ec2Types.AvailabilityZone { result := make(map[string][]ec2Types.AvailabilityZone) c := make(chan AvailabilityZonesResult) for _, region := range regions { lRegion := region - go describeAvailabilityZonesAsync(ctx, lRegion, c) + go describeAvailabilityZonesAllAsync(ctx, lRegion, c) } for i := 0; i < len(regions); i++ { availabilityZonesResult := <-c diff --git a/pkg/provider/aws/data/network.go b/pkg/provider/aws/data/network.go index 277f786b2..4ec139100 100644 --- a/pkg/provider/aws/data/network.go +++ b/pkg/provider/aws/data/network.go @@ -13,6 +13,7 @@ import ( const ( filterVPCID = "vpc-id" + filterAvailabilityZone = "availability-zone" filterAssociationSubnetID = "association.subnet-id" ) @@ -90,6 +91,62 @@ func getPublicSubnets(ctx context.Context, client *ec2.Client, vpcID string) (su return } +// GetSubnetAZsForVPC returns the unique AZ names of all subnets that belong to the specified VPC. +func GetSubnetAZsForVPC(ctx context.Context, region, vpcID string) ([]string, error) { + cfg, err := getConfig(ctx, region) + if err != nil { + return nil, err + } + client := ec2.NewFromConfig(cfg) + subnetsOutput, err := client.DescribeSubnets(ctx, &ec2.DescribeSubnetsInput{ + Filters: []ec2types.Filter{ + {Name: aws.String(filterVPCID), Values: []string{vpcID}}, + }, + }) + if err != nil { + return nil, fmt.Errorf("failed to describe subnets for VPC %s: %w", vpcID, err) + } + seen := map[string]struct{}{} + var azs []string + for _, s := range subnetsOutput.Subnets { + if s.AvailabilityZone != nil { + az := *s.AvailabilityZone + if _, ok := seen[az]; !ok { + seen[az] = struct{}{} + azs = append(azs, az) + } + } + } + if len(azs) == 0 { + return nil, fmt.Errorf("no subnets found in VPC %s", vpcID) + } + return azs, nil +} + +// GetPublicSubnetIDInAZ returns a public subnet ID in the given AZ within the specified VPC. +func GetPublicSubnetIDInAZ(ctx context.Context, region, vpcID, az string) (*string, error) { + cfg, err := getConfig(ctx, region) + if err != nil { + return nil, err + } + client := ec2.NewFromConfig(cfg) + subnetsOutput, err := client.DescribeSubnets(ctx, &ec2.DescribeSubnetsInput{ + Filters: []ec2types.Filter{ + {Name: aws.String(filterVPCID), Values: []string{vpcID}}, + {Name: aws.String(filterAvailabilityZone), Values: []string{az}}, + }, + }) + if err != nil { + return nil, fmt.Errorf("failed to describe subnets in VPC %s AZ %s: %w", vpcID, az, err) + } + for _, s := range subnetsOutput.Subnets { + if err := isPublic(ctx, client, *s.SubnetId); err == nil { + return s.SubnetId, nil + } + } + return nil, fmt.Errorf("no public subnet found in VPC %s AZ %s", vpcID, az) +} + func isPublic(ctx context.Context, client *ec2.Client, subnetID string) error { routeTablesOutput, err := client.DescribeRouteTables( ctx, @@ -109,9 +166,11 @@ func isPublic(ctx context.Context, client *ec2.Client, subnetID string) error { } for _, routeTable := range routeTablesOutput.RouteTables { for _, route := range routeTable.Routes { - gwID := *route.GatewayId - if route.GatewayId != nil && len(gwID) > 0 && gwID[:2] == "igw" { - return nil + if route.GatewayId != nil { + gwID := *route.GatewayId + if len(gwID) > 0 && gwID[:2] == "igw" { + return nil + } } } } diff --git a/pkg/provider/aws/data/spot.go b/pkg/provider/aws/data/spot.go index dcbf575cf..c502a8023 100644 --- a/pkg/provider/aws/data/spot.go +++ b/pkg/provider/aws/data/spot.go @@ -115,6 +115,9 @@ type SpotInfoArgs struct { ExcludedRegions []string SpotTolerance *spot.Tolerance SpotPriceIncreaseRate *int + // AllowedAZs restricts spot selection to specific availability zones. + // Used when deploying into a fixed VPC to only consider AZs with subnets in that VPC. + AllowedAZs []string } type SpotInfoResult struct { @@ -194,6 +197,18 @@ func SpotInfo(mCtx *mc.Context, args *SpotInfoArgs) (*spot.SpotResults, error) { if err != nil { return nil, err } + if len(args.AllowedAZs) > 0 { + for region, scores := range placementScores { + filtered := util.ArrayFilter(scores, func(s placementScoreResult) bool { + return slices.Contains(args.AllowedAZs, s.azName) + }) + if len(filtered) == 0 { + delete(placementScores, region) + } else { + placementScores[region] = filtered + } + } + } regionsWithPlacementScore := utilMaps.Keys(placementScores) spotPricing, err := hostingPlaces.RunOnHostingPlaces(regionsWithPlacementScore, spotPricingArgs{ diff --git a/pkg/provider/aws/modules/allocation/allocation.go b/pkg/provider/aws/modules/allocation/allocation.go index 905c61796..4980d9f11 100644 --- a/pkg/provider/aws/modules/allocation/allocation.go +++ b/pkg/provider/aws/modules/allocation/allocation.go @@ -8,6 +8,8 @@ import ( spotTypes "github.com/redhat-developer/mapt/pkg/provider/api/spot" "github.com/redhat-developer/mapt/pkg/provider/aws/data" "github.com/redhat-developer/mapt/pkg/provider/aws/modules/spot" + "github.com/redhat-developer/mapt/pkg/util" + "golang.org/x/exp/slices" ) var ErrNoSupportedInstaceTypes = fmt.Errorf("the current region does not support any of the requested instance types") @@ -17,7 +19,8 @@ type AllocationArgs struct { Prefix, AMIProductDescription, AMIName *string - Spot *spotTypes.SpotArgs + Spot *spotTypes.SpotArgs + VpcID *string } type AllocationResult struct { @@ -37,11 +40,20 @@ func Allocation(mCtx *mc.Context, args *AllocationArgs) (*AllocationResult, erro return nil, err } } + var allowedAZs []string + if args.VpcID != nil { + region := mCtx.TargetHostingPlace() + allowedAZs, err = data.GetSubnetAZsForVPC(mCtx.Context(), region, *args.VpcID) + if err != nil { + return nil, err + } + } if args.Spot != nil && args.Spot.Spot { sr := &spot.SpotStackArgs{ Prefix: *args.Prefix, InstaceTypes: instancesTypes, Spot: args.Spot, + AllowedAZs: allowedAZs, } if args.AMIName != nil { sr.AMIName = *args.AMIName @@ -51,7 +63,7 @@ func Allocation(mCtx *mc.Context, args *AllocationArgs) (*AllocationResult, erro } return allocationSpot(mCtx, sr) } - return allocationOnDemand(mCtx, instancesTypes) + return allocationOnDemand(mCtx, instancesTypes, allowedAZs) } func allocationSpot(mCtx *mc.Context, @@ -68,18 +80,25 @@ func allocationSpot(mCtx *mc.Context, }, nil } -func allocationOnDemand(mCtx *mc.Context, instancesTypes []string) (*AllocationResult, error) { +func allocationOnDemand(mCtx *mc.Context, instancesTypes []string, allowedAZs []string) (*AllocationResult, error) { region := mCtx.TargetHostingPlace() + candidateAZs := allowedAZs + if len(candidateAZs) == 0 { + candidateAZs = data.GetAvailabilityZones(mCtx.Context(), region, nil) + } excludedAZs := []string{} var err error var az *string var supportedInstancesType []string - azs := data.GetAvailabilityZones(mCtx.Context(), region, nil) for { - az, err = data.GetRandomAvailabilityZone(mCtx.Context(), region, excludedAZs) - if err != nil { - return nil, err + remaining := util.ArrayFilter(candidateAZs, func(a string) bool { + return !slices.Contains(excludedAZs, a) + }) + if len(remaining) == 0 { + return nil, ErrNoSupportedInstaceTypes } + azName := remaining[util.Random(len(remaining)-1, 0)] + az = &azName supportedInstancesType, err = data.FilterInstaceTypesOfferedByLocation(mCtx.Context(), instancesTypes, &data.LocationArgs{ Region: ®ion, @@ -92,9 +111,6 @@ func allocationOnDemand(mCtx *mc.Context, instancesTypes []string) (*AllocationR break } excludedAZs = append(excludedAZs, *az) - if len(excludedAZs) == len(azs) { - return nil, ErrNoSupportedInstaceTypes - } } return &AllocationResult{ Region: ®ion, diff --git a/pkg/provider/aws/modules/network/network.go b/pkg/provider/aws/modules/network/network.go index 82f7bce90..455c0db70 100644 --- a/pkg/provider/aws/modules/network/network.go +++ b/pkg/provider/aws/modules/network/network.go @@ -8,6 +8,7 @@ import ( bastion "github.com/redhat-developer/mapt/pkg/provider/aws/modules/bastion" na "github.com/redhat-developer/mapt/pkg/provider/aws/modules/network/airgap" ns "github.com/redhat-developer/mapt/pkg/provider/aws/modules/network/standard" + "github.com/redhat-developer/mapt/pkg/provider/aws/data" utilNetwork "github.com/redhat-developer/mapt/pkg/util/network" resourcesUtil "github.com/redhat-developer/mapt/pkg/util/resources" ) @@ -36,7 +37,10 @@ type NetworkArgs struct { CreateLoadBalancer bool Airgap bool AirgapPhaseConnectivity Connectivity - ServiceEndpoints []string + ServiceEndpoints []string + // VpcID deploys into an existing VPC instead of creating one. + // Airgap is not supported when VpcID is set. + VpcID *string } type NetworkResult struct { @@ -51,16 +55,16 @@ type NetworkResult struct { func Create(ctx *pulumi.Context, mCtx *mc.Context, args *NetworkArgs) (*NetworkResult, error) { var err error - result := &NetworkResult{} - if !args.Airgap { - result, err = standardNetwork(ctx, mCtx, args) + var result *NetworkResult + switch { + case args.VpcID != nil: + result, err = existingVPCNetwork(ctx, mCtx, args) if err != nil { return nil, err } - } else { + case args.Airgap: var publicSubnet *ec2.Subnet - result, publicSubnet, err = - airgapNetworking(ctx, mCtx, args) + result, publicSubnet, err = airgapNetworking(ctx, mCtx, args) if err != nil { return nil, err } @@ -73,6 +77,11 @@ func Create(ctx *pulumi.Context, mCtx *mc.Context, args *NetworkArgs) (*NetworkR if err != nil { return nil, err } + default: + result, err = standardNetwork(ctx, mCtx, args) + if err != nil { + return nil, err + } } result.Eip, err = ec2.NewEip(ctx, resourcesUtil.GetResourceName(args.Prefix, args.ID, "lbeip"), @@ -100,6 +109,29 @@ func Create(ctx *pulumi.Context, mCtx *mc.Context, args *NetworkArgs) (*NetworkR return result, nil } +func existingVPCNetwork(ctx *pulumi.Context, mCtx *mc.Context, args *NetworkArgs) (*NetworkResult, error) { + subnetID, err := data.GetPublicSubnetIDInAZ(ctx.Context(), args.Region, *args.VpcID, args.AZ) + if err != nil { + return nil, err + } + vpc, err := ec2.GetVpc(ctx, + resourcesUtil.GetResourceName(args.Prefix, args.ID, "vpc"), + pulumi.ID(*args.VpcID), nil) + if err != nil { + return nil, err + } + subnet, err := ec2.GetSubnet(ctx, + resourcesUtil.GetResourceName(args.Prefix, args.ID, "subnet"), + pulumi.ID(*subnetID), nil) + if err != nil { + return nil, err + } + return &NetworkResult{ + Vpc: vpc, + Subnet: subnet, + }, nil +} + func standardNetwork(ctx *pulumi.Context, mCtx *mc.Context, args *NetworkArgs) (*NetworkResult, error) { net, err := ns.NetworkRequest{ MCtx: mCtx, diff --git a/pkg/provider/aws/modules/spot/stack.go b/pkg/provider/aws/modules/spot/stack.go index 92a000ff4..fdb1da310 100644 --- a/pkg/provider/aws/modules/spot/stack.go +++ b/pkg/provider/aws/modules/spot/stack.go @@ -22,6 +22,7 @@ type SpotStackArgs struct { AMIName string AMIArch string Spot *spotTypes.SpotArgs + AllowedAZs []string } type SpotStackResult struct { @@ -40,6 +41,7 @@ type spotStackRequest struct { amiName string amiArch string spot *spotTypes.SpotArgs + allowedAZs []string } func (r *SpotStackArgs) validate() error { @@ -55,6 +57,7 @@ func (r *SpotStackArgs) toRequest(mCtx *mc.Context) *spotStackRequest { amiName: r.AMIName, amiArch: r.AMIArch, spot: r.Spot, + allowedAZs: r.AllowedAZs, } } @@ -131,6 +134,7 @@ func (r *spotStackRequest) deployer(ctx *pulumi.Context) error { InstaceTypes: r.instaceTypes, AMIName: &r.amiName, AMIArch: &r.amiArch, + AllowedAZs: r.allowedAZs, } if r.spot != nil { sia.ExcludedRegions = r.spot.ExcludedHostingPlaces diff --git a/pkg/target/host/rhelai/api.go b/pkg/target/host/rhelai/api.go index d979be619..dcd8105f2 100644 --- a/pkg/target/host/rhelai/api.go +++ b/pkg/target/host/rhelai/api.go @@ -16,11 +16,12 @@ type RHELAIArgs struct { Spot *spotTypes.SpotArgs ServiceEndpoints []string // If timeout is set a severless scheduled task will be created to self destroy the resources - Timeout string - Model string - HFToken string - APIKey string + Timeout string + Model string + HFToken string + APIKey string AutoStart bool VLLMExtraArgs string ExposePorts []int + VpcID *string } diff --git a/tkn/infra-aws-fedora.yaml b/tkn/infra-aws-fedora.yaml index 431347b6b..97af1e9b3 100644 --- a/tkn/infra-aws-fedora.yaml +++ b/tkn/infra-aws-fedora.yaml @@ -138,6 +138,11 @@ spec: default: "false" # Network params + - name: vpc-id + description: | + ID of an existing VPC to deploy into. When set, mapt reuses the VPC + instead of creating a new one. Mutually exclusive with airgap. + default: "" - name: service-endpoints description: | Comma-separated list of VPC service endpoints to create. @@ -277,6 +282,9 @@ spec: if [[ "$(params.airgap)" == "true" ]]; then cmd+="--airgap " fi + if [[ "$(params.vpc-id)" != "" ]]; then + cmd+="--vpc-id '$(params.vpc-id)' " + fi if [[ "$(params.service-endpoints)" != "" ]]; then cmd+="--service-endpoints '$(params.service-endpoints)' " fi diff --git a/tkn/infra-aws-rhel-ai.yaml b/tkn/infra-aws-rhel-ai.yaml index 857034575..e4ebe8db7 100644 --- a/tkn/infra-aws-rhel-ai.yaml +++ b/tkn/infra-aws-rhel-ai.yaml @@ -160,6 +160,11 @@ spec: default: "" # Network params + - name: vpc-id + description: | + ID of an existing VPC to deploy into. When set, mapt reuses the VPC + instead of creating a new one. + default: "" - name: service-endpoints description: | Comma-separated list of VPC service endpoints to create. @@ -302,6 +307,9 @@ spec: cmd+="--spot-eviction-tolerance '$(params.spot-eviction-tolerance)' " cmd+="--spot-excluded-regions '$(params.spot-excluded-regions)' " fi + if [[ "$(params.vpc-id)" != "" ]]; then + cmd+="--vpc-id '$(params.vpc-id)' " + fi if [[ "$(params.service-endpoints)" != "" ]]; then cmd+="--service-endpoints '$(params.service-endpoints)' " fi diff --git a/tkn/infra-aws-rhel.yaml b/tkn/infra-aws-rhel.yaml index c56e2e7e8..82d3b2b0e 100644 --- a/tkn/infra-aws-rhel.yaml +++ b/tkn/infra-aws-rhel.yaml @@ -157,6 +157,11 @@ spec: default: "false" # Network params + - name: vpc-id + description: | + ID of an existing VPC to deploy into. When set, mapt reuses the VPC + instead of creating a new one. Mutually exclusive with airgap. + default: "" - name: service-endpoints description: | Comma-separated list of VPC service endpoints to create. @@ -302,6 +307,9 @@ spec: if [[ "$(params.profile-snc)" == "true" ]]; then cmd+="--snc " fi + if [[ "$(params.vpc-id)" != "" ]]; then + cmd+="--vpc-id '$(params.vpc-id)' " + fi if [[ "$(params.service-endpoints)" != "" ]]; then cmd+="--service-endpoints '$(params.service-endpoints)' " fi diff --git a/tkn/infra-aws-windows-server.yaml b/tkn/infra-aws-windows-server.yaml index f6e989e3c..5c1df6978 100644 --- a/tkn/infra-aws-windows-server.yaml +++ b/tkn/infra-aws-windows-server.yaml @@ -117,6 +117,11 @@ spec: To access the target machine we need to go through the bastion default: 'false' # Network params + - name: vpc-id + description: | + ID of an existing VPC to deploy into. When set, mapt reuses the VPC + instead of creating a new one. Mutually exclusive with airgap. + default: "''" - name: service-endpoints description: | Comma-separated list of VPC service endpoints to create. @@ -238,6 +243,7 @@ spec: cmd+="--spot --spot-increase-rate $(params.spot-increase-rate) --spot-eviction-tolerance $(params.spot-eviction-tolerance) " fi if [[ $(params.airgap) == "true" ]]; then cmd+="--airgap "; fi + if [[ $(params.vpc-id) != "" ]]; then cmd+="--vpc-id $(params.vpc-id) "; fi if [[ $(params.service-endpoints) != "" ]]; then cmd+="--service-endpoints $(params.service-endpoints) "; fi cmd+="--tags $(params.tags) " fi diff --git a/tkn/template/infra-aws-fedora.yaml b/tkn/template/infra-aws-fedora.yaml index 08627de38..b3f2f76bf 100644 --- a/tkn/template/infra-aws-fedora.yaml +++ b/tkn/template/infra-aws-fedora.yaml @@ -138,6 +138,11 @@ spec: default: "false" # Network params + - name: vpc-id + description: | + ID of an existing VPC to deploy into. When set, mapt reuses the VPC + instead of creating a new one. Mutually exclusive with airgap. + default: "" - name: service-endpoints description: | Comma-separated list of VPC service endpoints to create. @@ -277,6 +282,9 @@ spec: if [[ "$(params.airgap)" == "true" ]]; then cmd+="--airgap " fi + if [[ "$(params.vpc-id)" != "" ]]; then + cmd+="--vpc-id '$(params.vpc-id)' " + fi if [[ "$(params.service-endpoints)" != "" ]]; then cmd+="--service-endpoints '$(params.service-endpoints)' " fi diff --git a/tkn/template/infra-aws-rhel-ai.yaml b/tkn/template/infra-aws-rhel-ai.yaml index 7a9fe9581..57ff485b0 100644 --- a/tkn/template/infra-aws-rhel-ai.yaml +++ b/tkn/template/infra-aws-rhel-ai.yaml @@ -160,6 +160,11 @@ spec: default: "" # Network params + - name: vpc-id + description: | + ID of an existing VPC to deploy into. When set, mapt reuses the VPC + instead of creating a new one. + default: "" - name: service-endpoints description: | Comma-separated list of VPC service endpoints to create. @@ -302,6 +307,9 @@ spec: cmd+="--spot-eviction-tolerance '$(params.spot-eviction-tolerance)' " cmd+="--spot-excluded-regions '$(params.spot-excluded-regions)' " fi + if [[ "$(params.vpc-id)" != "" ]]; then + cmd+="--vpc-id '$(params.vpc-id)' " + fi if [[ "$(params.service-endpoints)" != "" ]]; then cmd+="--service-endpoints '$(params.service-endpoints)' " fi diff --git a/tkn/template/infra-aws-rhel.yaml b/tkn/template/infra-aws-rhel.yaml index cc33482a9..cd6c1a505 100644 --- a/tkn/template/infra-aws-rhel.yaml +++ b/tkn/template/infra-aws-rhel.yaml @@ -157,6 +157,11 @@ spec: default: "false" # Network params + - name: vpc-id + description: | + ID of an existing VPC to deploy into. When set, mapt reuses the VPC + instead of creating a new one. Mutually exclusive with airgap. + default: "" - name: service-endpoints description: | Comma-separated list of VPC service endpoints to create. @@ -302,6 +307,9 @@ spec: if [[ "$(params.profile-snc)" == "true" ]]; then cmd+="--snc " fi + if [[ "$(params.vpc-id)" != "" ]]; then + cmd+="--vpc-id '$(params.vpc-id)' " + fi if [[ "$(params.service-endpoints)" != "" ]]; then cmd+="--service-endpoints '$(params.service-endpoints)' " fi diff --git a/tkn/template/infra-aws-windows-server.yaml b/tkn/template/infra-aws-windows-server.yaml index 05ff00600..3c652f4d5 100644 --- a/tkn/template/infra-aws-windows-server.yaml +++ b/tkn/template/infra-aws-windows-server.yaml @@ -117,6 +117,11 @@ spec: To access the target machine we need to go through the bastion default: 'false' # Network params + - name: vpc-id + description: | + ID of an existing VPC to deploy into. When set, mapt reuses the VPC + instead of creating a new one. Mutually exclusive with airgap. + default: "''" - name: service-endpoints description: | Comma-separated list of VPC service endpoints to create. @@ -238,6 +243,7 @@ spec: cmd+="--spot --spot-increase-rate $(params.spot-increase-rate) --spot-eviction-tolerance $(params.spot-eviction-tolerance) " fi if [[ $(params.airgap) == "true" ]]; then cmd+="--airgap "; fi + if [[ $(params.vpc-id) != "" ]]; then cmd+="--vpc-id $(params.vpc-id) "; fi if [[ $(params.service-endpoints) != "" ]]; then cmd+="--service-endpoints $(params.service-endpoints) "; fi cmd+="--tags $(params.tags) " fi From 8f2067b23e41a384a9b28568120db05009ff71e0 Mon Sep 17 00:00:00 2001 From: Adrian Riobo Date: Wed, 1 Jul 2026 20:06:29 +0000 Subject: [PATCH 2/2] fix(aws): support private subnets when deploying into existing VPCs When --vpc-id is set and the chosen AZ contains only private subnets, mapt now falls back to using any available subnet instead of failing. Machines in private subnets connect outbound only (e.g. GitLab runner registration), so inbound SSH rules, EIP allocation, load balancer creation and SSH readiness checks are all skipped automatically. Key changes: - existingVPCNetwork tries GetPublicSubnetIDInAZ first; on failure falls back to GetAnySubnetIDInAZ and marks IsPublic=false - NetworkResult.IsPublic propagates to all host deploy functions; EIP and LB are only created when IsPublic is true - ComputeRequest respects nil Eip: no AssociatePublicIpAddress, no EIP association, no LB target groups; GetHostDnsName falls back to the instance private IP when neither EIP nor LB is present - Security groups omit all inbound rules for private subnet deploys - GetSubnetAZsForVPC returns all AZs (public and private) since private-subnet AZs are now valid deployment targets - GetAnySubnetIDInAZ added to data/network.go - Spot AZ resolution hardened: describeAvailabilityZonesAllAsync falls back to the standard describe when AllAvailabilityZones:true is SCP-blocked; getPlacementScores restricts the placement score request to regions where AZ ID resolution actually succeeded Co-Authored-By: Claude Sonnet 4.6 --- pkg/provider/aws/action/fedora/fedora.go | 55 +++++++++------- pkg/provider/aws/action/rhel-ai/rhelai.go | 38 ++++++----- pkg/provider/aws/action/rhel/rhel.go | 30 +++++---- pkg/provider/aws/action/windows/windows.go | 34 +++++----- pkg/provider/aws/data/azs.go | 15 ++++- pkg/provider/aws/data/network.go | 27 +++++++- pkg/provider/aws/data/spot.go | 16 ++++- .../aws/modules/ec2/compute/compute.go | 65 ++++++++++++------- pkg/provider/aws/modules/network/network.go | 43 ++++++++---- 9 files changed, 213 insertions(+), 110 deletions(-) diff --git a/pkg/provider/aws/action/fedora/fedora.go b/pkg/provider/aws/action/fedora/fedora.go index be8b6e101..e44c28966 100644 --- a/pkg/provider/aws/action/fedora/fedora.go +++ b/pkg/provider/aws/action/fedora/fedora.go @@ -225,7 +225,7 @@ func (r *fedoraRequest) deploy(ctx *pulumi.Context) error { ctx.Export(fmt.Sprintf("%s-%s", *r.prefix, outputUserPrivateKey), keyResources.PrivateKey.PrivateKeyPem) // Security groups - securityGroups, err := securityGroups(ctx, r.mCtx, r.prefix, nw.Vpc) + securityGroups, err := securityGroups(ctx, r.mCtx, r.prefix, nw.Vpc, nw.IsPublic) if err != nil { return err } @@ -257,7 +257,7 @@ func (r *fedoraRequest) deploy(ctx *pulumi.Context) error { Eip: nw.Eip, LBTargetGroups: []int{22}, } - if r.spot { + if r.spot && nw.IsPublic { cr.Spot = true cr.SpotPrice = *r.allocationData.SpotPrice } @@ -281,6 +281,11 @@ func (r *fedoraRequest) deploy(ctx *pulumi.Context) error { return err } } + // Skip SSH readiness check for private subnets: the machine has no inbound + // connectivity and is expected to register itself outbound (e.g. GitLab runner). + if !nw.IsPublic { + return nil + } return c.Readiness(ctx, command.CommandPing, *r.prefix, awsFedoraDedicatedID, keyResources.PrivateKey, amiUserDefault, nw.Bastion, c.Dependencies) } @@ -301,31 +306,32 @@ func manageResults(mCtx *mc.Context, stackResult auto.UpResult, prefix *string, return output.Write(stackResult, mCtx.GetResultsOutputPath(), results) } -// security group for mac machine with ingress rules for ssh and vnc +// securityGroups builds the security group for the Fedora machine. +// When public is true the SG allows SSH (and optional Cirrus) inbound from anywhere. +// When false (private subnet, outbound-only workload) no inbound rules are added; +// the default egress rule permits all outbound traffic so the runner can reach GitLab. func securityGroups(ctx *pulumi.Context, mCtx *mc.Context, prefix *string, - vpc *ec2.Vpc) (pulumi.StringArray, error) { - // ingress for ssh access from 0.0.0.0 + vpc *ec2.Vpc, public bool) (pulumi.StringArray, error) { var ingressRules []securityGroup.IngressRules - sshIngressRule := securityGroup.SSH_TCP - sshIngressRule.CidrBlocks = infra.NETWORKING_CIDR_ANY_IPV4 - ingressRules = []securityGroup.IngressRules{sshIngressRule} - // Integration ports - cirrusPort, err := cirrus.CirrusPort() - if err != nil { - return nil, err - } - if cirrusPort != nil { - ingressRules = append(ingressRules, - securityGroup.IngressRules{ - Description: fmt.Sprintf("Cirrus port for %s", awsFedoraDedicatedID), - FromPort: *cirrusPort, - ToPort: *cirrusPort, - Protocol: "tcp", - CidrBlocks: infra.NETWORKING_CIDR_ANY_IPV4, - }) + if public { + sshIngressRule := securityGroup.SSH_TCP + sshIngressRule.CidrBlocks = infra.NETWORKING_CIDR_ANY_IPV4 + ingressRules = []securityGroup.IngressRules{sshIngressRule} + cirrusPort, err := cirrus.CirrusPort() + if err != nil { + return nil, err + } + if cirrusPort != nil { + ingressRules = append(ingressRules, + securityGroup.IngressRules{ + Description: fmt.Sprintf("Cirrus port for %s", awsFedoraDedicatedID), + FromPort: *cirrusPort, + ToPort: *cirrusPort, + Protocol: "tcp", + CidrBlocks: infra.NETWORKING_CIDR_ANY_IPV4, + }) + } } - - // Create SG with ingress rules sg, err := securityGroup.SGRequest{ Name: resourcesUtil.GetResourceName(*prefix, awsFedoraDedicatedID, "sg"), VPC: vpc, @@ -335,7 +341,6 @@ func securityGroups(ctx *pulumi.Context, mCtx *mc.Context, prefix *string, if err != nil { return nil, err } - // Convert to an array of IDs sgs := util.ArrayConvert([]*ec2.SecurityGroup{sg.SG}, func(sg *ec2.SecurityGroup) pulumi.StringInput { return sg.ID() diff --git a/pkg/provider/aws/action/rhel-ai/rhelai.go b/pkg/provider/aws/action/rhel-ai/rhelai.go index eeac7e03e..1de003379 100644 --- a/pkg/provider/aws/action/rhel-ai/rhelai.go +++ b/pkg/provider/aws/action/rhel-ai/rhelai.go @@ -247,7 +247,7 @@ func (r *rhelAIRequest) deploy(ctx *pulumi.Context) error { ctx.Export(fmt.Sprintf("%s-%s", *r.prefix, outputUserPrivateKey), keyResources.PrivateKey.PrivateKeyPem) // Security groups - securityGroups, err := r.securityGroups(ctx, r.mCtx, nw.Vpc) + securityGroups, err := r.securityGroups(ctx, r.mCtx, nw.Vpc, nw.IsPublic) if err != nil { return err } @@ -269,7 +269,7 @@ func (r *rhelAIRequest) deploy(ctx *pulumi.Context) error { LB: nw.LoadBalancer, Eip: nw.Eip, LBTargetGroups: r.lbTargetGroups()} - if r.allocationData.SpotPrice != nil { + if r.allocationData.SpotPrice != nil && nw.IsPublic { cr.Spot = true cr.SpotPrice = *r.allocationData.SpotPrice } @@ -293,6 +293,12 @@ func (r *rhelAIRequest) deploy(ctx *pulumi.Context) error { return err } } + if !nw.IsPublic { + if r.autoStart { + return fmt.Errorf("--auto-start requires SSH access to configure RHAIIS; private subnet deployments do not support --auto-start") + } + return nil + } if !r.autoStart { return c.Readiness(ctx, command.CommandPing, *r.prefix, awsRHELDedicatedID, keyResources.PrivateKey, amiUserDefault, nil, c.Dependencies) @@ -325,22 +331,22 @@ func (r *rhelAIRequest) manageResults(stackResult auto.UpResult) error { return output.Write(stackResult, r.mCtx.GetResultsOutputPath(), results) } -// security group for mac machine with ingress rules for ssh and vnc func (r *rhelAIRequest) securityGroups(ctx *pulumi.Context, mCtx *mc.Context, - vpc *ec2.Vpc) (pulumi.StringArray, error) { - // ingress for ssh access from 0.0.0.0 - sshIngressRule := securityGroup.SSH_TCP - sshIngressRule.CidrBlocks = infra.NETWORKING_CIDR_ANY_IPV4 - ingressRules := []securityGroup.IngressRules{sshIngressRule} - for _, port := range r.exposePorts { - rule := securityGroup.IngressRules{ - Description: fmt.Sprintf("port-%d", port), - FromPort: port, - ToPort: port, - Protocol: "tcp", - CidrBlocks: infra.NETWORKING_CIDR_ANY_IPV4, + vpc *ec2.Vpc, public bool) (pulumi.StringArray, error) { + var ingressRules []securityGroup.IngressRules + if public { + sshIngressRule := securityGroup.SSH_TCP + sshIngressRule.CidrBlocks = infra.NETWORKING_CIDR_ANY_IPV4 + ingressRules = []securityGroup.IngressRules{sshIngressRule} + for _, port := range r.exposePorts { + ingressRules = append(ingressRules, securityGroup.IngressRules{ + Description: fmt.Sprintf("port-%d", port), + FromPort: port, + ToPort: port, + Protocol: "tcp", + CidrBlocks: infra.NETWORKING_CIDR_ANY_IPV4, + }) } - ingressRules = append(ingressRules, rule) } // Create SG with ingress rules sg, err := securityGroup.SGRequest{ diff --git a/pkg/provider/aws/action/rhel/rhel.go b/pkg/provider/aws/action/rhel/rhel.go index cb2c9bccb..197163d7e 100644 --- a/pkg/provider/aws/action/rhel/rhel.go +++ b/pkg/provider/aws/action/rhel/rhel.go @@ -230,7 +230,7 @@ func (r *rhelRequest) deploy(ctx *pulumi.Context) error { ctx.Export(fmt.Sprintf("%s-%s", *r.prefix, outputUserPrivateKey), keyResources.PrivateKey.PrivateKeyPem) // Security groups - securityGroups, err := securityGroups(ctx, r.mCtx, r.prefix, nw.Vpc) + securityGroups, err := securityGroups(ctx, r.mCtx, r.prefix, nw.Vpc, nw.IsPublic) if err != nil { return err } @@ -267,7 +267,7 @@ func (r *rhelRequest) deploy(ctx *pulumi.Context) error { LB: nw.LoadBalancer, Eip: nw.Eip, LBTargetGroups: []int{22}} - if r.allocationData.SpotPrice != nil { + if r.allocationData.SpotPrice != nil && nw.IsPublic { cr.Spot = true cr.SpotPrice = *r.allocationData.SpotPrice } @@ -291,6 +291,9 @@ func (r *rhelRequest) deploy(ctx *pulumi.Context) error { return err } } + if !nw.IsPublic { + return nil + } return c.Readiness(ctx, command.CommandCloudInitWait, *r.prefix, awsRHELDedicatedID, keyResources.PrivateKey, amiUserDefault, nw.Bastion, c.Dependencies) } @@ -311,24 +314,23 @@ func manageResults(mCtx *mc.Context, stackResult auto.UpResult, prefix *string, return output.Write(stackResult, mCtx.GetResultsOutputPath(), results) } -// security group for mac machine with ingress rules for ssh and vnc func securityGroups(ctx *pulumi.Context, mCtx *mc.Context, prefix *string, - vpc *ec2.Vpc) (pulumi.StringArray, error) { - // ingress for ssh access from 0.0.0.0 - sshIngressRule := securityGroup.SSH_TCP - sshIngressRule.CidrBlocks = infra.NETWORKING_CIDR_ANY_IPV4 - // Create SG with ingress rules + vpc *ec2.Vpc, public bool) (pulumi.StringArray, error) { + var ingressRules []securityGroup.IngressRules + if public { + sshIngressRule := securityGroup.SSH_TCP + sshIngressRule.CidrBlocks = infra.NETWORKING_CIDR_ANY_IPV4 + ingressRules = []securityGroup.IngressRules{sshIngressRule} + } sg, err := securityGroup.SGRequest{ - Name: resourcesUtil.GetResourceName(*prefix, awsRHELDedicatedID, "sg"), - VPC: vpc, - Description: fmt.Sprintf("sg for %s", awsRHELDedicatedID), - IngressRules: []securityGroup.IngressRules{ - sshIngressRule}, + Name: resourcesUtil.GetResourceName(*prefix, awsRHELDedicatedID, "sg"), + VPC: vpc, + Description: fmt.Sprintf("sg for %s", awsRHELDedicatedID), + IngressRules: ingressRules, }.Create(ctx, mCtx) if err != nil { return nil, err } - // Convert to an array of IDs sgs := util.ArrayConvert([]*ec2.SecurityGroup{sg.SG}, func(sg *ec2.SecurityGroup) pulumi.StringInput { return sg.ID() diff --git a/pkg/provider/aws/action/windows/windows.go b/pkg/provider/aws/action/windows/windows.go index 621d6f5e0..1b72d3811 100644 --- a/pkg/provider/aws/action/windows/windows.go +++ b/pkg/provider/aws/action/windows/windows.go @@ -283,7 +283,7 @@ func (r *windowsServerRequest) deploy(ctx *pulumi.Context) error { ctx.Export(fmt.Sprintf("%s-%s", *r.prefix, outputUserPrivateKey), keyResources.PrivateKey.PrivateKeyPem) // Security groups - securityGroups, err := securityGroups(ctx, r.mCtx, r.prefix, nw.Vpc) + securityGroups, err := securityGroups(ctx, r.mCtx, r.prefix, nw.Vpc, nw.IsPublic) if err != nil { return err } @@ -321,7 +321,7 @@ func (r *windowsServerRequest) deploy(ctx *pulumi.Context) error { LB: nw.LoadBalancer, Eip: nw.Eip, LBTargetGroups: []int{22, 3389}} - if r.allocationData.SpotPrice != nil { + if r.allocationData.SpotPrice != nil && nw.IsPublic { cr.Spot = true cr.SpotPrice = *r.allocationData.SpotPrice } @@ -347,6 +347,9 @@ func (r *windowsServerRequest) deploy(ctx *pulumi.Context) error { return err } } + if !nw.IsPublic { + return nil + } return c.Readiness(ctx, command.CommandPing, *r.prefix, awsWindowsDedicatedID, keyResources.PrivateKey, *r.amiUser, nw.Bastion, c.Dependencies) } @@ -368,26 +371,25 @@ func manageResults(mCtx *mc.Context, stackResult auto.UpResult, prefix *string, return output.Write(stackResult, mCtx.GetResultsOutputPath(), results) } -// security group for mac machine with ingress rules for ssh and vnc func securityGroups(ctx *pulumi.Context, mCtx *mc.Context, prefix *string, - vpc *ec2.Vpc) (pulumi.StringArray, error) { - // ingress for ssh access from 0.0.0.0 - sshIngressRule := securityGroup.SSH_TCP - sshIngressRule.CidrBlocks = infra.NETWORKING_CIDR_ANY_IPV4 - rdpIngressRule := securityGroup.RDP_TCP - rdpIngressRule.CidrBlocks = infra.NETWORKING_CIDR_ANY_IPV4 - // Create SG with ingress rules + vpc *ec2.Vpc, public bool) (pulumi.StringArray, error) { + var ingressRules []securityGroup.IngressRules + if public { + sshIngressRule := securityGroup.SSH_TCP + sshIngressRule.CidrBlocks = infra.NETWORKING_CIDR_ANY_IPV4 + rdpIngressRule := securityGroup.RDP_TCP + rdpIngressRule.CidrBlocks = infra.NETWORKING_CIDR_ANY_IPV4 + ingressRules = []securityGroup.IngressRules{sshIngressRule, rdpIngressRule} + } sg, err := securityGroup.SGRequest{ - Name: resourcesUtil.GetResourceName(*prefix, awsWindowsDedicatedID, "sg"), - VPC: vpc, - Description: fmt.Sprintf("sg for %s", awsWindowsDedicatedID), - IngressRules: []securityGroup.IngressRules{ - sshIngressRule, rdpIngressRule}, + Name: resourcesUtil.GetResourceName(*prefix, awsWindowsDedicatedID, "sg"), + VPC: vpc, + Description: fmt.Sprintf("sg for %s", awsWindowsDedicatedID), + IngressRules: ingressRules, }.Create(ctx, mCtx) if err != nil { return nil, err } - // Convert to an array of IDs sgs := util.ArrayConvert([]*ec2.SecurityGroup{sg.SG}, func(sg *ec2.SecurityGroup) pulumi.StringInput { return sg.ID() diff --git a/pkg/provider/aws/data/azs.go b/pkg/provider/aws/data/azs.go index f5b7c63d2..17ac0a712 100644 --- a/pkg/provider/aws/data/azs.go +++ b/pkg/provider/aws/data/azs.go @@ -44,6 +44,10 @@ type AvailabilityZonesResult struct { func describeAvailabilityZonesAllAsync(ctx context.Context, regionName string, c chan AvailabilityZonesResult) { data, err := describeAvailabilityZonesAll(ctx, regionName) + if err != nil || len(data) == 0 { + // AllAvailabilityZones: true may be SCP-blocked or unsupported; fall back. + data, err = describeAvailabilityZones(ctx, regionName, nil) + } c <- AvailabilityZonesResult{AvailabilityZones: data, Err: err} } @@ -131,10 +135,15 @@ func describeAvailabilityZonesByRegions(ctx context.Context, regions []string) m } for i := 0; i < len(regions); i++ { availabilityZonesResult := <-c - if availabilityZonesResult.Err == nil { - region := availabilityZonesResult.AvailabilityZones[0].RegionName - result[*region] = append(result[*region], availabilityZonesResult.AvailabilityZones...) + if availabilityZonesResult.Err != nil { + logging.Debugf("could not describe AZs: %v", availabilityZonesResult.Err) + continue + } + if len(availabilityZonesResult.AvailabilityZones) == 0 { + continue } + region := availabilityZonesResult.AvailabilityZones[0].RegionName + result[*region] = append(result[*region], availabilityZonesResult.AvailabilityZones...) } close(c) return result diff --git a/pkg/provider/aws/data/network.go b/pkg/provider/aws/data/network.go index 4ec139100..c39302a9b 100644 --- a/pkg/provider/aws/data/network.go +++ b/pkg/provider/aws/data/network.go @@ -91,7 +91,9 @@ func getPublicSubnets(ctx context.Context, client *ec2.Client, vpcID string) (su return } -// GetSubnetAZsForVPC returns the unique AZ names of all subnets that belong to the specified VPC. +// GetSubnetAZsForVPC returns the unique AZ names of all subnets in the specified VPC. +// Both public and private subnets are included; callers that need public-only access +// should use GetPublicSubnetIDInAZ and handle the private-subnet fallback themselves. func GetSubnetAZsForVPC(ctx context.Context, region, vpcID string) ([]string, error) { cfg, err := getConfig(ctx, region) if err != nil { @@ -123,6 +125,29 @@ func GetSubnetAZsForVPC(ctx context.Context, region, vpcID string) ([]string, er return azs, nil } +// GetAnySubnetIDInAZ returns the first available subnet (public or private) in the +// given AZ within the specified VPC. Used as a fallback when no public subnet exists. +func GetAnySubnetIDInAZ(ctx context.Context, region, vpcID, az string) (*string, error) { + cfg, err := getConfig(ctx, region) + if err != nil { + return nil, err + } + client := ec2.NewFromConfig(cfg) + subnetsOutput, err := client.DescribeSubnets(ctx, &ec2.DescribeSubnetsInput{ + Filters: []ec2types.Filter{ + {Name: aws.String(filterVPCID), Values: []string{vpcID}}, + {Name: aws.String(filterAvailabilityZone), Values: []string{az}}, + }, + }) + if err != nil { + return nil, fmt.Errorf("failed to describe subnets in VPC %s AZ %s: %w", vpcID, az, err) + } + if len(subnetsOutput.Subnets) == 0 { + return nil, fmt.Errorf("no subnet found in VPC %s AZ %s", vpcID, az) + } + return subnetsOutput.Subnets[0].SubnetId, nil +} + // GetPublicSubnetIDInAZ returns a public subnet ID in the given AZ within the specified VPC. func GetPublicSubnetIDInAZ(ctx context.Context, region, vpcID, az string) (*string, error) { cfg, err := getConfig(ctx, region) diff --git a/pkg/provider/aws/data/spot.go b/pkg/provider/aws/data/spot.go index c502a8023..148a69a33 100644 --- a/pkg/provider/aws/data/spot.go +++ b/pkg/provider/aws/data/spot.go @@ -462,9 +462,23 @@ type placementScoreResult struct { // skipped. Returns a map of region → AZ scores filtered to those meeting minPlacementScore. func getPlacementScores(args placementScoreArgs, regions []string) (map[string][]placementScoreResult, error) { azsByRegion := describeAvailabilityZonesByRegions(args.ctx, regions) + // Restrict placement score request to regions where AZ IDs can be resolved. + // If DescribeAvailabilityZones failed for a region (SCP restriction, region not + // enabled, etc.) scores returned for that region cannot be matched to an AZ name. + resolvedRegions := make([]string, 0, len(regions)) + for _, r := range regions { + if _, ok := azsByRegion[r]; ok { + resolvedRegions = append(resolvedRegions, r) + } else { + logging.Debugf("excluding region %s from spot search: AZ information unavailable", r) + } + } + if len(resolvedRegions) == 0 { + return nil, fmt.Errorf("no regions with resolvable AZ information available for spot search") + } var lastErr error for _, apiRegion := range args.apiRegions { - result, err := placementScoresViaRegion(apiRegion, args, regions, azsByRegion) + result, err := placementScoresViaRegion(apiRegion, args, resolvedRegions, azsByRegion) if err != nil { logging.Debugf("placement score API unavailable in region %s: %v, trying next", apiRegion, err) lastErr = err diff --git a/pkg/provider/aws/modules/ec2/compute/compute.go b/pkg/provider/aws/modules/ec2/compute/compute.go index af9fbe476..36a40ad78 100644 --- a/pkg/provider/aws/modules/ec2/compute/compute.go +++ b/pkg/provider/aws/modules/ec2/compute/compute.go @@ -86,17 +86,28 @@ type Compute struct { func (r *ComputeRequest) NewCompute(ctx *pulumi.Context) (*Compute, error) { if r.Spot { asg, err := r.spotInstance(ctx) + deps := []pulumi.Resource{asg} + if r.LB != nil { + deps = append(deps, r.LB) + } + if r.Eip != nil { + deps = append(deps, r.Eip) + } return &Compute{ AutoscalingGroup: asg, LB: r.LB, Eip: r.Eip, - Dependencies: []pulumi.Resource{asg, r.LB, r.Eip}}, err + Dependencies: deps}, err } i, err := r.onDemandInstance(ctx) + deps := []pulumi.Resource{i} + if r.Eip != nil { + deps = append(deps, r.Eip) + } return &Compute{ Instance: i, Eip: r.Eip, - Dependencies: []pulumi.Resource{i, r.Eip}}, err + Dependencies: deps}, err } // Create on demand instance @@ -112,7 +123,7 @@ func (r *ComputeRequest) onDemandInstance(ctx *pulumi.Context) (*ec2.Instance, e Ami: pulumi.String(r.AMI.Id), InstanceType: pulumi.String(instanceType), KeyName: r.KeyResources.AWSKeyPair.KeyName, - AssociatePublicIpAddress: pulumi.Bool(true), + AssociatePublicIpAddress: pulumi.Bool(r.Eip != nil && !r.Airgap), VpcSecurityGroupIds: r.SecurityGroups, RootBlockDevice: ec2.InstanceRootBlockDeviceArgs{ VolumeSize: pulumi.Int(volSize), @@ -125,9 +136,6 @@ func (r *ComputeRequest) onDemandInstance(ctx *pulumi.Context) (*ec2.Instance, e if r.UserDataAsBase64 != nil { args.UserDataBase64 = r.UserDataAsBase64 } - if r.Airgap { - args.AssociatePublicIpAddress = pulumi.Bool(false) - } instance, err := ec2.NewInstance(ctx, resourcesUtil.GetResourceName(r.Prefix, r.ID, "instance"), &args, @@ -135,14 +143,16 @@ func (r *ComputeRequest) onDemandInstance(ctx *pulumi.Context) (*ec2.Instance, e if err != nil { return nil, err } - _, err = ec2.NewEipAssociation(ctx, - resourcesUtil.GetResourceName(r.Prefix, r.ID, "instance-eip"), - &ec2.EipAssociationArgs{ - InstanceId: instance.ID(), - AllocationId: r.Eip.ID(), - }) - if err != nil { - return nil, err + if r.Eip != nil { + _, err = ec2.NewEipAssociation(ctx, + resourcesUtil.GetResourceName(r.Prefix, r.ID, "instance-eip"), + &ec2.EipAssociationArgs{ + InstanceId: instance.ID(), + AllocationId: r.Eip.ID(), + }) + if err != nil { + return nil, err + } } return instance, nil } @@ -167,7 +177,7 @@ func (r ComputeRequest) spotInstance(ctx *pulumi.Context) (*autoscaling.Group, e NetworkInterfaces: ec2.LaunchTemplateNetworkInterfaceArray{ &ec2.LaunchTemplateNetworkInterfaceArgs{ SecurityGroups: r.SecurityGroups, - AssociatePublicIpAddress: pulumi.String(strconv.FormatBool(!r.Airgap)), + AssociatePublicIpAddress: pulumi.String(strconv.FormatBool(r.Eip != nil && !r.Airgap)), SubnetId: r.Subnet.ID(), }, }, @@ -213,14 +223,16 @@ func (r ComputeRequest) spotInstance(ctx *pulumi.Context) (*autoscaling.Group, e if err != nil { return nil, err } - // Create target groups + // Create target groups (only when a load balancer is present) var tgGroupsARNs []pulumi.StringOutput - for _, tgPort := range r.LBTargetGroups { - tg, err := r.createForwardTargetGRoups(ctx, tgPort) - if err != nil { - return nil, err + if r.LB != nil { + for _, tgPort := range r.LBTargetGroups { + tg, err := r.createForwardTargetGRoups(ctx, tgPort) + if err != nil { + return nil, err + } + tgGroupsARNs = append(tgGroupsARNs, tg.Arn) } - tgGroupsARNs = append(tgGroupsARNs, tg.Arn) } overrides := autoscaling.GroupMixedInstancesPolicyLaunchTemplateOverrideArray{} for _, instanceType := range r.InstaceTypes { @@ -276,11 +288,18 @@ func (r ComputeRequest) spotInstance(ctx *pulumi.Context) (*autoscaling.Group, e } // function returns the ip to access the target host -func (c *Compute) GetHostDnsName(public bool) (ip pulumi.StringInput) { +func (c *Compute) GetHostDnsName(public bool) pulumi.StringInput { if c.LB != nil { return c.LB.DnsName } - return util.If(public, c.Eip.PublicDns, c.Eip.PrivateDns) + if c.Eip != nil { + return util.If(public, c.Eip.PublicDns, c.Eip.PrivateDns) + } + // Private subnet: no EIP or LB — return the instance's private IP. + if c.Instance != nil { + return c.Instance.PrivateIp + } + return pulumi.String("") } func (c *Compute) GetHostIP(public bool) (ip pulumi.StringOutput) { diff --git a/pkg/provider/aws/modules/network/network.go b/pkg/provider/aws/modules/network/network.go index 455c0db70..2eef02853 100644 --- a/pkg/provider/aws/modules/network/network.go +++ b/pkg/provider/aws/modules/network/network.go @@ -51,6 +51,10 @@ type NetworkResult struct { LoadBalancer *lb.LoadBalancer // If Airgap true on args Bastion *bastion.BastionResult + // IsPublic is false when the selected subnet has no internet gateway route + // (private subnet in an existing VPC). In that case no EIP or LB is created; + // the machine connects outbound only and SSH readiness checks are skipped. + IsPublic bool } func Create(ctx *pulumi.Context, mCtx *mc.Context, args *NetworkArgs) (*NetworkResult, error) { @@ -68,6 +72,7 @@ func Create(ctx *pulumi.Context, mCtx *mc.Context, args *NetworkArgs) (*NetworkR if err != nil { return nil, err } + result.IsPublic = true result.Bastion, err = bastion.Create(ctx, mCtx, &bastion.BastionArgs{ Prefix: args.Prefix, @@ -82,16 +87,24 @@ func Create(ctx *pulumi.Context, mCtx *mc.Context, args *NetworkArgs) (*NetworkR if err != nil { return nil, err } + result.IsPublic = true } - result.Eip, err = ec2.NewEip(ctx, - resourcesUtil.GetResourceName(args.Prefix, args.ID, "lbeip"), - &ec2.EipArgs{ - Tags: mCtx.ResourceTags(), - }) - if err != nil { - return nil, err + // EIP: only for truly public, non-airgap deployments. + // Airgap machines are private (reachable only via bastion); the internal LB + // does not need an EIP. Private-VPC deployments have no public access at all. + if result.IsPublic && !args.Airgap { + result.Eip, err = ec2.NewEip(ctx, + resourcesUtil.GetResourceName(args.Prefix, args.ID, "lbeip"), + &ec2.EipArgs{ + Tags: mCtx.ResourceTags(), + }) + if err != nil { + return nil, err + } } - if args.CreateLoadBalancer { + // LB: created for any public deployment that requests one. + // Public deployments attach the EIP; airgap deployments get an internal LB (no EIP). + if args.CreateLoadBalancer && result.IsPublic { lba := &loadBalancerArgs{ prefix: &args.Prefix, id: &args.ID, @@ -111,8 +124,15 @@ func Create(ctx *pulumi.Context, mCtx *mc.Context, args *NetworkArgs) (*NetworkR func existingVPCNetwork(ctx *pulumi.Context, mCtx *mc.Context, args *NetworkArgs) (*NetworkResult, error) { subnetID, err := data.GetPublicSubnetIDInAZ(ctx.Context(), args.Region, *args.VpcID, args.AZ) + isPublic := true if err != nil { - return nil, err + // No public subnet in this AZ. Fall back to any available subnet so the + // machine can still run as an outbound-only workload (e.g. a GitLab runner). + subnetID, err = data.GetAnySubnetIDInAZ(ctx.Context(), args.Region, *args.VpcID, args.AZ) + if err != nil { + return nil, err + } + isPublic = false } vpc, err := ec2.GetVpc(ctx, resourcesUtil.GetResourceName(args.Prefix, args.ID, "vpc"), @@ -127,8 +147,9 @@ func existingVPCNetwork(ctx *pulumi.Context, mCtx *mc.Context, args *NetworkArgs return nil, err } return &NetworkResult{ - Vpc: vpc, - Subnet: subnet, + Vpc: vpc, + Subnet: subnet, + IsPublic: isPublic, }, nil }