Skip to content

Commit 92d011a

Browse files
adrianriobo and claude committed
feat(aws): add --vpc-id flag to deploy into existing VPCs
When --vpc-id is set, mapt reuses an existing VPC instead of creating a new one. The flag is available on all standard AWS hosts (rhel, windows, fedora, rhel-ai) and exposed as a param in all Tekton tasks.

Key behaviours:
- Spot AZ search is restricted to AZs that have subnets in the given VPC
- On-demand AZ selection is restricted to the VPC's subnet AZs
- Airgap is rejected when --vpc-id is set (mutually exclusive)
- existingVPCNetwork() resolves a public subnet in the chosen AZ and reads the VPC/subnet as pulumi read-only resources (ec2.GetVpc / ec2.GetSubnet), so no new networking infrastructure is created

Also fix spot AZ resolution: GetSpotPlacementScores can return AZ IDs for zones not visible to the account via the default DescribeAvailabilityZones call. describeAvailabilityZonesByRegions now uses AllAvailabilityZones: true so all AZ IDs can be resolved to names during spot search, preventing spurious "skipping AZ: az id not found" drops that led to no spot option being found in accounts with SCP-restricted regions.

Closes #849

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent e504607 commit 92d011a

24 files changed

Lines changed: 301 additions & 61 deletions

cmd/mapt/cmd/aws/hosts/fedora.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,8 @@ func getFedoraCreate() *cobra.Command {
6868
Spot: params.SpotArgs(),
6969
Timeout: viper.GetString(params.Timeout),
7070
Airgap: viper.IsSet(airgap),
71-
ServiceEndpoints: params.NetworkServiceEndpoints()})
71+
ServiceEndpoints: params.NetworkServiceEndpoints(),
72+
VpcID: params.NetworkVpcID()})
7273
},
7374
}
7475
flagSet := pflag.NewFlagSet(params.CreateCmdName, pflag.ExitOnError)

cmd/mapt/cmd/aws/hosts/rhel.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ func getRHELCreate() *cobra.Command {
6868
Timeout: viper.GetString(params.Timeout),
6969
Airgap: viper.IsSet(airgap),
7070
ServiceEndpoints: params.NetworkServiceEndpoints(),
71+
VpcID: params.NetworkVpcID(),
7172
})
7273
},
7374
}

cmd/mapt/cmd/aws/hosts/rhelai.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ func getRHELAICreate() *cobra.Command {
7474
AutoStart: viper.IsSet(params.RhelAIAutoStart),
7575
VLLMExtraArgs: viper.GetString(params.RhelAIVLLMExtraArgs),
7676
ExposePorts: viper.GetIntSlice(params.RhelAIExposePorts),
77+
VpcID: params.NetworkVpcID(),
7778
})
7879
},
7980
}

cmd/mapt/cmd/aws/hosts/windows.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ func getWindowsCreate() *cobra.Command {
8181
Airgap: viper.IsSet(airgap),
8282
Timeout: viper.GetString(params.Timeout),
8383
ServiceEndpoints: params.NetworkServiceEndpoints(),
84+
VpcID: params.NetworkVpcID(),
8485
})
8586
},
8687
}

cmd/mapt/cmd/params/params.go

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -205,7 +205,9 @@ const (
205205
KindExtraPortMappingsDesc = "Additional port mappings for the Kind cluster. Value should be a JSON array of objects with containerPort, hostPort, and protocol properties. Example: '[{\"containerPort\": 8080, \"hostPort\": 8080, \"protocol\": \"TCP\"}]'"
206206

207207
// Network
208-
ServiceEndpoints = "service-endpoints"
208+
ServiceEndpoints = "service-endpoints"
209+
VpcID = "vpc-id"
210+
VpcIDDesc = "ID of an existing VPC to deploy the instance into. When set, airgap is not supported and spot search is restricted to AZs with subnets in that VPC."
209211

210212
// Spot
211213
spot = "spot"
@@ -222,12 +224,21 @@ const (
222224

223225
func AddNetworkFlags(fs *pflag.FlagSet, desc string) {
224226
fs.StringSliceP(ServiceEndpoints, "", []string{}, desc)
227+
fs.StringP(VpcID, "", "", VpcIDDesc)
225228
}
226229

227230
func NetworkServiceEndpoints() []string {
228231
return viper.GetStringSlice(ServiceEndpoints)
229232
}
230233

234+
func NetworkVpcID() *string {
235+
if viper.IsSet(VpcID) {
236+
v := viper.GetString(VpcID)
237+
return &v
238+
}
239+
return nil
240+
}
241+
231242
func AddSpotFlags(fs *pflag.FlagSet) {
232243
fs.Bool(spot, false, spotDesc)
233244
fs.StringP(spotTolerance, "", spotToleranceDefault, spotToleranceDesc)

pkg/provider/aws/action/fedora/fedora.go

Lines changed: 24 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -33,28 +33,30 @@ import (
3333
)
3434

3535
type FedoraArgs struct {
36-
Prefix string
37-
Version string
38-
Arch string
39-
ComputeRequest *cr.ComputeRequestArgs
40-
Spot *spotTypes.SpotArgs
41-
Airgap bool
36+
Prefix string
37+
Version string
38+
Arch string
39+
ComputeRequest *cr.ComputeRequestArgs
40+
Spot *spotTypes.SpotArgs
41+
Airgap bool
4242
ServiceEndpoints []string
43+
VpcID *string
4344
// If timeout is set a severless scheduled task will be created to self destroy the resources
4445
Timeout string
4546
}
4647

4748
type fedoraRequest struct {
48-
mCtx *mc.Context
49-
prefix *string
50-
version *string
51-
arch *string
52-
spot bool
53-
timeout *string
49+
mCtx *mc.Context
50+
prefix *string
51+
version *string
52+
arch *string
53+
spot bool
54+
timeout *string
5455
serviceEndpoints []string
55-
allocationData *allocation.AllocationResult
56-
airgap *bool
57-
diskSize *int
56+
vpcID *string
57+
allocationData *allocation.AllocationResult
58+
airgap *bool
59+
diskSize *int
5860
// internal management
5961
// For airgap scenario there is an orchestation of
6062
// a phase with connectivity on the machine (allowing bootstraping)
@@ -81,6 +83,9 @@ func Create(mCtxArgs *mc.ContextArgs, args *FedoraArgs) (err error) {
8183
return err
8284
}
8385
// Compose request
86+
if args.VpcID != nil && args.Airgap {
87+
return fmt.Errorf("--vpc-id and --airgap are mutually exclusive")
88+
}
8489
prefix := util.If(len(args.Prefix) > 0, args.Prefix, "main")
8590
r := fedoraRequest{
8691
mCtx: mCtx,
@@ -89,6 +94,7 @@ func Create(mCtxArgs *mc.ContextArgs, args *FedoraArgs) (err error) {
8994
arch: &args.Arch,
9095
timeout: &args.Timeout,
9196
serviceEndpoints: args.ServiceEndpoints,
97+
vpcID: args.VpcID,
9298
airgap: &args.Airgap,
9399
diskSize: args.ComputeRequest.DiskSize}
94100
if args.Spot != nil {
@@ -100,6 +106,7 @@ func Create(mCtxArgs *mc.ContextArgs, args *FedoraArgs) (err error) {
100106
ComputeRequest: args.ComputeRequest,
101107
AMIProductDescription: &amiProduct,
102108
Spot: args.Spot,
109+
VpcID: args.VpcID,
103110
})
104111
if err != nil {
105112
return err
@@ -201,7 +208,8 @@ func (r *fedoraRequest) deploy(ctx *pulumi.Context) error {
201208
CreateLoadBalancer: r.spot,
202209
Airgap: *r.airgap,
203210
AirgapPhaseConnectivity: r.airgapPhaseConnectivity,
204-
ServiceEndpoints: r.serviceEndpoints,
211+
ServiceEndpoints: r.serviceEndpoints,
212+
VpcID: r.vpcID,
205213
})
206214
if err != nil {
207215
return err

pkg/provider/aws/action/rhel-ai/rhelai.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ type rhelAIRequest struct {
4040
spot bool
4141
timeout *string
4242
serviceEndpoints []string
43+
vpcID *string
4344
allocationData *allocation.AllocationResult
4445
diskSize *int
4546
model *string
@@ -81,6 +82,7 @@ func Create(mCtxArgs *mc.ContextArgs, args *apiRHELAI.RHELAIArgs) (err error) {
8182
arch: &args.Arch,
8283
timeout: &args.Timeout,
8384
serviceEndpoints: args.ServiceEndpoints,
85+
vpcID: args.VpcID,
8486
diskSize: args.ComputeRequest.DiskSize,
8587
model: &args.Model,
8688
hfToken: &args.HFToken,
@@ -97,6 +99,7 @@ func Create(mCtxArgs *mc.ContextArgs, args *apiRHELAI.RHELAIArgs) (err error) {
9799
ComputeRequest: args.ComputeRequest,
98100
AMIProductDescription: &amiProduct,
99101
Spot: args.Spot,
102+
VpcID: args.VpcID,
100103
})
101104
if err != nil {
102105
return err
@@ -228,6 +231,7 @@ func (r *rhelAIRequest) deploy(ctx *pulumi.Context) error {
228231
AZ: *r.allocationData.AZ,
229232
CreateLoadBalancer: r.allocationData.SpotPrice != nil,
230233
ServiceEndpoints: r.serviceEndpoints,
234+
VpcID: r.vpcID,
231235
})
232236
if err != nil {
233237
return err

pkg/provider/aws/action/rhel/rhel.go

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ type RHELArgs struct {
4242
Spot *spotTypes.SpotArgs
4343
Airgap bool
4444
ServiceEndpoints []string
45+
VpcID *string
4546
// If timeout is set a severless scheduled task will be created to self destroy the resources
4647
Timeout string
4748
}
@@ -57,6 +58,7 @@ type rhelRequest struct {
5758
profileSNC *bool
5859
timeout *string
5960
serviceEndpoints []string
61+
vpcID *string
6062
allocationData *allocation.AllocationResult
6163
airgap *bool
6264
diskSize *int
@@ -86,6 +88,9 @@ func Create(mCtxArgs *mc.ContextArgs, args *RHELArgs) (err error) {
8688
return err
8789
}
8890
// Compose request
91+
if args.VpcID != nil && args.Airgap {
92+
return fmt.Errorf("--vpc-id and --airgap are mutually exclusive")
93+
}
8994
prefix := util.If(len(args.Prefix) > 0, args.Prefix, "main")
9095
r := rhelRequest{
9196
mCtx: mCtx,
@@ -97,6 +102,7 @@ func Create(mCtxArgs *mc.ContextArgs, args *RHELArgs) (err error) {
97102
subsUserpass: &args.SubsUserpass,
98103
profileSNC: &args.ProfileSNC,
99104
serviceEndpoints: args.ServiceEndpoints,
105+
vpcID: args.VpcID,
100106
airgap: &args.Airgap,
101107
diskSize: args.ComputeRequest.DiskSize}
102108
if args.Spot != nil {
@@ -108,6 +114,7 @@ func Create(mCtxArgs *mc.ContextArgs, args *RHELArgs) (err error) {
108114
ComputeRequest: args.ComputeRequest,
109115
AMIProductDescription: &amiProduct,
110116
Spot: args.Spot,
117+
VpcID: args.VpcID,
111118
})
112119
if err != nil {
113120
return err
@@ -206,7 +213,8 @@ func (r *rhelRequest) deploy(ctx *pulumi.Context) error {
206213
CreateLoadBalancer: r.allocationData.SpotPrice != nil,
207214
Airgap: *r.airgap,
208215
AirgapPhaseConnectivity: r.airgapPhaseConnectivity,
209-
ServiceEndpoints: r.serviceEndpoints,
216+
ServiceEndpoints: r.serviceEndpoints,
217+
VpcID: r.vpcID,
210218
})
211219
if err != nil {
212220
return err

pkg/provider/aws/action/windows/windows.go

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -46,10 +46,11 @@ type WindowsServerArgs struct {
4646
AMILang string
4747
AMIKeepCopy bool
4848
// Machine params
49-
ComputeRequest *cr.ComputeRequestArgs
50-
Spot *spotTypes.SpotArgs
51-
Airgap bool
49+
ComputeRequest *cr.ComputeRequestArgs
50+
Spot *spotTypes.SpotArgs
51+
Airgap bool
5252
ServiceEndpoints []string
53+
VpcID *string
5354
// If timeout is set a severless scheduled task will be created to self destroy the resources
5455
Timeout string
5556
}
@@ -64,12 +65,13 @@ type windowsServerRequest struct {
6465
amiLang *string
6566
amiKeepCopy *bool
6667

67-
spot bool
68-
timeout *string
68+
spot bool
69+
timeout *string
6970
serviceEndpoints []string
70-
allocationData *allocation.AllocationResult
71-
airgap *bool
72-
diskSize *int
71+
vpcID *string
72+
allocationData *allocation.AllocationResult
73+
airgap *bool
74+
diskSize *int
7375
// internal management
7476
// For airgap scenario there is an orchestation of
7577
// a phase with connectivity on the machine (allowing bootstraping)
@@ -104,6 +106,9 @@ func Create(mCtxArgs *mc.ContextArgs, args *WindowsServerArgs) (err error) {
104106
args.AMIName = amiNonEngNameDefault
105107
}
106108
// Compose request
109+
if args.VpcID != nil && args.Airgap {
110+
return fmt.Errorf("--vpc-id and --airgap are mutually exclusive")
111+
}
107112
prefix := util.If(len(args.Prefix) > 0, args.Prefix, "main")
108113
r := windowsServerRequest{
109114
mCtx: mCtx,
@@ -115,6 +120,7 @@ func Create(mCtxArgs *mc.ContextArgs, args *WindowsServerArgs) (err error) {
115120
amiLang: &args.AMILang,
116121
timeout: &args.Timeout,
117122
serviceEndpoints: args.ServiceEndpoints,
123+
vpcID: args.VpcID,
118124
airgap: &args.Airgap,
119125
}
120126
if args.ComputeRequest != nil {
@@ -129,6 +135,7 @@ func Create(mCtxArgs *mc.ContextArgs, args *WindowsServerArgs) (err error) {
129135
ComputeRequest: args.ComputeRequest,
130136
AMIProductDescription: &amiProduct,
131137
Spot: args.Spot,
138+
VpcID: args.VpcID,
132139
})
133140
if err != nil {
134141
return err
@@ -259,7 +266,8 @@ func (r *windowsServerRequest) deploy(ctx *pulumi.Context) error {
259266
CreateLoadBalancer: r.spot,
260267
Airgap: *r.airgap,
261268
AirgapPhaseConnectivity: r.airgapPhaseConnectivity,
262-
ServiceEndpoints: r.serviceEndpoints,
269+
ServiceEndpoints: r.serviceEndpoints,
270+
VpcID: r.vpcID,
263271
})
264272
if err != nil {
265273
return err

pkg/provider/aws/data/azs.go

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -42,19 +42,28 @@ type AvailabilityZonesResult struct {
4242
Err error
4343
}
4444

45-
func describeAvailabilityZonesAsync(ctx context.Context, regionName string, c chan AvailabilityZonesResult) {
46-
data, err := DescribeAvailabilityZones(ctx, regionName)
47-
c <- AvailabilityZonesResult{
48-
AvailabilityZones: data,
49-
Err: err}
50-
45+
func describeAvailabilityZonesAllAsync(ctx context.Context, regionName string, c chan AvailabilityZonesResult) {
46+
data, err := describeAvailabilityZonesAll(ctx, regionName)
47+
c <- AvailabilityZonesResult{AvailabilityZones: data, Err: err}
5148
}
5249

5350
func DescribeAvailabilityZones(ctx context.Context, regionName string) ([]ec2Types.AvailabilityZone, error) {
5451
return describeAvailabilityZones(ctx, regionName, nil)
5552
}
5653

5754
func describeAvailabilityZones(ctx context.Context, regionName string, excludedZoneIDs []string) ([]ec2Types.AvailabilityZone, error) {
55+
return describeAvailabilityZonesOpts(ctx, regionName, excludedZoneIDs, false)
56+
}
57+
58+
// describeAvailabilityZonesAll is like describeAvailabilityZones but includes AZs not
59+
// normally visible to the account (AllAvailabilityZones: true). Used only for AZ ID→name
60+
// resolution during spot placement score lookups, where the scores API can return AZ IDs
61+
// for zones not yet opted-in to by the account.
62+
func describeAvailabilityZonesAll(ctx context.Context, regionName string) ([]ec2Types.AvailabilityZone, error) {
63+
return describeAvailabilityZonesOpts(ctx, regionName, nil, true)
64+
}
65+
66+
func describeAvailabilityZonesOpts(ctx context.Context, regionName string, excludedZoneIDs []string, allZones bool) ([]ec2Types.AvailabilityZone, error) {
5867
var cfgOpts config.LoadOptionsFunc
5968
if len(regionName) > 0 {
6069
cfgOpts = config.WithRegion(regionName)
@@ -64,9 +73,8 @@ func describeAvailabilityZones(ctx context.Context, regionName string, excludedZ
6473
return nil, err
6574
}
6675
client := ec2.NewFromConfig(cfg)
67-
// TODO check what happen when true and region name
6876
input := ec2.DescribeAvailabilityZonesInput{
69-
// AllAvailabilityZones: aws.Bool(true),
77+
AllAvailabilityZones: aws.Bool(allZones),
7078
}
7179
input.Filters = []ec2Types.Filter{
7280
{
@@ -112,12 +120,14 @@ func getZoneName(azID string, azDescriptions []ec2Types.AvailabilityZone) (strin
112120
// user 1 Name: us-west-1a ID: us-west-11, Name: us-west-1b ID: us-west-12
113121
// user 2 Name: us-west-1a ID: us-west-12, Name: us-west-1b ID: us-west-11
114122
// This allowsa a better distribution among users
123+
// describeAvailabilityZonesByRegions fetches all AZs (including non-opted-in ones) so
124+
// that AZ IDs returned by GetSpotPlacementScores can always be resolved to names.
115125
func describeAvailabilityZonesByRegions(ctx context.Context, regions []string) map[string][]ec2Types.AvailabilityZone {
116126
result := make(map[string][]ec2Types.AvailabilityZone)
117127
c := make(chan AvailabilityZonesResult)
118128
for _, region := range regions {
119129
lRegion := region
120-
go describeAvailabilityZonesAsync(ctx, lRegion, c)
130+
go describeAvailabilityZonesAllAsync(ctx, lRegion, c)
121131
}
122132
for i := 0; i < len(regions); i++ {
123133
availabilityZonesResult := <-c

0 commit comments

Comments
 (0)