Skip to content

Commit 79f6ca3

Browse files
ppitonak and claude committed
feat(azure/rhel-ai): add Azure Marketplace image support
Add --marketplace flag to deploy RHEL AI from the Azure Marketplace instead of the shared gallery. The SKU is constructed from accelerator type (cuda→nvidia, rocm→amd) and GPU count (1/2/4/8), with gen2 handling delegated to SkuG2Support. Includes a Plan block on the VM resource for marketplace purchase plan acceptance and a helpful error message when marketplace terms have not been accepted. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> Signed-off-by: Pavol Pitonak <ppitonak@redhat.com>
1 parent 42c139c commit 79f6ca3

8 files changed

Lines changed: 118 additions & 21 deletions

File tree

cmd/mapt/cmd/azure/hosts/rhelai.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ func getRHELAICreate() *cobra.Command {
6363
Version: viper.GetString(params.RhelAIVersion),
6464
Accelerator: viper.GetString(params.RhelAIAccelerator),
6565
CustomImage: viper.GetString(params.RhelAICustomImage),
66+
Marketplace: viper.GetBool(params.RhelAIMarketplace),
6667
ComputeRequest: params.ComputeRequestArgs(),
6768
Spot: params.SpotArgs(),
6869
Timeout: viper.GetString(params.Timeout),
@@ -75,6 +76,7 @@ func getRHELAICreate() *cobra.Command {
7576
flagSet.StringP(params.RhelAIVersion, "", params.RhelAIVersionDefault, params.RhelAIVersionDesc)
7677
flagSet.StringP(params.RhelAIAccelerator, "", params.RhelAIAccelearatorDefault, params.RhelAIAccelearatorDesc)
7778
flagSet.StringP(params.RhelAICustomImage, "", "", params.RhelAICustomImageDesc)
79+
flagSet.Bool(params.RhelAIMarketplace, false, params.RhelAIMarketplaceDesc)
7880
flagSet.StringP(params.Timeout, "", "", params.TimeoutDesc)
7981
params.AddComputeRequestFlags(flagSet)
8082
params.AddSpotFlags(flagSet)

cmd/mapt/cmd/params/params.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,8 @@ const (
119119
RhelAIAccelearatorDefault string = "cuda"
120120
RhelAICustomImage string = "custom-image"
121121
RhelAICustomImageDesc string = "custom image name to spin RHEL AI OS (AMI name for AWS, image name for Azure)"
122+
RhelAIMarketplace string = "marketplace"
123+
RhelAIMarketplaceDesc string = "use the cloud provider's marketplace RHEL AI image instead of a shared gallery or custom image"
122124

123125
// Serverless
124126
Timeout string = "timeout"

pkg/provider/azure/action/rhel-ai/rhelai.go

Lines changed: 67 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88

99
maptContext "github.com/redhat-developer/mapt/pkg/manager/context"
1010
azureLinux "github.com/redhat-developer/mapt/pkg/provider/azure/action/linux"
11+
cr "github.com/redhat-developer/mapt/pkg/provider/api/compute-request"
1112
"github.com/redhat-developer/mapt/pkg/provider/azure/data"
1213
"github.com/redhat-developer/mapt/pkg/provider/util/command"
1314
apiRHELAI "github.com/redhat-developer/mapt/pkg/target/host/rhelai"
@@ -22,6 +23,13 @@ const (
2223
// $1 subscriptionId $2 rgName $3 galleryName $4 imageName
2324
imageIdRegex = "/subscriptions/%s/resourceGroups/" + imageOwnerResourceGroup + "/providers/Microsoft.Compute/galleries/%s/images/%s/versions/1.0.0"
2425

26+
// Marketplace image coordinates
27+
marketplacePublisher = "RedHat"
28+
marketplaceOffer = "rh-rhel-ai"
29+
marketplacePlanPublisher = "redhat"
30+
// SKU pattern: rh-rhelai-{nvidia|amd}-{N}gpu (gen2 handled by SkuG2Support)
31+
marketplaceSkuRegex = "rh-rhelai-%s-%dgpu"
32+
2533
username = "azureuser"
2634
)
2735

@@ -37,6 +45,13 @@ func imageId(accelerator, version string) string {
3745
return imageIdFromName(fmt.Sprintf(imageNameRegex, accelerator, version))
3846
}
3947

48+
// acceleratorToMarketplace maps an accelerator name (lower-cased by the
// caller before lookup) to the vendor segment used in the marketplace SKU.
var acceleratorToMarketplace = map[string]string{
49+
"cuda": "nvidia",
50+
"rocm": "amd",
51+
}
52+
53+
// validMarketplaceGPUCounts is the set of GPU counts accepted when building a
// marketplace SKU; resolveImageSource rejects any count not present here.
var validMarketplaceGPUCounts = map[int32]bool{1: true, 2: true, 4: true, 8: true}
54+
4055
// isGPUCapableSize returns true for ND-series and NC-series Azure VM sizes,
4156
// which are the compute GPU families supported for RHEL AI workloads.
4257
// NV-series (visualization GPUs) is intentionally excluded.
@@ -50,10 +65,6 @@ func Create(mCtxArgs *maptContext.ContextArgs, args *apiRHELAI.RHELAIArgs) (err
5065
return fmt.Errorf("RHEL AI: args and ComputeRequest must not be nil")
5166
}
5267
logging.Debug("Creating RHEL AI Server")
53-
sharedImageID := imageId(args.Accelerator, args.Version)
54-
if args.CustomImage != "" {
55-
sharedImageID = imageIdFromName(args.CustomImage)
56-
}
5768
// Shallow-copy to avoid mutating the caller's ComputeRequestArgs.
5869
computeReq := *args.ComputeRequest
5970
// Ensure GPU-capable instance selection for auto-selection paths.
@@ -68,27 +79,66 @@ func Create(mCtxArgs *maptContext.ContextArgs, args *apiRHELAI.RHELAIArgs) (err
6879
return fmt.Errorf("RHEL AI: %q is not GPU-capable (expected ND-series or NC-series for vllm)", s)
6980
}
7081
}
82+
imageRef, err := resolveImageSource(args, &computeReq)
83+
if err != nil {
84+
return err
85+
}
7186
azureLinuxRequest :=
7287
&azureLinux.LinuxArgs{
73-
Prefix: args.Prefix,
74-
ComputeRequest: &computeReq,
75-
Spot: args.Spot,
76-
ImageRef: &data.ImageReference{
77-
SharedImageID: sharedImageID,
78-
// Belt-and-suspenders: set SCSI explicitly so Azure never infers a
79-
// conflicting default. resolveImageRef will also derive this from the
80-
// gallery image's Features, but the static value protects against API
81-
// failures or future images with multiple supported types.
82-
DiskControllerType: "SCSI",
83-
},
88+
Prefix: args.Prefix,
89+
ComputeRequest: &computeReq,
90+
Spot: args.Spot,
91+
ImageRef: imageRef,
8492
Username: username,
8593
ReadinessCommand: command.CommandPing}
86-
if err = azureLinux.Create(mCtxArgs, azureLinuxRequest); err != nil && len(computeReq.ComputeSizes) == 0 {
87-
return fmt.Errorf("RHEL AI: failed to provision a GPU-capable instance (ND/NC-series required for vllm); verify GPU quota in the target location/subscription: %w", err)
94+
if err = azureLinux.Create(mCtxArgs, azureLinuxRequest); err != nil {
95+
if args.Marketplace && imageRef.Plan != nil &&
96+
(strings.Contains(err.Error(), "ResourcePurchaseValidationFailed") ||
97+
strings.Contains(err.Error(), "MarketplacePurchaseEligibilityFailed")) {
98+
return fmt.Errorf("RHEL AI marketplace: terms not accepted; run: az vm image terms accept --publisher %s --offer %s --plan %s\n%w",
99+
imageRef.Plan.Publisher, marketplaceOffer, imageRef.Plan.Name, err)
100+
}
101+
if len(computeReq.ComputeSizes) == 0 {
102+
return fmt.Errorf("RHEL AI: failed to provision a GPU-capable instance (ND/NC-series required for vllm); verify GPU quota in the target location/subscription: %w", err)
103+
}
88104
}
89105
return err
90106
}
91107

108+
func resolveImageSource(args *apiRHELAI.RHELAIArgs, computeReq *cr.ComputeRequestArgs) (*data.ImageReference, error) {
109+
if args.Marketplace {
110+
gpus := computeReq.GPUs
111+
if !validMarketplaceGPUCounts[gpus] {
112+
return nil, fmt.Errorf("RHEL AI marketplace: --gpus must be 1, 2, 4, or 8 (got %d)", gpus)
113+
}
114+
accName, ok := acceleratorToMarketplace[strings.ToLower(args.Accelerator)]
115+
if !ok {
116+
return nil, fmt.Errorf("RHEL AI marketplace: unsupported accelerator %q (expected cuda or rocm)", args.Accelerator)
117+
}
118+
sku := fmt.Sprintf(marketplaceSkuRegex, accName, gpus)
119+
return &data.ImageReference{
120+
Publisher: marketplacePublisher,
121+
Offer: marketplaceOffer,
122+
Sku: sku,
123+
Plan: &data.MarketplacePlan{
124+
Name: sku,
125+
Product: marketplaceOffer,
126+
Publisher: marketplacePlanPublisher,
127+
},
128+
}, nil
129+
}
130+
if args.CustomImage != "" {
131+
return &data.ImageReference{
132+
SharedImageID: imageIdFromName(args.CustomImage),
133+
DiskControllerType: "SCSI",
134+
}, nil
135+
}
136+
return &data.ImageReference{
137+
SharedImageID: imageId(args.Accelerator, args.Version),
138+
DiskControllerType: "SCSI",
139+
}, nil
140+
}
141+
92142
// Destroy delegates to azureLinux.Destroy, passing mCtxArgs through unchanged
// and returning whatever error that call reports.
func Destroy(mCtxArgs *maptContext.ContextArgs) error {
93143
return azureLinux.Destroy(mCtxArgs)
94144
}

pkg/provider/azure/data/imageref.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,12 @@ const fedoraImageGalleryBase = "/CommunityGalleries/Fedora-5e266ba4-2250-406d-ad
2121

2222
// /subscriptions/02db6bd4-035c-4074-b699-468f3d914744/resourceGroups/RHEL-AI-CUDA-AZURE-3.0.0/providers/Microsoft.Compute/galleries/rhel_ai_cuda_azure_3.0.0/images/rhel-ai-cuda-azure-3.0.0/versions/1.0.0
2323

24+
// MarketplacePlan carries the purchase plan details (name, product and
// publisher) for an image that requires one, such as a marketplace image.
type MarketplacePlan struct {
25+
Name string
26+
Product string
27+
Publisher string
28+
}
29+
2430
type ImageReference struct {
2531
// Market Place
2632
Publisher string
@@ -33,6 +39,8 @@ type ImageReference struct {
3339
// Required disk controller type for this image (e.g. "SCSI", "NVMe").
3440
// Empty means no specific requirement; Azure uses the VM size default.
3541
DiskControllerType string
42+
// Non-nil when the image requires a purchase plan (e.g. marketplace images).
43+
Plan *MarketplacePlan
3644
}
3745

3846
var (

pkg/provider/azure/modules/virtual-machine/virtual-machine.go

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ type VirtualMachine = *compute.VirtualMachine
4747
// Create virtual machine based on request + export to context
4848
// adminusername and adminuserpassword
4949
func Create(ctx *pulumi.Context, mCtx *mc.Context, args *VirtualMachineArgs) (VirtualMachine, error) {
50-
ira, err := convertImageRef(mCtx, *args.Image, args.Location)
50+
ira, err := convertImageRef(mCtx, args.Image, args.Location)
5151
if err != nil {
5252
return nil, err
5353
}
@@ -99,6 +99,13 @@ func Create(ctx *pulumi.Context, mCtx *mc.Context, args *VirtualMachineArgs) (Vi
9999
MaxPrice: pulumi.Float64(*args.SpotPrice),
100100
}
101101
}
102+
if args.Image.Plan != nil {
103+
vmArgs.Plan = compute.PlanArgs{
104+
Name: pulumi.String(args.Image.Plan.Name),
105+
Product: pulumi.String(args.Image.Plan.Product),
106+
Publisher: pulumi.String(args.Image.Plan.Publisher),
107+
}
108+
}
102109
logging.Debug("About to create the VM with compute.NewVirtualMachine")
103110
return compute.NewVirtualMachine(ctx,
104111
resourcesUtil.GetResourceName(args.Prefix, args.ComponentID, "vm"),
@@ -130,7 +137,7 @@ func osProfile(computerName string, args *VirtualMachineArgs) compute.OSProfileA
130137
return osProfile
131138
}
132139

133-
func convertImageRef(mCtx *mc.Context, i data.ImageReference, location string) (*compute.ImageReferenceArgs, error) {
140+
func convertImageRef(mCtx *mc.Context, i *data.ImageReference, location string) (*compute.ImageReferenceArgs, error) {
134141
if len(i.CommunityImageID) > 0 {
135142
return &compute.ImageReferenceArgs{
136143
CommunityGalleryImageId: pulumi.String(i.CommunityImageID),
@@ -151,6 +158,9 @@ func convertImageRef(mCtx *mc.Context, i data.ImageReference, location string) (
151158
if err != nil {
152159
return nil, err
153160
}
161+
if i.Plan != nil && finalSku != i.Sku {
162+
i.Plan.Name = finalSku
163+
}
154164
return &compute.ImageReferenceArgs{
155165
Publisher: pulumi.String(i.Publisher),
156166
Offer: pulumi.String(i.Offer),

pkg/target/host/rhelai/api.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ type RHELAIArgs struct {
1010
Accelerator string
1111
Version string
1212
CustomImage string
13+
Marketplace bool
1314
Arch string
1415
ComputeRequest *cr.ComputeRequestArgs
1516
Spot *spotTypes.SpotArgs

tkn/infra-azure-rhel-ai.yaml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,9 @@ spec:
110110
- name: version
111111
description: Version of RHEL AI OS (default 3.2.0)
112112
default: "3.2.0"
113+
- name: marketplace
114+
description: Use the cloud provider's marketplace RHEL AI image instead of a shared gallery or custom image
115+
default: "false"
113116

114117
# Metadata params
115118
- name: tags
@@ -226,7 +229,10 @@ spec:
226229
if [[ "$(params.compute-sizes)" != "" ]]; then
227230
cmd+="--compute-sizes '$(params.compute-sizes)' "
228231
fi
229-
if [[ "$(params.custom-image)" != "" ]]; then
232+
if [[ "$(params.marketplace)" == "true" ]]; then
233+
cmd+="--marketplace "
234+
cmd+="--accelerator '$(params.accelerator)' "
235+
elif [[ "$(params.custom-image)" != "" ]]; then
230236
cmd+="--custom-image '$(params.custom-image)' "
231237
else
232238
cmd+="--accelerator '$(params.accelerator)' "

tkn/template/infra-azure-rhel-ai.yaml

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,12 @@ spec:
8585
- name: disk-size
8686
description: Disk size in GB for the cloud instance
8787
default: "200"
88+
- name: gpus
89+
description: Number of GPUs for the cloud instance (valid marketplace values are 1, 2, 4, 8)
90+
default: "8"
91+
- name: gpu-manufacturer
92+
description: GPU manufacturer name for instance filtering (e.g. NVIDIA, AMD)
93+
default: ""
8894
- name: compute-sizes
8995
description: Comma seperated list of sizes for the machines to be requested. If set this takes precedence over compute by args
9096
default: "Standard_ND96is_MI300X_v5,Standard_ND96isr_MI300X_v5"
@@ -110,6 +116,9 @@ spec:
110116
- name: version
111117
description: Version of RHEL AI OS (default 3.2.0)
112118
default: "3.2.0"
119+
- name: marketplace
120+
description: Use the cloud provider's marketplace RHEL AI image instead of a shared gallery or custom image
121+
default: "false"
113122

114123
# Metadata params
115124
- name: tags
@@ -226,7 +235,16 @@ spec:
226235
if [[ "$(params.compute-sizes)" != "" ]]; then
227236
cmd+="--compute-sizes '$(params.compute-sizes)' "
228237
fi
229-
if [[ "$(params.custom-image)" != "" ]]; then
238+
if [[ "$(params.gpus)" != "" ]]; then
239+
cmd+="--gpus '$(params.gpus)' "
240+
fi
241+
if [[ "$(params.gpu-manufacturer)" != "" ]]; then
242+
cmd+="--gpu-manufacturer '$(params.gpu-manufacturer)' "
243+
fi
244+
if [[ "$(params.marketplace)" == "true" ]]; then
245+
cmd+="--marketplace "
246+
cmd+="--accelerator '$(params.accelerator)' "
247+
elif [[ "$(params.custom-image)" != "" ]]; then
230248
cmd+="--custom-image '$(params.custom-image)' "
231249
else
232250
cmd+="--accelerator '$(params.accelerator)' "

0 commit comments

Comments
 (0)