88
99 maptContext "github.com/redhat-developer/mapt/pkg/manager/context"
1010 azureLinux "github.com/redhat-developer/mapt/pkg/provider/azure/action/linux"
11+ cr "github.com/redhat-developer/mapt/pkg/provider/api/compute-request"
1112 "github.com/redhat-developer/mapt/pkg/provider/azure/data"
1213 "github.com/redhat-developer/mapt/pkg/provider/util/command"
1314 apiRHELAI "github.com/redhat-developer/mapt/pkg/target/host/rhelai"
@@ -22,6 +23,13 @@ const (
2223 // $1 subscriptionId $2 rgName $3 galleryName $4 imageName
2324 imageIdRegex = "/subscriptions/%s/resourceGroups/" + imageOwnerResourceGroup + "/providers/Microsoft.Compute/galleries/%s/images/%s/versions/1.0.0"
2425
26+ // Marketplace image coordinates
27+ marketplacePublisher = "RedHat"
28+ marketplaceOffer = "rh-rhel-ai"
29+ marketplacePlanPublisher = "redhat"
30+ // SKU pattern: rh-rhelai-{nvidia|amd}-{N}gpu (gen2 handled by SkuG2Support)
31+ marketplaceSkuRegex = "rh-rhelai-%s-%dgpu"
32+
2533 username = "azureuser"
2634)
2735
@@ -37,6 +45,13 @@ func imageId(accelerator, version string) string {
3745 return imageIdFromName (fmt .Sprintf (imageNameRegex , accelerator , version ))
3846}
3947
// Lookup tables used when building a RHEL AI marketplace image SKU.
var (
	// acceleratorToMarketplace maps an accelerator name to the vendor segment
	// placed in the marketplace SKU. Keys are lowercase; lookups are expected
	// to lowercase the accelerator before indexing.
	acceleratorToMarketplace = map[string]string{
		"cuda": "nvidia",
		"rocm": "amd",
	}

	// validMarketplaceGPUCounts is the set of GPU counts accepted for a
	// marketplace image. A count that is absent yields false on lookup.
	validMarketplaceGPUCounts = map[int32]bool{1: true, 2: true, 4: true, 8: true}
)
54+
4055// isGPUCapableSize returns true for ND-series and NC-series Azure VM sizes,
4156// which are the compute GPU families supported for RHEL AI workloads.
4257// NV-series (visualization GPUs) is intentionally excluded.
@@ -50,10 +65,6 @@ func Create(mCtxArgs *maptContext.ContextArgs, args *apiRHELAI.RHELAIArgs) (err
5065 return fmt .Errorf ("RHEL AI: args and ComputeRequest must not be nil" )
5166 }
5267 logging .Debug ("Creating RHEL AI Server" )
53- sharedImageID := imageId (args .Accelerator , args .Version )
54- if args .CustomImage != "" {
55- sharedImageID = imageIdFromName (args .CustomImage )
56- }
5768 // Shallow-copy to avoid mutating the caller's ComputeRequestArgs.
5869 computeReq := * args .ComputeRequest
5970 // Ensure GPU-capable instance selection for auto-selection paths.
@@ -68,27 +79,66 @@ func Create(mCtxArgs *maptContext.ContextArgs, args *apiRHELAI.RHELAIArgs) (err
6879 return fmt .Errorf ("RHEL AI: %q is not GPU-capable (expected ND-series or NC-series for vllm)" , s )
6980 }
7081 }
82+ imageRef , err := resolveImageSource (args , & computeReq )
83+ if err != nil {
84+ return err
85+ }
7186 azureLinuxRequest :=
7287 & azureLinux.LinuxArgs {
73- Prefix : args .Prefix ,
74- ComputeRequest : & computeReq ,
75- Spot : args .Spot ,
76- ImageRef : & data.ImageReference {
77- SharedImageID : sharedImageID ,
78- // Belt-and-suspenders: set SCSI explicitly so Azure never infers a
79- // conflicting default. resolveImageRef will also derive this from the
80- // gallery image's Features, but the static value protects against API
81- // failures or future images with multiple supported types.
82- DiskControllerType : "SCSI" ,
83- },
88+ Prefix : args .Prefix ,
89+ ComputeRequest : & computeReq ,
90+ Spot : args .Spot ,
91+ ImageRef : imageRef ,
8492 Username : username ,
8593 ReadinessCommand : command .CommandPing }
86- if err = azureLinux .Create (mCtxArgs , azureLinuxRequest ); err != nil && len (computeReq .ComputeSizes ) == 0 {
87- return fmt .Errorf ("RHEL AI: failed to provision a GPU-capable instance (ND/NC-series required for vllm); verify GPU quota in the target location/subscription: %w" , err )
94+ if err = azureLinux .Create (mCtxArgs , azureLinuxRequest ); err != nil {
95+ if args .Marketplace && imageRef .Plan != nil &&
96+ (strings .Contains (err .Error (), "ResourcePurchaseValidationFailed" ) ||
97+ strings .Contains (err .Error (), "MarketplacePurchaseEligibilityFailed" )) {
98+ return fmt .Errorf ("RHEL AI marketplace: terms not accepted; run: az vm image terms accept --publisher %s --offer %s --plan %s\n %w" ,
99+ imageRef .Plan .Publisher , marketplaceOffer , imageRef .Plan .Name , err )
100+ }
101+ if len (computeReq .ComputeSizes ) == 0 {
102+ return fmt .Errorf ("RHEL AI: failed to provision a GPU-capable instance (ND/NC-series required for vllm); verify GPU quota in the target location/subscription: %w" , err )
103+ }
88104 }
89105 return err
90106}
91107
108+ func resolveImageSource (args * apiRHELAI.RHELAIArgs , computeReq * cr.ComputeRequestArgs ) (* data.ImageReference , error ) {
109+ if args .Marketplace {
110+ gpus := computeReq .GPUs
111+ if ! validMarketplaceGPUCounts [gpus ] {
112+ return nil , fmt .Errorf ("RHEL AI marketplace: --gpus must be 1, 2, 4, or 8 (got %d)" , gpus )
113+ }
114+ accName , ok := acceleratorToMarketplace [strings .ToLower (args .Accelerator )]
115+ if ! ok {
116+ return nil , fmt .Errorf ("RHEL AI marketplace: unsupported accelerator %q (expected cuda or rocm)" , args .Accelerator )
117+ }
118+ sku := fmt .Sprintf (marketplaceSkuRegex , accName , gpus )
119+ return & data.ImageReference {
120+ Publisher : marketplacePublisher ,
121+ Offer : marketplaceOffer ,
122+ Sku : sku ,
123+ Plan : & data.MarketplacePlan {
124+ Name : sku ,
125+ Product : marketplaceOffer ,
126+ Publisher : marketplacePlanPublisher ,
127+ },
128+ }, nil
129+ }
130+ if args .CustomImage != "" {
131+ return & data.ImageReference {
132+ SharedImageID : imageIdFromName (args .CustomImage ),
133+ DiskControllerType : "SCSI" ,
134+ }, nil
135+ }
136+ return & data.ImageReference {
137+ SharedImageID : imageId (args .Accelerator , args .Version ),
138+ DiskControllerType : "SCSI" ,
139+ }, nil
140+ }
141+
92142func Destroy (mCtxArgs * maptContext.ContextArgs ) error {
93143 return azureLinux .Destroy (mCtxArgs )
94144}
0 commit comments