Skip to content

Commit e1367b5

Browse files
rishupkclaude
andcommitted
feat(azure/rhel-ai): add list-versions subcommand
- Add ListGalleriesByPrefix to data/images.go — queries Azure Compute Gallery by resource group and name prefix using the SDK - Add ListVersions to the rhel-ai action — enumerates available RHEL AI image versions for a given accelerator type from the owner gallery - Add `mapt azure rhel-ai list-versions --accelerator <cuda|rocm>` cobra subcommand to expose version discovery via CLI - Extract imageOwnerResourceGroup constant from the hardcoded string in imageIdRegex Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> Signed-off-by: Rishabh Kothari <rkothari@redhat.com>
1 parent aaa5c99 commit e1367b5

4 files changed

Lines changed: 200 additions & 3 deletions

File tree

cmd/mapt/cmd/azure/hosts/rhelai.go

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
package hosts
22

33
import (
4+
"fmt"
5+
46
"github.com/redhat-developer/mapt/cmd/mapt/cmd/params"
57
maptContext "github.com/redhat-developer/mapt/pkg/manager/context"
68
rhelai "github.com/redhat-developer/mapt/pkg/provider/azure/action/rhel-ai"
@@ -13,6 +15,9 @@ import (
1315
const (
1416
cmdRHELAI = "rhel-ai"
1517
cmdRHELAIDesc = "manage rhel ai host"
18+
19+
cmdRHELAIListVersions = "list-versions"
20+
cmdRHELAIListVersionsDesc = "list available RHEL AI versions in the Azure Compute Gallery"
1621
)
1722

1823
func GetRHELAICmd() *cobra.Command {
@@ -31,7 +36,7 @@ func GetRHELAICmd() *cobra.Command {
3136
params.AddCommonFlags(flagSet)
3237
c.PersistentFlags().AddFlagSet(flagSet)
3338

34-
c.AddCommand(getRHELAICreate(), getRHELAIDestroy())
39+
c.AddCommand(getRHELAICreate(), getRHELAIDestroy(), getRHELAIListVersions())
3540
return c
3641
}
3742

@@ -104,3 +109,27 @@ func getRHELAIDestroy() *cobra.Command {
104109
c.PersistentFlags().AddFlagSet(flagSet)
105110
return c
106111
}
112+
113+
func getRHELAIListVersions() *cobra.Command {
114+
c := &cobra.Command{
115+
Use: cmdRHELAIListVersions,
116+
Short: cmdRHELAIListVersionsDesc,
117+
RunE: func(cmd *cobra.Command, args []string) error {
118+
if err := viper.BindPFlags(cmd.Flags()); err != nil {
119+
return err
120+
}
121+
versions, err := rhelai.ListVersions(cmd.Context(), viper.GetString(params.RhelAIAccelerator))
122+
if err != nil {
123+
return err
124+
}
125+
for _, v := range versions {
126+
fmt.Println(v)
127+
}
128+
return nil
129+
},
130+
}
131+
flagSet := pflag.NewFlagSet(cmdRHELAIListVersions, pflag.ExitOnError)
132+
flagSet.StringP(params.RhelAIAccelerator, "", params.RhelAIAccelearatorDefault, params.RhelAIAccelearatorDesc)
133+
c.PersistentFlags().AddFlagSet(flagSet)
134+
return c
135+
}

docs/azure/rhelai.md

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
# Overview
2+
3+
mapt offers operations to manage RHEL AI environments on Azure. RHEL AI instances are GPU-enabled machines with pre-installed RHEL AI images from the Azure Compute Gallery, suitable for AI/ML workloads.
4+
5+
## Operations
6+
7+
### List Versions
8+
9+
List available RHEL AI versions for a given accelerator type:
10+
11+
```bash
12+
mapt azure rhel-ai list-versions -h
13+
list-versions
14+
15+
Usage:
16+
mapt azure rhel-ai list-versions [flags]
17+
18+
Flags:
19+
--accelerator string accelerator type. Valid types: cuda and rocm (default "cuda")
20+
-h, --help help for list-versions
21+
22+
Global Flags:
23+
--backed-url string backed for stack state. Can be a local path with format file:///path/subpath or s3 s3://existing-bucket
24+
--project-name string project name to identify the instance of the stack
25+
```
26+
27+
#### Container
28+
29+
```bash
30+
podman run -it --rm \
31+
-e ARM_TENANT_ID=${ati_value} \
32+
-e ARM_SUBSCRIPTION_ID=${asi_value} \
33+
-e ARM_CLIENT_ID=${aci_value} \
34+
-e ARM_CLIENT_SECRET=${acs_value} \
35+
quay.io/redhat-developer/mapt:0.7.0-dev azure \
36+
rhel-ai list-versions \
37+
--accelerator cuda
38+
```
39+
40+
### Create
41+
42+
This will create a RHEL AI instance according to params specified:
43+
44+
```bash
45+
mapt azure rhel-ai create -h
46+
create
47+
48+
Usage:
49+
mapt azure rhel-ai create [flags]
50+
51+
Flags:
52+
--accelerator string accelerator type. Valid types: cuda and rocm (default "cuda")
53+
--conn-details-output string path to export host connection information (host, username and privateKey)
54+
--cpus int32 Number of CPUs for the cloud instance (default 8)
55+
--custom-image string custom image name (overrides version and accelerator)
56+
--disk-size int Disk size in GB
57+
--gpus int32 Number of GPUs
58+
--memory int32 Amount of RAM for the cloud instance in GiB (default 64)
59+
--spot if spot is set the spot prices across all regions will be checked and machine will be started on best spot option (price / eviction)
60+
--spot-eviction-tolerance string if spot is enabled we can define the minimum tolerance level of eviction. Allowed values are: lowest, low, medium, high or highest (default "lowest")
61+
--tags stringToString tags to add on each resource (--tags name1=value1,name2=value2) (default [])
62+
--timeout string set a timeout for the instance (e.g. 4h)
63+
--version string version for the RHELAI OS (default "3.0.0")
64+
-h, --help help for create
65+
66+
Global Flags:
67+
--backed-url string backed for stack state. Can be a local path with format file:///path/subpath or s3 s3://existing-bucket
68+
--project-name string project name to identify the instance of the stack
69+
```
70+
71+
#### Outputs
72+
73+
The command creates a RHEL AI instance and writes the following files to the path set with `--conn-details-output`:
74+
75+
* **host**: host for the instance (load balancer DNS if spot)
76+
* **username**: username to connect to the machine
77+
* **id_rsa**: private key to connect to the machine
78+
79+
It also creates a state folder holding the state of the resources created on Azure. The location of this folder is set with `--backed-url`; its contents, together with the same project name (`--project-name`), are required to destroy the resources.
80+
81+
#### Container
82+
83+
When running the container image, the authentication information must be passed as environment variables. The following sample snippet shows how to create an instance with default values:
84+
85+
```bash
86+
podman run -d --name mapt-rhelai \
87+
-v ${PWD}:/workspace:z \
88+
-e ARM_TENANT_ID=${ati_value} \
89+
-e ARM_SUBSCRIPTION_ID=${asi_value} \
90+
-e ARM_CLIENT_ID=${aci_value} \
91+
-e ARM_CLIENT_SECRET=${acs_value} \
92+
quay.io/redhat-developer/mapt:0.7.0-dev azure \
93+
rhel-ai create \
94+
--project-name mapt-rhelai \
95+
--backed-url file:///workspace \
96+
--conn-details-output /workspace \
97+
--spot
98+
```
99+
100+
### Destroy
101+
102+
```bash
103+
podman run -d --rm \
104+
-v ${PWD}:/workspace:z \
105+
-e ARM_TENANT_ID=${ati_value} \
106+
-e ARM_SUBSCRIPTION_ID=${asi_value} \
107+
-e ARM_CLIENT_ID=${aci_value} \
108+
-e ARM_CLIENT_SECRET=${acs_value} \
109+
quay.io/redhat-developer/mapt:0.7.0-dev azure \
110+
rhel-ai destroy \
111+
--project-name mapt-rhelai \
112+
--backed-url file:///workspace
113+
```

pkg/provider/azure/action/rhel-ai/rhelai.go

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package rhelai
33
import (
44
"context"
55
"fmt"
6+
"sort"
67
"strings"
78

89
maptContext "github.com/redhat-developer/mapt/pkg/manager/context"
@@ -15,10 +16,11 @@ import (
1516

1617
const (
1718
imageOwnerSubscriptionId = "02db6bd4-035c-4074-b699-468f3d914744"
19+
imageOwnerResourceGroup = "aipcc-productization"
1820
// $1 accelerator $2 version
1921
imageNameRegex = "rhel-ai-%s-azure-%s"
20-
// $1 subscriptionId $2 rgName
21-
imageIdRegex = "/subscriptions/%s/resourceGroups/aipcc-productization/providers/Microsoft.Compute/galleries/%s/images/%s/versions/1.0.0"
22+
// $1 subscriptionId $2 galleryName $3 imageName (the resource group is fixed to imageOwnerResourceGroup)
23+
imageIdRegex = "/subscriptions/%s/resourceGroups/" + imageOwnerResourceGroup + "/providers/Microsoft.Compute/galleries/%s/images/%s/versions/1.0.0"
2224

2325
username = "azureuser"
2426
)
@@ -104,3 +106,28 @@ func Create(mCtxArgs *maptContext.ContextArgs, args *apiRHELAI.RHELAIArgs) (err
104106
func Destroy(mCtxArgs *maptContext.ContextArgs) error {
105107
return azureLinux.Destroy(mCtxArgs)
106108
}
109+
110+
// ListVersions returns available RHEL AI version strings for the given accelerator,
111+
// sorted in ascending order. Versions are derived from Azure Compute Gallery names
112+
// in the image owner's subscription (e.g. gallery "rhel_ai_cuda_azure_3.4.0_ea.2"
113+
// yields version "3.4.0-ea.2").
114+
func ListVersions(ctx context.Context, accelerator string) ([]string, error) {
115+
acc := strings.ToLower(strings.TrimSpace(accelerator))
116+
switch acc {
117+
case "cuda", "rocm":
118+
default:
119+
return nil, fmt.Errorf("unsupported accelerator %q (expected: cuda or rocm)", accelerator)
120+
}
121+
prefix := fmt.Sprintf("rhel_ai_%s_azure_", strings.ReplaceAll(acc, "-", "_"))
122+
galleries, err := data.ListGalleriesByPrefix(ctx, imageOwnerSubscriptionId, imageOwnerResourceGroup, prefix)
123+
if err != nil {
124+
return nil, fmt.Errorf("listing RHEL AI versions for accelerator %q: %w", accelerator, err)
125+
}
126+
versions := make([]string, 0, len(galleries))
127+
for _, g := range galleries {
128+
raw := strings.TrimPrefix(g, prefix)
129+
versions = append(versions, strings.ReplaceAll(raw, "_", "-"))
130+
}
131+
sort.Strings(versions)
132+
return versions, nil
133+
}

pkg/provider/azure/data/images.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,34 @@ func GetSharedImageDiskControllerTypes(ctx context.Context, id *string) ([]strin
120120
return nil, nil
121121
}
122122

123+
// ListGalleriesByPrefix returns the names of galleries in resourceGroup (within
124+
// subscriptionID) whose names start with namePrefix.
125+
func ListGalleriesByPrefix(ctx context.Context, subscriptionID, resourceGroup, namePrefix string) ([]string, error) {
126+
ensureAzureEnvs()
127+
cred, err := azidentity.NewDefaultAzureCredential(nil)
128+
if err != nil {
129+
return nil, err
130+
}
131+
c, err := armcompute.NewClientFactory(subscriptionID, cred, nil)
132+
if err != nil {
133+
return nil, err
134+
}
135+
pager := c.NewGalleriesClient().NewListByResourceGroupPager(resourceGroup, nil)
136+
var names []string
137+
for pager.More() {
138+
page, err := pager.NextPage(ctx)
139+
if err != nil {
140+
return nil, err
141+
}
142+
for _, g := range page.Value {
143+
if g.Name != nil && strings.HasPrefix(*g.Name, namePrefix) {
144+
names = append(names, *g.Name)
145+
}
146+
}
147+
}
148+
return names, nil
149+
}
150+
123151
func SkuG2Support(ctx context.Context, location string, publisher string, offer string, sku string) (string, error) {
124152
ensureAzureEnvs()
125153
cred, err := azidentity.NewDefaultAzureCredential(nil)

0 commit comments

Comments
 (0)