Skip to content

Commit 41fd57d

Browse files
rishupkclaude
authored andcommitted
feat(aws/rhel-ai): add list-versions subcommand
- Add ListAMIs() data helper returning all matching AMIs (vs GetAMI's newest-only) - Add ListVersions() action querying us-east-1 AMIs by accelerator with validation - Wire list-versions cobra subcommand with --accelerator flag - Add docs/aws/rhelai.md documenting create, destroy, and list-versions Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> Signed-off-by: Rishabh Kothari <rkothari@redhat.com>
1 parent ee2954c commit 41fd57d

4 files changed

Lines changed: 242 additions & 1 deletion

File tree

cmd/mapt/cmd/aws/hosts/rhelai.go

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
package hosts
22

33
import (
4+
"fmt"
5+
46
awsParams "github.com/redhat-developer/mapt/cmd/mapt/cmd/aws/params"
57
"github.com/redhat-developer/mapt/cmd/mapt/cmd/params"
68
maptContext "github.com/redhat-developer/mapt/pkg/manager/context"
@@ -14,6 +16,9 @@ import (
1416
// Command names and descriptions for the rhel-ai command group.
const (
1517
// cmdRHELAI is the name string for the rhel-ai command.
cmdRHELAI = "rhel-ai"
1618
// cmdRHELAIDesc is the description string that accompanies cmdRHELAI.
cmdRHELAIDesc = "manage rhel ai host"
19+
20+
// cmdRHELAIListVersions is the Use name of the list-versions subcommand.
cmdRHELAIListVersions = "list-versions"
21+
// cmdRHELAIListVersionsDesc is the Short description of the list-versions subcommand.
cmdRHELAIListVersionsDesc = "list available RHEL AI versions"
1722
)
1823

1924
func GetRHELAICmd() *cobra.Command {
@@ -32,7 +37,7 @@ func GetRHELAICmd() *cobra.Command {
3237
params.AddCommonFlags(flagSet)
3338
c.PersistentFlags().AddFlagSet(flagSet)
3439

35-
c.AddCommand(getRHELAICreate(), getRHELAIDestroy())
40+
c.AddCommand(getRHELAICreate(), getRHELAIDestroy(), getRHELAIListVersions())
3641
return c
3742
}
3843

@@ -107,3 +112,27 @@ func getRHELAIDestroy() *cobra.Command {
107112
c.PersistentFlags().AddFlagSet(flagSet)
108113
return c
109114
}
115+
116+
func getRHELAIListVersions() *cobra.Command {
117+
c := &cobra.Command{
118+
Use: cmdRHELAIListVersions,
119+
Short: cmdRHELAIListVersionsDesc,
120+
RunE: func(cmd *cobra.Command, args []string) error {
121+
if err := viper.BindPFlags(cmd.Flags()); err != nil {
122+
return err
123+
}
124+
versions, err := rhelai.ListVersions(cmd.Context(), viper.GetString(params.RhelAIAccelerator))
125+
if err != nil {
126+
return err
127+
}
128+
for _, v := range versions {
129+
fmt.Println(v)
130+
}
131+
return nil
132+
},
133+
}
134+
flagSet := pflag.NewFlagSet(cmdRHELAIListVersions, pflag.ExitOnError)
135+
flagSet.StringP(params.RhelAIAccelerator, "", params.RhelAIAccelearatorDefault, params.RhelAIAccelearatorDesc)
136+
c.PersistentFlags().AddFlagSet(flagSet)
137+
return c
138+
}

docs/aws/rhelai.md

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
# Overview
2+
3+
mapt offers operations to manage RHEL AI environments on AWS. RHEL AI instances are GPU-enabled machines with pre-installed RHEL AI images, suitable for AI/ML workloads.
4+
5+
## Operations
6+
7+
### List Versions
8+
9+
List available RHEL AI versions for a given accelerator type:
10+
11+
```bash
12+
mapt aws rhel-ai list-versions -h
13+
list-versions
14+
15+
Usage:
16+
mapt aws rhel-ai list-versions [flags]
17+
18+
Flags:
19+
--accelerator string accelerator type. Valid types: cuda and rocm (default "cuda")
20+
-h, --help help for list-versions
21+
22+
Global Flags:
23+
--backed-url string backed for stack state. Can be a local path with format file:///path/subpath or s3 s3://existing-bucket
24+
--project-name string project name to identify the instance of the stack
25+
```
26+
27+
#### Container
28+
29+
```bash
30+
podman run -it --rm \
31+
-e AWS_ACCESS_KEY_ID=XXX \
32+
-e AWS_SECRET_ACCESS_KEY=XXX \
33+
-e AWS_DEFAULT_REGION=us-east-1 \
34+
quay.io/redhat-developer/mapt:0.7.0-dev aws \
35+
rhel-ai list-versions \
36+
--accelerator cuda
37+
```
38+
39+
### Create
40+
41+
Creates a RHEL AI instance according to the specified parameters:
42+
43+
```bash
44+
mapt aws rhel-ai create -h
45+
create
46+
47+
Usage:
48+
mapt aws rhel-ai create [flags]
49+
50+
Flags:
51+
--accelerator string accelerator type. Valid types: cuda and rocm (default "cuda")
52+
--conn-details-output string path to export host connection information (host, username and privateKey)
53+
--cpus int32 Number of CPUs for the cloud instance (default 8)
54+
--custom-image string custom AMI name (overrides version and accelerator)
55+
--disk-size int Disk size in GB (default 2000)
56+
--gpus int32 Number of GPUs
57+
--memory int32 Amount of RAM for the cloud instance in GiB (default 64)
58+
--spot if spot is set the spot prices across all regions will be checked and machine will be started on best spot option (price / eviction)
59+
--spot-eviction-tolerance string if spot is enabled we can define the minimum tolerance level of eviction. Allowed values are: lowest, low, medium, high or highest (default "lowest")
60+
--tags stringToString tags to add on each resource (--tags name1=value1,name2=value2) (default [])
61+
--timeout string set a timeout for the instance (e.g. 4h)
62+
--version string version for the RHELAI OS (default "3.0.0")
63+
-h, --help help for create
64+
65+
Global Flags:
66+
--backed-url string backed for stack state. Can be a local path with format file:///path/subpath or s3 s3://existing-bucket
67+
--project-name string project name to identify the instance of the stack
68+
```
69+
70+
#### Outputs
71+
72+
The command creates a RHEL AI instance and writes several files to the path defined by `--conn-details-output`:
73+
74+
* **host**: host for the instance (load balancer DNS if spot)
75+
* **username**: username to connect to the machine
76+
* **id_rsa**: private key to connect to the machine
77+
78+
It also creates a state folder holding the state of the resources created on AWS. The path for this folder is defined by `--backed-url`. The contents of that folder, together with the same project name (`--project-name`), are required in order to destroy the resources.
79+
80+
#### Container
81+
82+
When running the container image, the authentication information must be passed as environment variables. The following sample snippet creates an instance with default values:
83+
84+
```bash
85+
podman run -d --name mapt-rhelai \
86+
-v ${PWD}:/workspace:z \
87+
-e AWS_ACCESS_KEY_ID=XXX \
88+
-e AWS_SECRET_ACCESS_KEY=XXX \
89+
-e AWS_DEFAULT_REGION=us-east-1 \
90+
quay.io/redhat-developer/mapt:0.7.0-dev aws \
91+
rhel-ai create \
92+
--project-name mapt-rhelai \
93+
--backed-url file:///workspace \
94+
--conn-details-output /workspace \
95+
--spot
96+
```
97+
98+
### Destroy
99+
100+
```bash
101+
podman run -d --rm \
102+
-v ${PWD}:/workspace:z \
103+
-e AWS_ACCESS_KEY_ID=XXX \
104+
-e AWS_SECRET_ACCESS_KEY=XXX \
105+
-e AWS_DEFAULT_REGION=us-east-1 \
106+
quay.io/redhat-developer/mapt:0.7.0-dev aws \
107+
rhel-ai destroy \
108+
--project-name mapt-rhelai \
109+
--backed-url file:///workspace
110+
```

pkg/provider/aws/action/rhel-ai/rhelai.go

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ package rhelai
33
import (
44
"context"
55
"fmt"
6+
"sort"
7+
"strings"
68

79
"github.com/go-playground/validator/v10"
810
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/ec2"
@@ -119,6 +121,57 @@ func Destroy(mCtxArgs *mc.ContextArgs) error {
119121
return aws.CleanupState(mCtx)
120122
}
121123

124+
const listVersionsRegion = "us-east-1"
125+
126+
// ListVersions returns available RHEL AI version strings for the given accelerator,
127+
// sorted in ascending order. Versions are derived from AMI names in a reference
128+
// region (us-east-1) matching the pattern "rhel-ai-{accelerator}-aws-{version}*".
129+
func ListVersions(ctx context.Context, accelerator string) ([]string, error) {
130+
acc := strings.ToLower(strings.TrimSpace(accelerator))
131+
switch acc {
132+
case "cuda", "rocm":
133+
default:
134+
return nil, fmt.Errorf("unsupported accelerator %q (expected: cuda or rocm)", accelerator)
135+
}
136+
nameFilter := fmt.Sprintf("rhel-ai-%s-aws-*", acc)
137+
region := listVersionsRegion
138+
images, err := data.ListAMIs(ctx, data.ImageRequest{
139+
Name: &nameFilter,
140+
Arch: &amiArch,
141+
Owner: &amiOwner,
142+
Region: &region,
143+
})
144+
if err != nil {
145+
return nil, fmt.Errorf("listing RHEL AI AMIs for accelerator %q: %w", acc, err)
146+
}
147+
prefix := fmt.Sprintf("rhel-ai-%s-aws-", acc)
148+
seen := make(map[string]struct{})
149+
for _, img := range images {
150+
if img.Name == nil {
151+
continue
152+
}
153+
raw := strings.TrimPrefix(*img.Name, prefix)
154+
// AMI names may have trailing qualifiers (e.g. "-x86_64-..."); take only the version part
155+
if idx := strings.Index(raw, "-x86_64"); idx > 0 {
156+
raw = raw[:idx]
157+
}
158+
if idx := strings.Index(raw, "-arm64"); idx > 0 {
159+
raw = raw[:idx]
160+
}
161+
if len(raw) > 0 {
162+
// Normalize underscores to dashes (e.g. "3.4.0_ea.2" → "3.4.0-ea.2")
163+
version := strings.ReplaceAll(raw, "_", "-")
164+
seen[version] = struct{}{}
165+
}
166+
}
167+
versions := make([]string, 0, len(seen))
168+
for v := range seen {
169+
versions = append(versions, v)
170+
}
171+
sort.Strings(versions)
172+
return versions, nil
173+
}
174+
122175
func (r *rhelAIRequest) createMachine() error {
123176
cs := manager.Stack{
124177
StackName: r.mCtx.StackNameByProject(stackName),

pkg/provider/aws/data/ami.go

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,55 @@ func GetAMI(ctx context.Context, r ImageRequest) (*ImageInfo, error) {
103103
nil
104104
}
105105

106+
// ListAMIs returns all AMIs matching the request filters.
107+
// Unlike GetAMI which returns only the newest, this returns the full set.
108+
func ListAMIs(ctx context.Context, r ImageRequest) ([]ec2Types.Image, error) {
109+
var cfgOpts config.LoadOptionsFunc
110+
if r.Region != nil && len(*r.Region) > 0 {
111+
cfgOpts = config.WithRegion(*r.Region)
112+
}
113+
cfg, err := config.LoadDefaultConfig(ctx, cfgOpts)
114+
if err != nil {
115+
return nil, err
116+
}
117+
client := ec2.NewFromConfig(cfg)
118+
var filterName = "name"
119+
filters := []ec2Types.Filter{
120+
{
121+
Name: &filterName,
122+
Values: []string{*r.Name},
123+
},
124+
}
125+
if r.Arch != nil && len(*r.Arch) > 0 {
126+
filter := "architecture"
127+
filters = append(filters, ec2Types.Filter{
128+
Name: &filter,
129+
Values: []string{*r.Arch},
130+
})
131+
}
132+
input := &ec2.DescribeImagesInput{
133+
Filters: filters,
134+
}
135+
if r.Owner != nil && len(*r.Owner) > 0 {
136+
input.Owners = []string{*r.Owner}
137+
aId, err := accountId(ctx)
138+
if err != nil {
139+
return nil, err
140+
}
141+
if *aId != *r.Owner {
142+
input.ExecutableUsers = []string{"self"}
143+
}
144+
}
145+
result, err := client.DescribeImages(ctx, input)
146+
if err != nil {
147+
return nil, err
148+
}
149+
if result == nil {
150+
return nil, nil
151+
}
152+
return result.Images, nil
153+
}
154+
106155
// IsAMIOffered checks if an ami based on its Name is offered on a specific region
107156
func IsAMIOffered(ctx context.Context, r ImageRequest) (bool, *ImageInfo, error) {
108157
ami, err := GetAMI(ctx, r)

0 commit comments

Comments
 (0)