Skip to content

Commit a1d6416

Browse files
authored
Merge branch 'main' into ibmcloud-gh-runners
2 parents 7c2ee65 + d4ef9b7 commit a1d6416

2,316 files changed

Lines changed: 98636 additions & 38256 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/build-go.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@ jobs:
2222
rm_cmd: "rmz"
2323

2424
- name: Checkout code
25-
uses: actions/checkout@v6
25+
26+
uses: actions/checkout@v7
2627

2728
- name: Set up Go
2829
uses: actions/setup-go@v6

.github/workflows/build-img-ghrunner-test.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ jobs:
2121
rm_cmd: "rmz"
2222

2323
- name: Checkout code
24-
uses: actions/checkout@v6
24+
uses: actions/checkout@v7
2525

2626
- name: Build image for PR
2727
if: ${{ github.event_name == 'pull_request' }}

.github/workflows/build-oci.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ jobs:
4040
remove_haskell: true
4141
rm_cmd: "rmz"
4242
- name: Checkout code
43-
uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
43+
uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
4444
- name: Install Podman (only for arm64)
4545
if: contains(matrix.os, 'arm')
4646
run: |
@@ -121,7 +121,7 @@ jobs:
121121
runs-on: ubuntu-24.04
122122
steps:
123123
- name: Checkout code
124-
uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
124+
uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
125125
- name: Set version from tag
126126
if: startsWith(github.ref, 'refs/tags/')
127127
run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV

.github/workflows/build-on-hosted-runner.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ jobs:
2323
needs: hosted_runner_provision
2424
steps:
2525
- name: Code checkout
26-
uses: actions/checkout@v6
26+
uses: actions/checkout@v7
2727
- name: Set up Go
2828
uses: actions/setup-go@v6
2929
with:

.github/workflows/tkn-bundle.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ jobs:
1414
runs-on: ubuntu-24.04
1515
steps:
1616
- name: Checkout code
17-
uses: actions/checkout@v6
17+
uses: actions/checkout@v7
1818

1919
- name: Create k8s Kind Cluster
2020
uses: helm/kind-action@v1
@@ -33,7 +33,7 @@ jobs:
3333
runs-on: ubuntu-24.04
3434
steps:
3535
- name: Checkout code
36-
uses: actions/checkout@v6
36+
uses: actions/checkout@v7
3737

3838
- name: Log in to quay.io Registry
3939
uses: redhat-actions/podman-login@v1

Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,9 @@ TKN_IMG ?= quay.io/redhat-developer/mapt:v${VERSION}-tkn
99
# renovate: datasource=github-releases depName=cirruslabs/cirrus-cli
1010
CIRRUS_CLI ?= v1.0.0
1111
# renovate: datasource=github-releases depName=actions/runner
12-
GITHUB_RUNNER ?= 2.334.0
12+
GITHUB_RUNNER ?= 2.335.1
1313
# renovate: datasource=gitlab-releases depName=gitlab-org/gitlab-runner
14-
GITLAB_RUNNER ?= 19.0.1
14+
GITLAB_RUNNER ?= 19.1.0
1515
# renovate: datasource=github-releases depName=open-telemetry/opentelemetry-collector-releases
1616
OTELCOL_VERSION ?= 0.151.0
1717

cmd/mapt/cmd/aws/hosts/rhelai.go

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
package hosts
22

33
import (
4+
"fmt"
5+
46
awsParams "github.com/redhat-developer/mapt/cmd/mapt/cmd/aws/params"
57
"github.com/redhat-developer/mapt/cmd/mapt/cmd/params"
68
maptContext "github.com/redhat-developer/mapt/pkg/manager/context"
@@ -14,6 +16,9 @@ import (
1416
const (
1517
cmdRHELAI = "rhel-ai"
1618
cmdRHELAIDesc = "manage rhel ai host"
19+
20+
cmdRHELAIListVersions = "list-versions"
21+
cmdRHELAIListVersionsDesc = "list available RHEL AI versions"
1722
)
1823

1924
func GetRHELAICmd() *cobra.Command {
@@ -32,7 +37,7 @@ func GetRHELAICmd() *cobra.Command {
3237
params.AddCommonFlags(flagSet)
3338
c.PersistentFlags().AddFlagSet(flagSet)
3439

35-
c.AddCommand(getRHELAICreate(), getRHELAIDestroy())
40+
c.AddCommand(getRHELAICreate(), getRHELAIDestroy(), getRHELAIListVersions())
3641
return c
3742
}
3843

@@ -107,3 +112,27 @@ func getRHELAIDestroy() *cobra.Command {
107112
c.PersistentFlags().AddFlagSet(flagSet)
108113
return c
109114
}
115+
116+
func getRHELAIListVersions() *cobra.Command {
117+
c := &cobra.Command{
118+
Use: cmdRHELAIListVersions,
119+
Short: cmdRHELAIListVersionsDesc,
120+
RunE: func(cmd *cobra.Command, args []string) error {
121+
if err := viper.BindPFlags(cmd.Flags()); err != nil {
122+
return err
123+
}
124+
versions, err := rhelai.ListVersions(cmd.Context(), viper.GetString(params.RhelAIAccelerator))
125+
if err != nil {
126+
return err
127+
}
128+
for _, v := range versions {
129+
fmt.Println(v)
130+
}
131+
return nil
132+
},
133+
}
134+
flagSet := pflag.NewFlagSet(cmdRHELAIListVersions, pflag.ExitOnError)
135+
flagSet.StringP(params.RhelAIAccelerator, "", params.RhelAIAccelearatorDefault, params.RhelAIAccelearatorDesc)
136+
c.PersistentFlags().AddFlagSet(flagSet)
137+
return c
138+
}

cmd/mapt/cmd/azure/hosts/rhelai.go

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
package hosts
22

33
import (
4+
"fmt"
5+
46
"github.com/redhat-developer/mapt/cmd/mapt/cmd/params"
57
maptContext "github.com/redhat-developer/mapt/pkg/manager/context"
68
rhelai "github.com/redhat-developer/mapt/pkg/provider/azure/action/rhel-ai"
@@ -13,6 +15,9 @@ import (
1315
const (
1416
cmdRHELAI = "rhel-ai"
1517
cmdRHELAIDesc = "manage rhel ai host"
18+
19+
cmdRHELAIListVersions = "list-versions"
20+
cmdRHELAIListVersionsDesc = "list available RHEL AI versions in the Azure Compute Gallery"
1621
)
1722

1823
func GetRHELAICmd() *cobra.Command {
@@ -31,7 +36,7 @@ func GetRHELAICmd() *cobra.Command {
3136
params.AddCommonFlags(flagSet)
3237
c.PersistentFlags().AddFlagSet(flagSet)
3338

34-
c.AddCommand(getRHELAICreate(), getRHELAIDestroy())
39+
c.AddCommand(getRHELAICreate(), getRHELAIDestroy(), getRHELAIListVersions())
3540
return c
3641
}
3742

@@ -58,6 +63,7 @@ func getRHELAICreate() *cobra.Command {
5863
Version: viper.GetString(params.RhelAIVersion),
5964
Accelerator: viper.GetString(params.RhelAIAccelerator),
6065
CustomImage: viper.GetString(params.RhelAICustomImage),
66+
Marketplace: viper.GetBool(params.RhelAIMarketplace),
6167
ComputeRequest: params.ComputeRequestArgs(),
6268
Spot: params.SpotArgs(),
6369
Timeout: viper.GetString(params.Timeout),
@@ -70,6 +76,7 @@ func getRHELAICreate() *cobra.Command {
7076
flagSet.StringP(params.RhelAIVersion, "", params.RhelAIVersionDefault, params.RhelAIVersionDesc)
7177
flagSet.StringP(params.RhelAIAccelerator, "", params.RhelAIAccelearatorDefault, params.RhelAIAccelearatorDesc)
7278
flagSet.StringP(params.RhelAICustomImage, "", "", params.RhelAICustomImageDesc)
79+
flagSet.Bool(params.RhelAIMarketplace, false, params.RhelAIMarketplaceDesc)
7380
flagSet.StringP(params.Timeout, "", "", params.TimeoutDesc)
7481
params.AddComputeRequestFlags(flagSet)
7582
params.AddSpotFlags(flagSet)
@@ -104,3 +111,27 @@ func getRHELAIDestroy() *cobra.Command {
104111
c.PersistentFlags().AddFlagSet(flagSet)
105112
return c
106113
}
114+
115+
func getRHELAIListVersions() *cobra.Command {
116+
c := &cobra.Command{
117+
Use: cmdRHELAIListVersions,
118+
Short: cmdRHELAIListVersionsDesc,
119+
RunE: func(cmd *cobra.Command, args []string) error {
120+
if err := viper.BindPFlags(cmd.Flags()); err != nil {
121+
return err
122+
}
123+
versions, err := rhelai.ListVersions(cmd.Context(), viper.GetString(params.RhelAIAccelerator))
124+
if err != nil {
125+
return err
126+
}
127+
for _, v := range versions {
128+
fmt.Println(v)
129+
}
130+
return nil
131+
},
132+
}
133+
flagSet := pflag.NewFlagSet(cmdRHELAIListVersions, pflag.ExitOnError)
134+
flagSet.StringP(params.RhelAIAccelerator, "", params.RhelAIAccelearatorDefault, params.RhelAIAccelearatorDesc)
135+
c.PersistentFlags().AddFlagSet(flagSet)
136+
return c
137+
}

cmd/mapt/cmd/params/params.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,8 @@ const (
128128
RhelAIAccelearatorDefault string = "cuda"
129129
RhelAICustomImage string = "custom-image"
130130
RhelAICustomImageDesc string = "custom image name to spin RHEL AI OS (AMI name for AWS, image name for Azure)"
131+
RhelAIMarketplace string = "marketplace"
132+
RhelAIMarketplaceDesc string = "use the cloud provider's marketplace RHEL AI image instead of a shared gallery or custom image"
131133

132134
// Serverless
133135
Timeout string = "timeout"

docs/aws/rhelai.md

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
# Overview
2+
3+
mapt offers operations to manage RHEL AI environments on AWS. RHEL AI instances are GPU-enabled machines with pre-installed RHEL AI images, suitable for AI/ML workloads.
4+
5+
## Operations
6+
7+
### List Versions
8+
9+
List available RHEL AI versions for a given accelerator type:
10+
11+
```bash
12+
mapt aws rhel-ai list-versions -h
13+
list-versions
14+
15+
Usage:
16+
mapt aws rhel-ai list-versions [flags]
17+
18+
Flags:
19+
--accelerator string accelerator type. Valid types: cuda and rocm (default "cuda")
20+
-h, --help help for list-versions
21+
22+
Global Flags:
23+
--backed-url string backed for stack state. Can be a local path with format file:///path/subpath or s3 s3://existing-bucket
24+
--project-name string project name to identify the instance of the stack
25+
```
26+
27+
#### Container
28+
29+
```bash
30+
podman run -it --rm \
31+
-e AWS_ACCESS_KEY_ID=XXX \
32+
-e AWS_SECRET_ACCESS_KEY=XXX \
33+
-e AWS_DEFAULT_REGION=us-east-1 \
34+
quay.io/redhat-developer/mapt:0.7.0-dev aws \
35+
rhel-ai list-versions \
36+
--accelerator cuda
37+
```
38+
39+
### Create
40+
41+
This will create a RHEL AI instance according to params specified:
42+
43+
```bash
44+
mapt aws rhel-ai create -h
45+
create
46+
47+
Usage:
48+
mapt aws rhel-ai create [flags]
49+
50+
Flags:
51+
--accelerator string accelerator type. Valid types: cuda and rocm (default "cuda")
52+
--conn-details-output string path to export host connection information (host, username and privateKey)
53+
--cpus int32 Number of CPUs for the cloud instance (default 8)
54+
--custom-image string custom AMI name (overrides version and accelerator)
55+
--disk-size int Disk size in GB (default 2000)
56+
--gpus int32 Number of GPUs
57+
--memory int32 Amount of RAM for the cloud instance in GiB (default 64)
58+
--spot if spot is set the spot prices across all regions will be checked and machine will be started on best spot option (price / eviction)
59+
--spot-eviction-tolerance string if spot is enabled we can define the minimum tolerance level of eviction. Allowed values are: lowest, low, medium, high or highest (default "lowest")
60+
--tags stringToString tags to add on each resource (--tags name1=value1,name2=value2) (default [])
61+
--timeout string set a timeout for the instance (e.g. 4h)
62+
--version string version for the RHELAI OS (default "3.0.0")
63+
-h, --help help for create
64+
65+
Global Flags:
66+
--backed-url string backed for stack state. Can be a local path with format file:///path/subpath or s3 s3://existing-bucket
67+
--project-name string project name to identify the instance of the stack
68+
```
69+
70+
#### Outputs
71+
72+
This creates a RHEL AI instance and writes several files to the path defined by `--conn-details-output`:
73+
74+
* **host**: host for the instance (load balancer DNS if spot)
75+
* **username**: username to connect to the machine
76+
* **id_rsa**: private key to connect to the machine
77+
78+
It also creates a state folder holding the state of the resources created on AWS. The path for this folder is defined by `--backed-url`. The contents of that folder, together with the same project name (`--project-name`), are required in order to destroy the resources.
79+
80+
#### Container
81+
82+
When running the container image, the authentication information must be passed as environment variables. The following sample snippet shows how to create an instance with default values:
83+
84+
```bash
85+
podman run -d --name mapt-rhelai \
86+
-v ${PWD}:/workspace:z \
87+
-e AWS_ACCESS_KEY_ID=XXX \
88+
-e AWS_SECRET_ACCESS_KEY=XXX \
89+
-e AWS_DEFAULT_REGION=us-east-1 \
90+
quay.io/redhat-developer/mapt:0.7.0-dev aws \
91+
rhel-ai create \
92+
--project-name mapt-rhelai \
93+
--backed-url file:///workspace \
94+
--conn-details-output /workspace \
95+
--spot
96+
```
97+
98+
### Destroy
99+
100+
```bash
101+
podman run -d --rm \
102+
-v ${PWD}:/workspace:z \
103+
-e AWS_ACCESS_KEY_ID=XXX \
104+
-e AWS_SECRET_ACCESS_KEY=XXX \
105+
-e AWS_DEFAULT_REGION=us-east-1 \
106+
quay.io/redhat-developer/mapt:0.7.0-dev aws \
107+
rhel-ai destroy \
108+
--project-name mapt-rhelai \
109+
--backed-url file:///workspace
110+
```

0 commit comments

Comments
 (0)