Skip to content

Commit 289ad16

Browse files
committed
AIPCC:15489: Add support for Gaudi accelerator in mapt's IBM Cloud module
1 parent 7c5d50d commit 289ad16

5 files changed

Lines changed: 802 additions & 0 deletions

File tree

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
package hosts
2+
3+
import (
4+
"github.com/redhat-developer/mapt/cmd/mapt/cmd/params"
5+
maptContext "github.com/redhat-developer/mapt/pkg/manager/context"
6+
ibmgaudi "github.com/redhat-developer/mapt/pkg/provider/ibmcloud/action/ibm-gaudi"
7+
"github.com/spf13/cobra"
8+
"github.com/spf13/pflag"
9+
"github.com/spf13/viper"
10+
)
11+
12+
const (
13+
cmdIBMGaudi = "ibm-gaudi"
14+
cmdIBMGaudiDesc = "manage ibm gaudi3 accelerated instances (amd64)"
15+
)
16+
17+
func IBMGaudiCmd() *cobra.Command {
18+
c := &cobra.Command{
19+
Use: cmdIBMGaudi,
20+
Short: cmdIBMGaudiDesc,
21+
RunE: func(cmd *cobra.Command, args []string) error {
22+
if err := viper.BindPFlags(cmd.Flags()); err != nil {
23+
return err
24+
}
25+
return nil
26+
},
27+
}
28+
29+
flagSet := pflag.NewFlagSet(cmdIBMGaudi, pflag.ExitOnError)
30+
params.AddCommonFlags(flagSet)
31+
c.PersistentFlags().AddFlagSet(flagSet)
32+
33+
c.AddCommand(ibmGaudiCreate(), ibmGaudiDestroy())
34+
return c
35+
}
36+
37+
func ibmGaudiCreate() *cobra.Command {
38+
c := &cobra.Command{
39+
Use: params.CreateCmdName,
40+
Short: params.CreateCmdName,
41+
RunE: func(cmd *cobra.Command, args []string) error {
42+
if err := viper.BindPFlags(cmd.Flags()); err != nil {
43+
return err
44+
}
45+
return ibmgaudi.New(
46+
&maptContext.ContextArgs{
47+
Context: cmd.Context(),
48+
ProjectName: viper.GetString(params.ProjectName),
49+
BackedURL: viper.GetString(params.BackedURL),
50+
ResultsOutput: viper.GetString(params.ConnectionDetailsOutput),
51+
Debug: viper.IsSet(params.Debug),
52+
DebugLevel: viper.GetUint(params.DebugLevel),
53+
Tags: viper.GetStringMapString(params.Tags),
54+
},
55+
&ibmgaudi.GaudiArgs{
56+
SubnetID: viper.GetString(params.SubnetID),
57+
OtelAppCode: viper.GetString(params.OtelAppCode),
58+
OtelAuthToken: viper.GetString(params.OtelAuthToken),
59+
OtelEndpoint: viper.GetString(params.OtelEndpoint),
60+
OtelIndex: viper.GetString(params.OtelIndex),
61+
OtelExtraAttrs: viper.GetStringMapString(params.OtelExtraAttrs),
62+
})
63+
},
64+
}
65+
flagSet := pflag.NewFlagSet(params.CreateCmdName, pflag.ExitOnError)
66+
flagSet.StringP(params.ConnectionDetailsOutput, "", "", params.ConnectionDetailsOutputDesc)
67+
flagSet.StringToStringP(params.Tags, "", nil, params.TagsDesc)
68+
flagSet.StringP(params.SubnetID, "", "", params.SubnetIDDesc)
69+
flagSet.StringP(params.OtelAppCode, "", "", params.OtelAppCodeDesc)
70+
flagSet.StringP(params.OtelAuthToken, "", "", params.OtelAuthTokenDesc)
71+
flagSet.StringP(params.OtelEndpoint, "", "https://otel-input.corp.redhat.com", params.OtelEndpointDesc)
72+
flagSet.StringP(params.OtelIndex, "", "", params.OtelIndexDesc)
73+
flagSet.StringToStringP(params.OtelExtraAttrs, "", nil, params.OtelExtraAttrsDesc)
74+
c.PersistentFlags().AddFlagSet(flagSet)
75+
return c
76+
}
77+
78+
func ibmGaudiDestroy() *cobra.Command {
79+
c := &cobra.Command{
80+
Use: params.DestroyCmdName,
81+
Short: params.DestroyCmdName,
82+
RunE: func(cmd *cobra.Command, args []string) error {
83+
if err := viper.BindPFlags(cmd.Flags()); err != nil {
84+
return err
85+
}
86+
return ibmgaudi.Destroy(&maptContext.ContextArgs{
87+
Context: cmd.Context(),
88+
ProjectName: viper.GetString(params.ProjectName),
89+
BackedURL: viper.GetString(params.BackedURL),
90+
Debug: viper.IsSet(params.Debug),
91+
DebugLevel: viper.GetUint(params.DebugLevel),
92+
Serverless: viper.IsSet(params.Serverless),
93+
ForceDestroy: viper.IsSet(params.ForceDestroy),
94+
KeepState: viper.IsSet(params.KeepState),
95+
})
96+
},
97+
}
98+
flagSet := pflag.NewFlagSet(params.DestroyCmdName, pflag.ExitOnError)
99+
flagSet.Bool(params.Serverless, false, params.ServerlessDesc)
100+
flagSet.Bool(params.ForceDestroy, false, params.ForceDestroyDesc)
101+
flagSet.Bool(params.KeepState, false, params.KeepStateDesc)
102+
c.PersistentFlags().AddFlagSet(flagSet)
103+
return c
104+
}

cmd/mapt/cmd/ibmcloud/ibmcloud.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ func GetCmd() *cobra.Command {
2929
params.AddCommonFlags(flagSet)
3030
c.PersistentFlags().AddFlagSet(flagSet)
3131
c.AddCommand(
32+
hosts.IBMGaudiCmd(),
3233
hosts.IBMPowerCmd(),
3334
hosts.IBMZCmd())
3435
return c

docs/ibmcloud/ibm-gaudi.md

Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
# Overview
2+
3+
This action provisions an Intel Gaudi 3 accelerated instance on IBM Cloud VPC using the RHEL AI image. The instance uses the `gx3d-160x1792x8gaudi3` profile (160 vCPU, 1792 GB RAM, 8x Gaudi 3 accelerators) and is assigned a floating IP for direct SSH access.
4+
5+
Two networking modes are supported:
6+
7+
- **Existing subnet** (`--subnet-id`): the instance is placed in a pre-existing VPC subnet. VPC, subnet, and gateway are not created. Only `IC_REGION` is required.
8+
- **Auto-provision** (no `--subnet-id`): a new VPC, subnet, and public gateway are created from scratch. Both `IC_REGION` and `IC_ZONE` are required.
9+
10+
## Environment variables
11+
12+
| Variable | Required | Description |
13+
|---|---|---|
14+
| `IBMCLOUD_ACCOUNT` | yes | IBM Cloud account ID |
15+
| `IBMCLOUD_API_KEY` | yes | IBM Cloud API key |
16+
| `IC_REGION` | yes | IBM Cloud region (e.g. `us-east`, `us-south`, `eu-de`) |
17+
| `IC_ZONE` | only without `--subnet-id` | Availability zone (e.g. `us-east-1`) |
18+
| `IBMCLOUD_COS_ACCESS_KEY_ID` | only with S3 `--backed-url` | HMAC access key for IBM Cloud Object Storage |
19+
| `IBMCLOUD_COS_SECRET_ACCESS_KEY` | only with S3 `--backed-url` | HMAC secret key for IBM Cloud Object Storage |
20+
| `IBMCLOUD_COS_ENDPOINT` | no | COS S3 endpoint (defaults to `s3.<region>.cloud-object-storage.appdomain.cloud`) |
21+
22+
## Regional availability
23+
24+
Gaudi 3 instances are available in:
25+
26+
- **us-east** (Washington DC)
27+
- **us-south** (Dallas)
28+
- **eu-de** (Frankfurt)
29+
30+
## Create
31+
32+
```bash
33+
mapt ibmcloud ibm-gaudi create -h
34+
create
35+
36+
Usage:
37+
mapt ibmcloud ibm-gaudi create [flags]
38+
39+
Flags:
40+
--conn-details-output string path to export host connection information (host, username and privateKey)
41+
-h, --help help for create
42+
--otel-app-code string OpenTelemetry appcode identifier (e.g. MAPT-001); when set together with --otel-auth-token, installs the otelcol-contrib filelog collector on the instance
43+
--otel-auth-token string OpenTelemetry authentication token (UUID) used to authenticate against the OTLP endpoint
44+
--otel-endpoint string OTLP HTTP endpoint to export logs to (default "https://otel-input.corp.redhat.com")
45+
--otel-index string Splunk index name for log routing (e.g. rh_linux)
46+
--subnet-id string ID of an existing VPC subnet to deploy the instance into (optional)
47+
--tags stringToString tags to add on each resource (--tags name1=value1,name2=value2) (default [])
48+
49+
Global Flags:
50+
--backed-url string backed for stack state. (local) file:///path/subpath (s3) s3://existing-bucket, (azure) azblob://existing-blobcontainer. See more https://www.pulumi.com/docs/iac/concepts/state-and-backends/#using-a-self-managed-backend
51+
--debug Enable debug traces and set verbosity to max. Typically to get information to troubleshooting an issue.
52+
--debug-level uint Set the level of verbosity on debug. You can set from minimum 1 to max 9. (default 3)
53+
--project-name string project name to identify the instance of the stack
54+
```
55+
56+
### Outputs
57+
58+
Files written to the path defined by `--conn-details-output`:
59+
60+
| File | Description |
61+
|---|---|
62+
| `host` | Floating IP of the instance (direct SSH) |
63+
| `username` | SSH username (`root`) |
64+
| `id_rsa` | Private key for the instance |
65+
66+
A state folder is also created at the location given by `--backed-url`. It is required (together with `--project-name`) to destroy the resources later.
67+
68+
### SSH access
69+
70+
```bash
71+
OUTPUT=/path/to/conn-details-output
72+
73+
ssh -i ${OUTPUT}/id_rsa \
74+
-o StrictHostKeyChecking=no \
75+
root@$(cat ${OUTPUT}/host)
76+
```
77+
78+
### Container
79+
80+
```bash
81+
# Using an existing VPC subnet
82+
podman run -d --name ibm-gaudi \
83+
-v ${PWD}:/workspace:z \
84+
-e IBMCLOUD_API_KEY=XXX \
85+
-e IC_REGION=us-east \
86+
quay.io/redhat-developer/mapt:latest ibmcloud ibm-gaudi create \
87+
--project-name ibm-gaudi \
88+
--backed-url file:///workspace \
89+
--conn-details-output /workspace \
90+
--subnet-id <subnet-id>
91+
92+
# Auto-provisioning VPC, subnet, and gateway
93+
podman run -d --name ibm-gaudi \
94+
-v ${PWD}:/workspace:z \
95+
-e IBMCLOUD_API_KEY=XXX \
96+
-e IC_REGION=us-east \
97+
-e IC_ZONE=us-east-1 \
98+
quay.io/redhat-developer/mapt:latest ibmcloud ibm-gaudi create \
99+
--project-name ibm-gaudi \
100+
--backed-url file:///workspace \
101+
--conn-details-output /workspace
102+
```
103+
104+
## OpenTelemetry log collection
105+
106+
When both `--otel-app-code` and `--otel-auth-token` are provided, cloud-init installs `otelcol-contrib` on the instance at first boot and configures it to ship `/var/log/messages`, `/var/log/secure`, and `/var/log/audit/audit.log` to the OTLP endpoint.
107+
108+
```bash
109+
podman run -d --name ibm-gaudi \
110+
-v ${PWD}:/workspace:z \
111+
-e IBMCLOUD_API_KEY=XXX \
112+
-e IC_REGION=us-east \
113+
quay.io/redhat-developer/mapt:latest ibmcloud ibm-gaudi create \
114+
--project-name ibm-gaudi \
115+
--backed-url file:///workspace \
116+
--conn-details-output /workspace \
117+
--subnet-id <subnet-id> \
118+
--otel-app-code MAPT-001 \
119+
--otel-auth-token <uuid-token>
120+
```
121+
122+
## Using IBM Cloud Object Storage as S3 backend
123+
124+
To store Pulumi state in IBM COS instead of a local file, create [HMAC credentials](https://cloud.ibm.com/docs/cloud-object-storage?topic=cloud-object-storage-uhc-hmac-credentials-main) for your COS instance and pass an `s3://` backed URL:
125+
126+
```bash
127+
podman run -d --name ibm-gaudi \
128+
-v ${PWD}:/workspace:z \
129+
-e IBMCLOUD_API_KEY=XXX \
130+
-e IBMCLOUD_ACCOUNT=XXX \
131+
-e IC_REGION=us-east \
132+
-e IC_ZONE=us-east-1 \
133+
-e IBMCLOUD_COS_ACCESS_KEY_ID=XXX \
134+
-e IBMCLOUD_COS_SECRET_ACCESS_KEY=XXX \
135+
quay.io/redhat-developer/mapt:latest ibmcloud ibm-gaudi create \
136+
--project-name ibm-gaudi \
137+
--backed-url s3://my-cos-bucket \
138+
--conn-details-output /workspace
139+
```
140+
141+
## Destroy
142+
143+
```bash
144+
podman run -d --name ibm-gaudi \
145+
-v ${PWD}:/workspace:z \
146+
-e IBMCLOUD_API_KEY=XXX \
147+
-e IC_REGION=us-east \
148+
quay.io/redhat-developer/mapt:latest ibmcloud ibm-gaudi destroy \
149+
--project-name ibm-gaudi \
150+
--backed-url file:///workspace
151+
```
152+
153+
By default, a successful destroy also removes the Pulumi state files from the backend. Use `--keep-state` to preserve them:
154+
155+
```bash
156+
podman run -d --name ibm-gaudi \
157+
-v ${PWD}:/workspace:z \
158+
-e IBMCLOUD_API_KEY=XXX \
159+
-e IBMCLOUD_ACCOUNT=XXX \
160+
-e IC_REGION=us-east \
161+
-e IBMCLOUD_COS_ACCESS_KEY_ID=XXX \
162+
-e IBMCLOUD_COS_SECRET_ACCESS_KEY=XXX \
163+
quay.io/redhat-developer/mapt:latest ibmcloud ibm-gaudi destroy \
164+
--project-name ibm-gaudi \
165+
--backed-url s3://my-cos-bucket \
166+
--keep-state
167+
```

0 commit comments

Comments
 (0)