-
Notifications
You must be signed in to change notification settings - Fork 35
AIPCC:15489: Add support for Gaudi accelerator in mapt's IBM Cloud module #834
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
deekay2310
wants to merge
3
commits into
redhat-developer:main
Choose a base branch
from
deekay2310:ibmcloud-gaudi3
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
3 commits
Select commit
Hold shift + click to select a range
289ad16
AIPCC:15489: Add support for Gaudi accelerator in mapt's IBM Cloud mo…
deekay2310 31ce415
fix(ibmcloud): add SSH readiness check to ibm-gaudi auto-provision path
deekay2310 c4577bb
fix(ibmcloud): use cloud-user as default SSH user for RHEL AI image
deekay2310 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,104 @@ | ||
| package hosts | ||
|
|
||
| import ( | ||
| "github.com/redhat-developer/mapt/cmd/mapt/cmd/params" | ||
| maptContext "github.com/redhat-developer/mapt/pkg/manager/context" | ||
| ibmgaudi "github.com/redhat-developer/mapt/pkg/provider/ibmcloud/action/ibm-gaudi" | ||
| "github.com/spf13/cobra" | ||
| "github.com/spf13/pflag" | ||
| "github.com/spf13/viper" | ||
| ) | ||
|
|
||
| const ( | ||
| cmdIBMGaudi = "ibm-gaudi" | ||
| cmdIBMGaudiDesc = "manage ibm gaudi3 accelerated instances (amd64)" | ||
| ) | ||
|
|
||
| func IBMGaudiCmd() *cobra.Command { | ||
| c := &cobra.Command{ | ||
| Use: cmdIBMGaudi, | ||
| Short: cmdIBMGaudiDesc, | ||
| RunE: func(cmd *cobra.Command, args []string) error { | ||
| if err := viper.BindPFlags(cmd.Flags()); err != nil { | ||
| return err | ||
| } | ||
| return nil | ||
| }, | ||
| } | ||
|
|
||
| flagSet := pflag.NewFlagSet(cmdIBMGaudi, pflag.ExitOnError) | ||
| params.AddCommonFlags(flagSet) | ||
| c.PersistentFlags().AddFlagSet(flagSet) | ||
|
|
||
| c.AddCommand(ibmGaudiCreate(), ibmGaudiDestroy()) | ||
| return c | ||
| } | ||
|
|
||
| func ibmGaudiCreate() *cobra.Command { | ||
| c := &cobra.Command{ | ||
| Use: params.CreateCmdName, | ||
| Short: params.CreateCmdName, | ||
| RunE: func(cmd *cobra.Command, args []string) error { | ||
| if err := viper.BindPFlags(cmd.Flags()); err != nil { | ||
| return err | ||
| } | ||
| return ibmgaudi.New( | ||
| &maptContext.ContextArgs{ | ||
| Context: cmd.Context(), | ||
| ProjectName: viper.GetString(params.ProjectName), | ||
| BackedURL: viper.GetString(params.BackedURL), | ||
| ResultsOutput: viper.GetString(params.ConnectionDetailsOutput), | ||
| Debug: viper.IsSet(params.Debug), | ||
| DebugLevel: viper.GetUint(params.DebugLevel), | ||
| Tags: viper.GetStringMapString(params.Tags), | ||
| }, | ||
| &ibmgaudi.GaudiArgs{ | ||
| SubnetID: viper.GetString(params.SubnetID), | ||
| OtelAppCode: viper.GetString(params.OtelAppCode), | ||
| OtelAuthToken: viper.GetString(params.OtelAuthToken), | ||
| OtelEndpoint: viper.GetString(params.OtelEndpoint), | ||
| OtelIndex: viper.GetString(params.OtelIndex), | ||
| OtelExtraAttrs: viper.GetStringMapString(params.OtelExtraAttrs), | ||
| }) | ||
| }, | ||
| } | ||
| flagSet := pflag.NewFlagSet(params.CreateCmdName, pflag.ExitOnError) | ||
| flagSet.StringP(params.ConnectionDetailsOutput, "", "", params.ConnectionDetailsOutputDesc) | ||
| flagSet.StringToStringP(params.Tags, "", nil, params.TagsDesc) | ||
| flagSet.StringP(params.SubnetID, "", "", params.SubnetIDDesc) | ||
| flagSet.StringP(params.OtelAppCode, "", "", params.OtelAppCodeDesc) | ||
| flagSet.StringP(params.OtelAuthToken, "", "", params.OtelAuthTokenDesc) | ||
| flagSet.StringP(params.OtelEndpoint, "", "https://otel-input.corp.redhat.com", params.OtelEndpointDesc) | ||
| flagSet.StringP(params.OtelIndex, "", "", params.OtelIndexDesc) | ||
| flagSet.StringToStringP(params.OtelExtraAttrs, "", nil, params.OtelExtraAttrsDesc) | ||
| c.PersistentFlags().AddFlagSet(flagSet) | ||
| return c | ||
| } | ||
|
|
||
| func ibmGaudiDestroy() *cobra.Command { | ||
| c := &cobra.Command{ | ||
| Use: params.DestroyCmdName, | ||
| Short: params.DestroyCmdName, | ||
| RunE: func(cmd *cobra.Command, args []string) error { | ||
| if err := viper.BindPFlags(cmd.Flags()); err != nil { | ||
| return err | ||
| } | ||
| return ibmgaudi.Destroy(&maptContext.ContextArgs{ | ||
| Context: cmd.Context(), | ||
| ProjectName: viper.GetString(params.ProjectName), | ||
| BackedURL: viper.GetString(params.BackedURL), | ||
| Debug: viper.IsSet(params.Debug), | ||
| DebugLevel: viper.GetUint(params.DebugLevel), | ||
| Serverless: viper.IsSet(params.Serverless), | ||
| ForceDestroy: viper.IsSet(params.ForceDestroy), | ||
| KeepState: viper.IsSet(params.KeepState), | ||
| }) | ||
| }, | ||
| } | ||
| flagSet := pflag.NewFlagSet(params.DestroyCmdName, pflag.ExitOnError) | ||
| flagSet.Bool(params.Serverless, false, params.ServerlessDesc) | ||
| flagSet.Bool(params.ForceDestroy, false, params.ForceDestroyDesc) | ||
| flagSet.Bool(params.KeepState, false, params.KeepStateDesc) | ||
| c.PersistentFlags().AddFlagSet(flagSet) | ||
| return c | ||
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,167 @@ | ||
| # Overview | ||
|
|
||
| This action provisions an Intel Gaudi 3 accelerated instance on IBM Cloud VPC using the RHEL AI image. The instance uses the `gx3d-160x1792x8gaudi3` profile (160 vCPU, 1792 GB RAM, 8x Gaudi 3 accelerators) and is assigned a floating IP for direct SSH access. | ||
|
|
||
| Two networking modes are supported: | ||
|
|
||
| - **Existing subnet** (`--subnet-id`): the instance is placed in a pre-existing VPC subnet. VPC, subnet, and gateway are not created. `IC_REGION` is required, but `IC_ZONE` is not. | ||
| - **Auto-provision** (no `--subnet-id`): a new VPC, subnet, and public gateway are created from scratch. Both `IC_REGION` and `IC_ZONE` are required. | ||
|
|
||
| ## Environment variables | ||
|
|
||
| | Variable | Required | Description | | ||
| |---|---|---| | ||
| | `IBMCLOUD_ACCOUNT` | yes | IBM Cloud account ID | | ||
| | `IBMCLOUD_API_KEY` | yes | IBM Cloud API key | | ||
| | `IC_REGION` | yes | IBM Cloud region (e.g. `us-east`, `us-south`, `eu-de`) | | ||
| | `IC_ZONE` | only without `--subnet-id` | Availability zone (e.g. `us-east-1`) | | ||
| | `IBMCLOUD_COS_ACCESS_KEY_ID` | only with S3 `--backed-url` | HMAC access key for IBM Cloud Object Storage | | ||
| | `IBMCLOUD_COS_SECRET_ACCESS_KEY` | only with S3 `--backed-url` | HMAC secret key for IBM Cloud Object Storage | | ||
| | `IBMCLOUD_COS_ENDPOINT` | no | COS S3 endpoint (defaults to `s3.<region>.cloud-object-storage.appdomain.cloud`) | | ||
|
|
||
| ## Regional availability | ||
|
|
||
| Gaudi 3 instances are available in: | ||
|
|
||
| - **us-east** (Washington DC) | ||
| - **us-south** (Dallas) | ||
| - **eu-de** (Frankfurt) | ||
|
|
||
| ## Create | ||
|
|
||
| ```bash | ||
| mapt ibmcloud ibm-gaudi create -h | ||
| create | ||
|
|
||
| Usage: | ||
| mapt ibmcloud ibm-gaudi create [flags] | ||
|
|
||
| Flags: | ||
| --conn-details-output string path to export host connection information (host, username and privateKey) | ||
| -h, --help help for create | ||
| --otel-app-code string OpenTelemetry appcode identifier (e.g. MAPT-001); when set together with --otel-auth-token, installs the otelcol-contrib filelog collector on the instance | ||
| --otel-auth-token string OpenTelemetry authentication token (UUID) used to authenticate against the OTLP endpoint | ||
| --otel-endpoint string OTLP HTTP endpoint to export logs to (default "https://otel-input.corp.redhat.com") | ||
| --otel-index string Splunk index name for log routing (e.g. rh_linux) | ||
| --subnet-id string ID of an existing VPC subnet to deploy the instance into (optional) | ||
| --tags stringToString tags to add on each resource (--tags name1=value1,name2=value2) (default []) | ||
|
|
||
| Global Flags: | ||
| --backed-url string backed for stack state. (local) file:///path/subpath (s3) s3://existing-bucket, (azure) azblob://existing-blobcontainer. See more https://www.pulumi.com/docs/iac/concepts/state-and-backends/#using-a-self-managed-backend | ||
| --debug Enable debug traces and set verbosity to max. Typically to get information to troubleshooting an issue. | ||
| --debug-level uint Set the level of verbosity on debug. You can set from minimum 1 to max 9. (default 3) | ||
| --project-name string project name to identify the instance of the stack | ||
| ``` | ||
|
|
||
| ### Outputs | ||
|
|
||
| Files written to the path defined by `--conn-details-output`: | ||
|
|
||
| | File | Description | | ||
| |---|---| | ||
| | `host` | Floating IP of the instance (direct SSH) | | ||
| `username` | SSH username (`cloud-user`) | ||
| | `id_rsa` | Private key for the instance | | ||
|
|
||
| A state folder is also created at `--backed-url`. It is required (together with `--project-name`) to destroy the resources later. | ||
|
|
||
| ### SSH access | ||
|
|
||
| ```bash | ||
| OUTPUT=/path/to/conn-details-output | ||
|
|
||
| ssh -i ${OUTPUT}/id_rsa \ | ||
| -o StrictHostKeyChecking=no \ | ||
| cloud-user@$(cat ${OUTPUT}/host) | ||
| ``` | ||
|
|
||
| ### Container | ||
|
|
||
| ```bash | ||
| # Using an existing VPC subnet | ||
| podman run -d --name ibm-gaudi \ | ||
| -v ${PWD}:/workspace:z \ | ||
| -e IBMCLOUD_API_KEY=XXX \ | ||
| -e IC_REGION=us-east \ | ||
| quay.io/redhat-developer/mapt:latest ibmcloud ibm-gaudi create \ | ||
| --project-name ibm-gaudi \ | ||
| --backed-url file:///workspace \ | ||
| --conn-details-output /workspace \ | ||
| --subnet-id <subnet-id> | ||
|
|
||
| # Auto-provisioning VPC, subnet, and gateway | ||
| podman run -d --name ibm-gaudi \ | ||
| -v ${PWD}:/workspace:z \ | ||
| -e IBMCLOUD_API_KEY=XXX \ | ||
| -e IC_REGION=us-east \ | ||
| -e IC_ZONE=us-east-1 \ | ||
| quay.io/redhat-developer/mapt:latest ibmcloud ibm-gaudi create \ | ||
| --project-name ibm-gaudi \ | ||
| --backed-url file:///workspace \ | ||
| --conn-details-output /workspace | ||
| ``` | ||
|
|
||
| ## OpenTelemetry log collection | ||
|
|
||
| When both `--otel-app-code` and `--otel-auth-token` are provided, cloud-init installs `otelcol-contrib` on the instance at first boot and configures it to ship `/var/log/messages`, `/var/log/secure`, and `/var/log/audit/audit.log` to the OTLP endpoint. | ||
|
|
||
| ```bash | ||
| podman run -d --name ibm-gaudi \ | ||
| -v ${PWD}:/workspace:z \ | ||
| -e IBMCLOUD_API_KEY=XXX \ | ||
| -e IC_REGION=us-east \ | ||
| quay.io/redhat-developer/mapt:latest ibmcloud ibm-gaudi create \ | ||
| --project-name ibm-gaudi \ | ||
| --backed-url file:///workspace \ | ||
| --conn-details-output /workspace \ | ||
| --subnet-id <subnet-id> \ | ||
| --otel-app-code MAPT-001 \ | ||
| --otel-auth-token <uuid-token> | ||
| ``` | ||
|
|
||
| ## Using IBM Cloud Object Storage as S3 backend | ||
|
|
||
| To store Pulumi state in IBM COS instead of a local file, create [HMAC credentials](https://cloud.ibm.com/docs/cloud-object-storage?topic=cloud-object-storage-uhc-hmac-credentials-main) for your COS instance and pass an `s3://` backed URL: | ||
|
|
||
| ```bash | ||
| podman run -d --name ibm-gaudi \ | ||
| -v ${PWD}:/workspace:z \ | ||
| -e IBMCLOUD_API_KEY=XXX \ | ||
| -e IBMCLOUD_ACCOUNT=XXX \ | ||
| -e IC_REGION=us-east \ | ||
| -e IC_ZONE=us-east-1 \ | ||
| -e IBMCLOUD_COS_ACCESS_KEY_ID=XXX \ | ||
| -e IBMCLOUD_COS_SECRET_ACCESS_KEY=XXX \ | ||
| quay.io/redhat-developer/mapt:latest ibmcloud ibm-gaudi create \ | ||
| --project-name ibm-gaudi \ | ||
| --backed-url s3://my-cos-bucket \ | ||
| --conn-details-output /workspace | ||
| ``` | ||
|
|
||
| ## Destroy | ||
|
|
||
| ```bash | ||
| podman run -d --name ibm-gaudi \ | ||
| -v ${PWD}:/workspace:z \ | ||
| -e IBMCLOUD_API_KEY=XXX \ | ||
| -e IC_REGION=us-east \ | ||
| quay.io/redhat-developer/mapt:latest ibmcloud ibm-gaudi destroy \ | ||
| --project-name ibm-gaudi \ | ||
| --backed-url file:///workspace | ||
| ``` | ||
|
|
||
| By default, destroy removes the Pulumi state files from the backend after a successful destroy. Use `--keep-state` to preserve them: | ||
|
|
||
| ```bash | ||
| podman run -d --name ibm-gaudi \ | ||
| -v ${PWD}:/workspace:z \ | ||
| -e IBMCLOUD_API_KEY=XXX \ | ||
| -e IBMCLOUD_ACCOUNT=XXX \ | ||
| -e IC_REGION=us-east \ | ||
| -e IBMCLOUD_COS_ACCESS_KEY_ID=XXX \ | ||
| -e IBMCLOUD_COS_SECRET_ACCESS_KEY=XXX \ | ||
| quay.io/redhat-developer/mapt:latest ibmcloud ibm-gaudi destroy \ | ||
| --project-name ibm-gaudi \ | ||
| --backed-url s3://my-cos-bucket \ | ||
| --keep-state | ||
| ``` | ||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🎯 Functional Correctness | 🟠 Major | ⚡ Quick win
SSH username doc likely stale — should be `cloud-user`, not `root`.
Per the PR's commit history, the default SSH user for the RHEL AI image was changed to `cloud-user`. This doc still documents `root` as the username (Line 63) and uses `root@...` in the SSH access example (Lines 73-75), which will mislead users and cause failed SSH connections.
📝 Proposed fix
Also applies to: 68-76
🤖 Prompt for AI Agents