Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ metadata:
capabilities: Seamless Upgrades
categories: AI/Machine Learning,Monitoring
containerImage: docker.io/rocm/amd-gpu-operator:dev
createdAt: "2026-04-02T12:26:30Z"
createdAt: "2026-04-07T12:28:11Z"
description: |-
Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter
For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/)
Expand Down Expand Up @@ -1242,6 +1242,18 @@ spec:
- get
- update
- watch
- apiGroups:
- apps
resources:
- deployments
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups:
- kmm.sigs.x-k8s.io
resources:
Expand Down
12 changes: 12 additions & 0 deletions config/rbac/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,18 @@ rules:
- get
- update
- watch
- apiGroups:
- apps
resources:
- deployments
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups:
- kmm.sigs.x-k8s.io
resources:
Expand Down
9 changes: 8 additions & 1 deletion docs/autoremediation/auto-remediation.md
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,14 @@ Each entry in the ConfigMap maps a unique error code (AFID) to its remediation w
The following example demonstrates a complete error mapping configuration:

```yaml
- nodeCondition: AMDGPUXgmi
apiVersion: v1
kind: ConfigMap
metadata:
name: auto-remediation-custom-config
namespace: kube-amd-gpu
data:
workflow: |
- nodeCondition: AMDGPUXgmi
workflowTemplate: default-template
validationTestsProfile:
framework: AGFHC
Expand Down
1 change: 0 additions & 1 deletion hack/k8s-patch/template-patch/config-manager-rbac.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ rules:
- "apps"
resources:
- daemonsets
- deployments
verbs:
- get
- list
Expand Down
2 changes: 1 addition & 1 deletion helm-charts-k8s/Chart.lock
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,4 @@ dependencies:
repository: file://./charts/remediation-crds
version: v1.0.0
digest: sha256:0806f6b6d7aa21be77bf1c91e720ae3238338a16f107df450a53b02ef940db1b
generated: "2026-04-02T12:26:25.920315689Z"
generated: "2026-04-07T12:28:07.188885215Z"
1 change: 0 additions & 1 deletion helm-charts-k8s/templates/config-manager-rbac.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ rules:
- "apps"
resources:
- daemonsets
- deployments
verbs:
- get
- list
Expand Down
12 changes: 12 additions & 0 deletions helm-charts-k8s/templates/manager-rbac.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,18 @@ rules:
- get
- update
- watch
- apiGroups:
- apps
resources:
- deployments
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups:
- kmm.sigs.x-k8s.io
resources:
Expand Down
1 change: 1 addition & 0 deletions internal/controllers/device_config_reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@ func (r *DeviceConfigReconciler) init(ctx context.Context) {
//+kubebuilder:rbac:groups=apps,resources=daemonsets,verbs=create;delete;get;list;patch;watch
//+kubebuilder:rbac:groups=apps,resources=daemonsets/status,verbs=create;delete;get;list;patch;watch
//+kubebuilder:rbac:groups=apps,resources=daemonsets/finalizers,verbs=create;get;update;watch
//+kubebuilder:rbac:groups=apps,resources=deployments,verbs=create;delete;get;list;patch;watch;update
//+kubebuilder:rbac:groups=core,resources=services,verbs=create;delete;get;list;patch;watch
//+kubebuilder:rbac:groups=core,resources=services/finalizers,verbs=create;get;update;watch
//+kubebuilder:rbac:groups=core,resources=pods,verbs=delete;get;list;watch;create
Expand Down
9 changes: 9 additions & 0 deletions internal/controllers/remediation_handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -618,6 +618,15 @@ func (h *remediationMgrHelper) createDefaultWorkflowTemplate(ctx context.Context
ObjectMeta: metav1.ObjectMeta{
Name: "event-notify-template",
Namespace: devConfig.Namespace,
OwnerReferences: []metav1.OwnerReference{
{
APIVersion: devConfig.APIVersion,
Kind: devConfig.Kind,
Name: devConfig.Name,
UID: devConfig.UID,
Controller: ptr.To(true),
},
},
},
Spec: workflowv1alpha1.WorkflowSpec{
Entrypoint: "notify",
Expand Down
Loading