-
Notifications
You must be signed in to change notification settings - Fork 33
k8s API for healtheventwithstatus model #640
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,34 @@ | ||
| // Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. | ||
| // | ||
| // Licensed under the Apache License, Version 2.0 (the "License"); | ||
| // you may not use this file except in compliance with the License. | ||
| // You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, software | ||
| // distributed under the License is distributed on an "AS IS" BASIS, | ||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| // See the License for the specific language governing permissions and | ||
| // limitations under the License. | ||
|
|
||
| // Package v1alpha1 contains API Schema definitions for the data-models v1alpha1 API group | ||
| // +kubebuilder:object:generate=true | ||
| // +groupName=data-models.dgxc.nvidia.com | ||
| package v1alpha1 | ||
|
|
||
| import ( | ||
| "k8s.io/apimachinery/pkg/runtime/schema" | ||
| "sigs.k8s.io/controller-runtime/pkg/scheme" | ||
| ) | ||
|
|
||
| var ( | ||
| // GroupVersion is the API group ("data-models.dgxc.nvidia.com") and version | ||
| // ("v1alpha1") used to register these objects. The group string matches the | ||
| // +groupName marker on the package clause. | ||
| GroupVersion = schema.GroupVersion{Group: "data-models.dgxc.nvidia.com", Version: "v1alpha1"} | ||
|
||
|
|
||
| // SchemeBuilder is used to add Go types to the GroupVersionKind scheme. | ||
| // It is constructed with GroupVersion, so types registered through it are | ||
| // associated with that group and version. | ||
| SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion} | ||
|
|
||
| // AddToScheme adds the types in this group-version to the given scheme. | ||
| // It is the AddToScheme method value bound to SchemeBuilder. | ||
| AddToScheme = SchemeBuilder.AddToScheme | ||
| ) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,135 @@ | ||
| // Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. | ||
| // | ||
| // Licensed under the Apache License, Version 2.0 (the "License"); | ||
| // you may not use this file except in compliance with the License. | ||
| // You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, software | ||
| // distributed under the License is distributed on an "AS IS" BASIS, | ||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| // See the License for the specific language governing permissions and | ||
| // limitations under the License. | ||
|
|
||
| package v1alpha1 | ||
|
|
||
| import ( | ||
| metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||
| ) | ||
|
|
||
| // Status is the string-valued state used by the remediation status fields in | ||
| // this package (OperationStatus.Status and RemediationStatus.NodeQuarantined). | ||
| // The kubebuilder Enum marker below lists the accepted values; keep it in | ||
| // sync with the Status constants declared in this file. | ||
| // +kubebuilder:validation:Enum=NotStarted;InProgress;Failed;Succeeded;AlreadyDrained;UnQuarantined;Quarantined;AlreadyQuarantined;Cancelled | ||
| type Status string | ||
|
|
||
| // Values of Status. Each string literal below also appears in the | ||
| // +kubebuilder:validation:Enum marker on the Status type. | ||
| // NOTE(review): the first four constants carry a "Status" prefix while the | ||
| // remaining ones do not; renaming them would change the exported API. | ||
| const ( | ||
| StatusNotStarted Status = "NotStarted" | ||
| StatusInProgress Status = "InProgress" | ||
| StatusFailed Status = "Failed" | ||
| StatusSucceeded Status = "Succeeded" | ||
| AlreadyDrained Status = "AlreadyDrained" | ||
|
|
||
| // Quarantine-related values (presumably used for | ||
| // RemediationStatus.NodeQuarantined; confirm against callers). | ||
| UnQuarantined Status = "UnQuarantined" | ||
| Quarantined Status = "Quarantined" | ||
| AlreadyQuarantined Status = "AlreadyQuarantined" | ||
| Cancelled Status = "Cancelled" | ||
| ) | ||
|
|
||
| // HealthEventSpec defines the desired state of HealthStatus. | ||
| // It identifies a health event and the node it refers to. Both fields are | ||
| // serialized without omitempty, so they are always present in the JSON form. | ||
| type HealthEventSpec struct { | ||
|
||
| // EventID is the unique identifier for the health event. | ||
| EventID string `json:"eventID"` | ||
|
|
||
| // NodeName is the node associated with this health event. | ||
| NodeName string `json:"nodeName"` | ||
| } | ||
|
|
||
| // OperationStatus captures the status of a remediation operation as a | ||
| // Status value plus an optional free-form message. | ||
| type OperationStatus struct { | ||
| // Status is the current operation status; it is always serialized. | ||
| Status Status `json:"status"` | ||
|
|
||
| // Message is an optional human-readable message; omitted from JSON when empty. | ||
| Message string `json:"message,omitempty"` | ||
| } | ||
|
|
||
| // HealthEventSnapshot represents a read-only snapshot of a reported health event. | ||
| // This data is observational and originates outside Kubernetes. | ||
| // Every field is tagged omitempty, so zero values are left out of the JSON form. | ||
| type HealthEventSnapshot struct { | ||
| // Version is the version of the reported health event schema. | ||
| Version uint32 `json:"version,omitempty"` | ||
|
|
||
| // Agent is the reporting agent that generated the event. | ||
| Agent string `json:"agent,omitempty"` | ||
|
|
||
| // ComponentClass is the component class that raised the event. | ||
| ComponentClass string `json:"componentClass,omitempty"` | ||
|
|
||
| // CheckName is the specific check or rule that triggered the event. | ||
| CheckName string `json:"checkName,omitempty"` | ||
|
|
||
| // IsFatal indicates whether the event is fatal. | ||
| // NOTE(review): with omitempty, false is omitted from the encoded JSON and | ||
| // is therefore indistinguishable from "not reported". | ||
| IsFatal bool `json:"isFatal,omitempty"` | ||
|
|
||
| // IsHealthy indicates whether the system was reported healthy. | ||
| // NOTE(review): with omitempty, false (unhealthy) is omitted from the | ||
| // encoded JSON; confirm consumers do not need to see an explicit false. | ||
| IsHealthy bool `json:"isHealthy,omitempty"` | ||
|
|
||
| // Message is the human-readable event message. | ||
| Message string `json:"message,omitempty"` | ||
|
|
||
| // RecommendedAction is the recommended action provided by the reporting system. | ||
| RecommendedAction string `json:"recommendedAction,omitempty"` | ||
|
|
||
| // ErrorCode holds the error codes associated with this health event | ||
| // (a list, despite the singular field name). | ||
| ErrorCode []string `json:"errorCode,omitempty"` | ||
|
|
||
| // Metadata is additional key-value metadata provided by the agent. | ||
| Metadata map[string]string `json:"metadata,omitempty"` | ||
|
|
||
| // GeneratedTimestamp is the time at which the event was generated by the | ||
| // source; nil when not set. | ||
| GeneratedTimestamp *metav1.Time `json:"generatedTimestamp,omitempty"` | ||
| } | ||
|
|
||
| // RemediationStatus captures the observed state of remediation workflows. | ||
| // All fields are pointers tagged omitempty, so an unset (nil) field is left | ||
| // out of the JSON form and can be told apart from a zero value. | ||
| type RemediationStatus struct { | ||
| // NodeQuarantined is the quarantine state of the node, expressed as a | ||
| // Status value rather than a boolean. | ||
| NodeQuarantined *Status `json:"nodeQuarantined,omitempty"` | ||
|
|
||
| // UserPodsEvictionStatus is the status of the user pods eviction process. | ||
| UserPodsEvictionStatus *OperationStatus `json:"userPodsEvictionStatus,omitempty"` | ||
|
|
||
| // FaultRemediated reports whether the fault has been remediated. | ||
| FaultRemediated *bool `json:"faultRemediated,omitempty"` | ||
|
|
||
| // LastRemediationTimestamp is the timestamp of the last remediation attempt. | ||
| LastRemediationTimestamp *metav1.Time `json:"lastRemediationTimestamp,omitempty"` | ||
| } | ||
|
|
||
| // HealthEventStatus defines the observed state of HealthStatus. | ||
| // It groups the reported event snapshot with the observed remediation state; | ||
| // either part may be nil and is then omitted from the JSON form. | ||
| type HealthEventStatus struct { | ||
| // Event is the snapshot of the reported health event. | ||
| Event *HealthEventSnapshot `json:"event,omitempty"` | ||
|
|
||
| // Remediation is the observed remediation state. | ||
| Remediation *RemediationStatus `json:"remediation,omitempty"` | ||
| } | ||
|
|
||
| // HealthStatus is the root API object of this package. It pairs a | ||
| // HealthEventSpec (Spec) with a HealthEventStatus (Status); the markers below | ||
| // declare it as a root object with a status subresource. | ||
| // +kubebuilder:object:root=true | ||
| // +kubebuilder:subresource:status | ||
| type HealthStatus struct { | ||
|
||
| metav1.TypeMeta `json:",inline"` | ||
| metav1.ObjectMeta `json:"metadata,omitempty"` | ||
|
|
||
| // Spec identifies the health event and node this object describes. | ||
| Spec HealthEventSpec `json:"spec,omitempty"` | ||
| // Status holds the observed event snapshot and remediation state. | ||
| Status HealthEventStatus `json:"status,omitempty"` | ||
| } | ||
|
|
||
| // HealthStatusList contains a list of HealthStatus objects. | ||
| // +kubebuilder:object:root=true | ||
| type HealthStatusList struct { | ||
| metav1.TypeMeta `json:",inline"` | ||
| metav1.ListMeta `json:"metadata,omitempty"` | ||
| // Items is the list of HealthStatus objects; it is always serialized. | ||
| Items []HealthStatus `json:"items"` | ||
| } | ||
|
|
||
| // init registers HealthStatus and HealthStatusList with SchemeBuilder when | ||
| // the package is loaded, so they are part of what AddToScheme adds to a scheme. | ||
| func init() { | ||
| SchemeBuilder.Register(&HealthStatus{}, &HealthStatusList{}) | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
v0.20.0 is the latest; if we're going to pin, we should pick up the newest one.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also, janitor has a controller-gen as well. It's using latest rather than a pinned version, though. Check out how the main Makefile in the root dir is handling versions:
NVSentinel/Makefile
Line 24 in 82e7180
We should probably put controller-gen there, following that same pattern, since we're using it in multiple modules now.