Skip to content
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ dist/
# IDE & Editor Configurations
# ============================================================================


### JetBrains IDEs (GoLand, PyCharm, IntelliJ) ###
### JetBrains IDEs (GoLand, PyCharm, IntelliJ) ###
# User-specific stuff
Expand Down Expand Up @@ -457,3 +458,8 @@ tests/scale-tests/FQM_LATENCY_TEST_PLAN.md
tests/scale-tests/CONCURRENT_DRAIN_TEST_PLAN.md
tests/scale-tests/results/*.csv
tests/scale-tests/cmd/fqm-scale-test/results/

# data-models: local Go tools installed via the Makefile
data-models/.tools/
data-models/bin/

46 changes: 46 additions & 0 deletions data-models/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,56 @@ DOCKER_EXTRA_ARGS :=
include ../make/common.mk
include ../make/go.mk

# Version of controller-gen that the `tools` target installs and that the
# `generate` target runs (deepcopy code and CRD manifests).
CONTROLLER_GEN_VERSION := v0.17.2
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

v0.20.0 is the latest; if we're going to pin a version, we should pick up the newest one.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, janitor uses controller-gen as well, though it is using latest rather than a pinned version. Check out how the main Makefile in the root dir is handling versions:

ADDLICENSE_VERSION := $(shell $(YQ) '.linting.addlicense' .versions.yaml)

We should probably put controller-gen there with that same pattern since we're using it in multiple modules now


# Local path to the controller-gen binary. The `tools` target makes this a
# symlink to $(PWD)/.tools/bin/controller-gen.
# NOTE(review): $(PWD) is inherited from the invoking environment; GNU make's
# $(CURDIR) is the variable that follows `make -C <dir>` - confirm how this
# Makefile is invoked before relying on $(PWD).
CONTROLLER_GEN_BIN := $(PWD)/bin/controller-gen

# API and CRD paths
# API_DIR is the relative path of the Go package handed to controller-gen as
# paths=./$(API_DIR). CRD_OUTPUT_DIR is the Helm chart directory that the
# `generate` target moves the generated CRD YAML into.
# REPO_ROOT is not defined in this section - presumably it comes from the
# included ../make/common.mk; verify.
API_DIR := api/v1alpha1
CRD_OUTPUT_DIR := $(REPO_ROOT)/distros/kubernetes/nvsentinel/charts/data-models/crds


# =============================================================================
# MODULE-SPECIFIC TARGETS
# =============================================================================

# tools: Install controller-gen locally if not present or on version mismatch.
#
# controller-gen is built with a private GOPATH ($(PWD)/.tools) and exposed via
# a symlink at $(CONTROLLER_GEN_BIN). The install is skipped only when the
# existing binary reports exactly $(CONTROLLER_GEN_VERSION). The version is read
# from the second whitespace-separated field of `controller-gen --version`
# (assumes output shaped like "Version: vX.Y.Z" - confirm if the tool changes).
#
# `go install` and `ln` are chained with `&&` so that a failed install fails the
# target; chaining them with `;` would let a successful `ln -sf` mask the
# failure and leave a dangling symlink behind.
.PHONY: tools
tools:
	@echo "Ensuring controller-gen $(CONTROLLER_GEN_VERSION) is installed..."
	@mkdir -p $(PWD)/bin
	@if [ -f "$(CONTROLLER_GEN_BIN)" ]; then \
		VERSION="$$( $(CONTROLLER_GEN_BIN) --version 2>/dev/null | awk '{print $$2}' )"; \
		if [ "$$VERSION" = "$(CONTROLLER_GEN_VERSION)" ]; then \
			echo "controller-gen $(CONTROLLER_GEN_VERSION) already installed."; \
			exit 0; \
		fi; \
		echo "controller-gen version mismatch (found $$VERSION). Installing correct version..."; \
	else \
		echo "controller-gen not found. Installing..."; \
	fi; \
	GO111MODULE=on GOPATH=$(PWD)/.tools go install sigs.k8s.io/controller-tools/cmd/controller-gen@$(CONTROLLER_GEN_VERSION) && \
	ln -sf $(PWD)/.tools/bin/controller-gen $(CONTROLLER_GEN_BIN)

# generate: Generate deepcopy, CRD types, and other Kubernetes boilerplate
# Depends on tools being installed
#
# Steps, in order:
#   1. Run controller-gen's `object` generator on ./$(API_DIR) (deepcopy code).
#   2. Run controller-gen's `crd` generator on ./$(API_DIR), writing the
#      manifests to ./$(API_DIR)/crds.
#   3. Create $(CRD_OUTPUT_DIR) if needed and move the generated YAML into it.
#   4. List the YAML files now present in $(CRD_OUTPUT_DIR).
#
# NOTE(review): the `mv ... || true` ignores a failed move, so the
# "CRDs generated and moved" message is printed even when nothing was moved;
# the final `ls` fallback message is the only sign that no YAML exists.
.PHONY: generate
generate: tools
@echo "Generating deepcopy files for API types..."
@$(CONTROLLER_GEN_BIN) object paths=./$(API_DIR)
@echo "Generating CRD YAML manifests..."
@$(CONTROLLER_GEN_BIN) crd paths=./$(API_DIR) output:crd:artifacts:config=./$(API_DIR)/crds
@echo "Moving CRDs to Helm chart directory..."
@mkdir -p $(CRD_OUTPUT_DIR)
@mv ./$(API_DIR)/crds/*.yaml $(CRD_OUTPUT_DIR)/ || true
@echo "CRDs generated and moved to $(CRD_OUTPUT_DIR)"
@ls -1 $(CRD_OUTPUT_DIR)/*.yaml || echo "No CRD YAMLs generated"


# Generate Go protobuf files for data-models (shared across all Go modules)
.PHONY: protos-generate
protos-generate: protos-clean
Expand Down
34 changes: 34 additions & 0 deletions data-models/api/v1alpha1/groupversion_info.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package v1alpha1 contains API Schema definitions for the data-models v1alpha1 API group
// +kubebuilder:object:generate=true
// +groupName=data-models.dgxc.nvidia.com
package v1alpha1

import (
"k8s.io/apimachinery/pkg/runtime/schema"
"sigs.k8s.io/controller-runtime/pkg/scheme"
)

var (
// GroupVersion is the group version used to register these objects.
// NOTE(review): the group string repeats the +groupName marker on this
// file's package clause; keep the two in sync if either changes.
GroupVersion = schema.GroupVersion{Group: "data-models.dgxc.nvidia.com", Version: "v1alpha1"}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was thinking maybe we do healthevents.dgxc.nvidia.com but open to ideas here too. Just because health-events is a little more descriptive than data-models


// SchemeBuilder is used to add Go types to the GroupVersionKind scheme.
// It is constructed for GroupVersion.
SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion}

// AddToScheme adds the types in this group-version to the given scheme.
// It is SchemeBuilder's AddToScheme method value.
AddToScheme = SchemeBuilder.AddToScheme
)
135 changes: 135 additions & 0 deletions data-models/api/v1alpha1/healthstatus_types.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package v1alpha1

import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// Status is a string-valued enumeration. The kubebuilder marker below limits
// it to the values declared as constants in this file.
//
// +kubebuilder:validation:Enum=NotStarted;InProgress;Failed;Succeeded;AlreadyDrained;UnQuarantined;Quarantined;AlreadyQuarantined;Cancelled
type Status string

// Allowed Status values. Each constant's string form is exactly the token
// listed in the Enum marker on Status.
const (
	// Progress-style values.
	StatusNotStarted Status = "NotStarted"
	StatusInProgress Status = "InProgress"
	StatusFailed     Status = "Failed"
	StatusSucceeded  Status = "Succeeded"
	AlreadyDrained   Status = "AlreadyDrained"

	// Quarantine-related values, plus Cancelled.
	UnQuarantined      Status = "UnQuarantined"
	Quarantined        Status = "Quarantined"
	AlreadyQuarantined Status = "AlreadyQuarantined"
	Cancelled          Status = "Cancelled"
)

// HealthEventSpec defines the desired state of HealthStatus.
// It is the type of the HealthStatus.Spec field. Neither JSON tag uses
// omitempty, so both fields are always written when the spec is serialized.
type HealthEventSpec struct {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't want to manage two copies of the same struct just to make it a kubernetes type.

Can we point directly to the objects that already exist? So it would look something like this

// +kubebuilder:object:root=true
// +kubebuilder:subresource:status
type HealthEvent struct {
  Spec HealthEvent // this is the existing health event object
  Status HealthEventStatus // this is the existing status object
}

// EventID is the unique identifier for the health event.
EventID string `json:"eventID"`

// NodeName is the node associated with this health event.
NodeName string `json:"nodeName"`
}

// OperationStatus captures the status of a remediation operation: a Status
// value plus an optional explanatory message.
type OperationStatus struct {
	// Status is the current state of the operation. It is always serialized.
	Status Status `json:"status"`

	// Message is an optional human-readable description. It is left out of
	// the JSON output when empty.
	Message string `json:"message,omitempty"`
}

// HealthEventSnapshot is a read-only snapshot of a reported health event.
// The data is observational and originates outside Kubernetes.
//
// Every field is tagged omitempty, so zero values are left out of the JSON
// output; for the bool fields this means false is not serialized.
type HealthEventSnapshot struct {
	// Version is the version of the reported health event schema.
	Version uint32 `json:"version,omitempty"`

	// Agent is the reporting agent that generated the event.
	Agent string `json:"agent,omitempty"`

	// ComponentClass is the component class that raised the event.
	ComponentClass string `json:"componentClass,omitempty"`

	// CheckName is the specific check or rule that triggered the event.
	CheckName string `json:"checkName,omitempty"`

	// IsFatal reports whether the event is fatal.
	IsFatal bool `json:"isFatal,omitempty"`

	// IsHealthy reports whether the system was reported healthy.
	IsHealthy bool `json:"isHealthy,omitempty"`

	// Message is the human-readable event message.
	Message string `json:"message,omitempty"`

	// RecommendedAction is the action recommended by the reporting system.
	RecommendedAction string `json:"recommendedAction,omitempty"`

	// ErrorCode lists the error codes associated with this health event.
	ErrorCode []string `json:"errorCode,omitempty"`

	// Metadata holds additional key-value metadata provided by the agent.
	Metadata map[string]string `json:"metadata,omitempty"`

	// GeneratedTimestamp is the time at which the source generated the event.
	GeneratedTimestamp *metav1.Time `json:"generatedTimestamp,omitempty"`
}

// RemediationStatus captures the observed state of remediation workflows.
// Every field is an optional pointer and is left out of the JSON output when
// nil.
type RemediationStatus struct {
	// NodeQuarantined is the quarantine Status recorded for the node.
	NodeQuarantined *Status `json:"nodeQuarantined,omitempty"`

	// UserPodsEvictionStatus is the status of the user pods eviction process.
	UserPodsEvictionStatus *OperationStatus `json:"userPodsEvictionStatus,omitempty"`

	// FaultRemediated reports whether the fault has been remediated.
	FaultRemediated *bool `json:"faultRemediated,omitempty"`

	// LastRemediationTimestamp is the time of the last remediation attempt.
	LastRemediationTimestamp *metav1.Time `json:"lastRemediationTimestamp,omitempty"`
}

// HealthEventStatus defines the observed state of HealthStatus. Both parts
// are optional pointers and are left out of the JSON output when nil.
type HealthEventStatus struct {
	// Event is the snapshot of the reported health event.
	Event *HealthEventSnapshot `json:"event,omitempty"`

	// Remediation is the observed remediation state.
	Remediation *RemediationStatus `json:"remediation,omitempty"`
}

// HealthStatus is the root API object of this package: it embeds the standard
// type and object metadata and carries a HealthEventSpec and a
// HealthEventStatus. It is registered with SchemeBuilder in init, together
// with HealthStatusList.
//
// +kubebuilder:object:root=true
// +kubebuilder:subresource:status
type HealthStatus struct {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so my thought was this struct would look like this:

// +kubebuilder:object:root=true
// +kubebuilder:subresource:status
type HealthStatus struct {
  metav1.TypeMeta   `json:",inline"`
  metav1.ObjectMeta `json:"metadata,omitempty"`

  Spec   model.HealthEvent   `json:"spec,omitempty"`
  Status model.HealthEventStatus `json:"status,omitempty"`
}

So you would not define your own spec and status objects we would just use the existing ones.

There's some implications for api-versioning going forward if we want to adhere to best practices that we should discuss with the NVIDIA folks if we do it this way but it seems like a clean way to share the object that the other datasources use.

metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata,omitempty"`

// Spec is the desired state (event ID and node name).
Spec HealthEventSpec `json:"spec,omitempty"`
// Status is the observed state (event snapshot and remediation state).
Status HealthEventStatus `json:"status,omitempty"`
}

// HealthStatusList is the list form of HealthStatus: standard type and list
// metadata plus the contained items.
//
// +kubebuilder:object:root=true
type HealthStatusList struct {
	metav1.TypeMeta `json:",inline"`
	metav1.ListMeta `json:"metadata,omitempty"`

	// Items holds the HealthStatus objects in this list. It is always
	// serialized.
	Items []HealthStatus `json:"items"`
}

// init registers HealthStatus and HealthStatusList with the package's
// SchemeBuilder.
func init() {
	SchemeBuilder.Register(
		&HealthStatus{},
		&HealthStatusList{},
	)
}
Loading