Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
124 changes: 94 additions & 30 deletions data-models/pkg/protos/health_event.pb.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 9 additions & 0 deletions data-models/protobufs/health_event.proto
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,14 @@ message HealthEvents {
repeated HealthEvent events = 2;
}

// ProcessingStrategy defines how downstream modules should handle the event.
// It is carried on each HealthEvent in the processingStrategy field.
enum ProcessingStrategy {
// EXECUTE_REMEDIATION: normal behavior; downstream modules may update cluster state.
// NOTE(review): numbered 0, so presumably this is what an unset field reads as
// (proto3 default) — confirm against the file's syntax declaration.
EXECUTE_REMEDIATION = 0;
// STORE_ONLY: observability-only behavior; event should be persisted/exported
// but should not modify cluster resources.
STORE_ONLY = 1;
}

enum RecommendedAction {
NONE = 0;
COMPONENT_RESET = 2;
Expand Down Expand Up @@ -66,6 +74,7 @@ message HealthEvent {
string nodeName = 13;
BehaviourOverrides quarantineOverrides = 14;
BehaviourOverrides drainOverrides = 15;
ProcessingStrategy processingStrategy = 16;
}

message BehaviourOverrides {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ spec:
- "--database-client-cert-mount-path={{ .Values.clientCertMountPath }}"
- "--uds-path=/run/nvsentinel/nvsentinel.sock"
- "--metrics-port=2113"
- "--processing-strategy={{ .Values.processingStrategy }}"
resources:
{{- toYaml .Values.quarantineTriggerEngine.resources | default .Values.resources | nindent 12 }}
ports:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,13 @@ tolerations: []

podAnnotations: {}

# Processing strategy stamped on the health events this monitor emits.
# Valid values (matched exactly, case-sensitive): EXECUTE_REMEDIATION, STORE_ONLY.
# Any other value is rejected at startup with an "invalid processingStrategy" error.
# Default: EXECUTE_REMEDIATION
# EXECUTE_REMEDIATION: normal behavior; downstream modules may update cluster state.
# STORE_ONLY: observability-only behavior; events are persisted/exported but must not modify cluster resources (i.e., no node conditions, no quarantine, no drain, no remediation).
processingStrategy: EXECUTE_REMEDIATION

# Log verbosity level for the main CSP health monitor container (e.g. "debug", "info", "warn", "error")
logLevel: info

Expand Down
1 change: 1 addition & 0 deletions event-exporter/pkg/transformer/cloudevents.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ func ToCloudEvent(event *pb.HealthEvent, metadata map[string]string) (*CloudEven
"entitiesImpacted": entities,
"generatedTimestamp": timestamp,
"nodeName": event.NodeName,
"processingStrategy": event.ProcessingStrategy.String(),
}

if len(event.Metadata) > 0 {
Expand Down
4 changes: 4 additions & 0 deletions event-exporter/pkg/transformer/cloudevents_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ func TestToCloudEvent(t *testing.T) {
Force: false,
Skip: true,
},
ProcessingStrategy: pb.ProcessingStrategy_STORE_ONLY,
},
metadata: map[string]string{
"cluster": "prod-cluster-1",
Expand Down Expand Up @@ -102,6 +103,9 @@ func TestToCloudEvent(t *testing.T) {
if healthEvent["recommendedAction"] != "RESTART_VM" {
t.Errorf("recommendedAction = %v, want %v", healthEvent["recommendedAction"], "RESTART_VM")
}
if healthEvent["processingStrategy"] != "STORE_ONLY" {
t.Errorf("processingStrategy = %v, want STORE_ONLY", healthEvent["processingStrategy"])
}

entities := healthEvent["entitiesImpacted"].([]map[string]any)
if len(entities) != 2 {
Expand Down
1 change: 1 addition & 0 deletions fault-quarantine/pkg/evaluator/rule_evaluator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,7 @@ func TestRoundTrip(t *testing.T) {
"nanos": float64(eventTime.GetNanos()),
},
"nodeName": "test-node",
"processingStrategy": float64(0),
"quarantineOverrides": nil,
"drainOverrides": nil,
}
Expand Down
2 changes: 1 addition & 1 deletion fault-quarantine/pkg/initializer/init.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ func InitializeAll(ctx context.Context, params InitializationParams) (*Component
}

builder := client.GetPipelineBuilder()
pipeline := builder.BuildAllHealthEventInsertsPipeline()
pipeline := builder.BuildProcessableHealthEventInsertsPipeline()

var tomlCfg config.TomlConfig
if err := configmanager.LoadTOMLConfig(params.TomlConfigPath, &tomlCfg); err != nil {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ type appConfig struct {
udsPath string
databaseClientCertMountPath string
metricsPort string
processingStrategy string
}

func parseFlags() *appConfig {
Expand All @@ -71,6 +72,8 @@ func parseFlags() *appConfig {
"Directory where database client tls.crt, tls.key, and ca.crt are mounted.",
)
flag.StringVar(&cfg.metricsPort, "metrics-port", defaultMetricsPortSidecar, "Port for the sidecar Prometheus metrics.")
flag.StringVar(&cfg.processingStrategy, "processing-strategy", "EXECUTE_REMEDIATION",
"Event processing strategy: EXECUTE_REMEDIATION or STORE_ONLY")

// Parse flags after initialising klog
flag.Parse()
Expand Down Expand Up @@ -213,7 +216,15 @@ func run() error {
return fmt.Errorf("kubernetes client setup failed: %w", err)
}

engine := trigger.NewEngine(cfg, store, platformConnectorClient, k8sClient)
value, ok := pb.ProcessingStrategy_value[appCfg.processingStrategy]
if !ok {
return fmt.Errorf("invalid processingStrategy %q (expected EXECUTE_REMEDIATION or STORE_ONLY)",
appCfg.processingStrategy)
}

slog.Info("Event handling strategy configured", "processingStrategy", appCfg.processingStrategy)

engine := trigger.NewEngine(cfg, store, platformConnectorClient, k8sClient, pb.ProcessingStrategy(value))

slog.Info("Trigger engine starting...")
engine.Start(gCtx)
Expand Down
44 changes: 24 additions & 20 deletions health-monitors/csp-health-monitor/pkg/triggerengine/trigger.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,13 +56,14 @@ const (
// Engine polls the datastore for maintenance events and forwards the
// corresponding health signals to NVSentinel through the UDS connector.
type Engine struct {
store datastore.Store
udsClient pb.PlatformConnectorClient
config *config.Config
pollInterval time.Duration
k8sClient kubernetes.Interface
monitoredNodes sync.Map // Track which nodes are currently being monitored
monitorInterval time.Duration
store datastore.Store
udsClient pb.PlatformConnectorClient
config *config.Config
pollInterval time.Duration
k8sClient kubernetes.Interface
monitoredNodes sync.Map // Track which nodes are currently being monitored
monitorInterval time.Duration
processingStrategy pb.ProcessingStrategy
}

// NewEngine constructs a ready-to-run Engine instance.
Expand All @@ -71,14 +72,16 @@ func NewEngine(
store datastore.Store,
udsClient pb.PlatformConnectorClient,
k8sClient kubernetes.Interface,
processingStrategy pb.ProcessingStrategy,
) *Engine {
return &Engine{
config: cfg,
store: store,
udsClient: udsClient,
pollInterval: time.Duration(cfg.MaintenanceEventPollIntervalSeconds) * time.Second,
k8sClient: k8sClient,
monitorInterval: defaultMonitorInterval,
config: cfg,
store: store,
udsClient: udsClient,
pollInterval: time.Duration(cfg.MaintenanceEventPollIntervalSeconds) * time.Second,
k8sClient: k8sClient,
monitorInterval: defaultMonitorInterval,
processingStrategy: processingStrategy,
}
}

Expand Down Expand Up @@ -343,13 +346,14 @@ func (e *Engine) mapMaintenanceEventToHealthEvent(
}

healthEvent := &pb.HealthEvent{
Agent: "csp-health-monitor", // Consistent agent name
ComponentClass: event.ResourceType, // e.g., "EC2", "gce_instance"
CheckName: "CSPMaintenance", // Consistent check name
IsFatal: isFatal,
IsHealthy: isHealthy,
Message: message,
RecommendedAction: pb.RecommendedAction(actionEnum),
Agent: "csp-health-monitor", // Consistent agent name
ComponentClass: event.ResourceType, // e.g., "EC2", "gce_instance"
CheckName: "CSPMaintenance", // Consistent check name
IsFatal: isFatal,
IsHealthy: isHealthy,
ProcessingStrategy: e.processingStrategy,
Message: message,
RecommendedAction: pb.RecommendedAction(actionEnum),
EntitiesImpacted: []*pb.Entity{
{
EntityType: event.ResourceType,
Expand Down
Loading
Loading