Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
# Language-Specific Build Artifacts & Dependencies
# ============================================================================

### Version Managers ###
mise.toml

### Go ###
# Binaries for programs and plugins
*.exe
Expand Down Expand Up @@ -451,4 +454,4 @@ labeler/labeler
metadata-collector/metadata-collector
node-drainer/node-drainer
platform-connectors/platform-connectors
event-exporter/event-exporter
event-exporter/event-exporter
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import (
"github.com/nvidia/nvsentinel/health-monitors/csp-health-monitor/pkg/config"
"github.com/nvidia/nvsentinel/health-monitors/csp-health-monitor/pkg/csp"
awsclient "github.com/nvidia/nvsentinel/health-monitors/csp-health-monitor/pkg/csp/aws"
azureclient "github.com/nvidia/nvsentinel/health-monitors/csp-health-monitor/pkg/csp/azure"
gcpclient "github.com/nvidia/nvsentinel/health-monitors/csp-health-monitor/pkg/csp/gcp"
"github.com/nvidia/nvsentinel/health-monitors/csp-health-monitor/pkg/datastore"
eventpkg "github.com/nvidia/nvsentinel/health-monitors/csp-health-monitor/pkg/event"
Expand Down Expand Up @@ -256,7 +257,24 @@ func initActiveMonitor(
return awsMonitor
}

slog.Info("No CSP is explicitly enabled in the configuration (GCP or AWS).")
if cfg.Azure.Enabled {
slog.Info("Azure configuration is enabled.")

azureMonitor, err := azureclient.NewClient(ctx, cfg.Azure, cfg.ClusterName, kubeconfigPath)
if err != nil {
metrics.CSPMonitorErrors.WithLabelValues(string(model.CSPAzure), "init_error").Inc()
slog.Error("Failed to initialize Azure monitor. Azure will not be monitored.", "error", err)

return nil
}

slog.Info("Azure monitor initialized",
"subscriptionID", cfg.Azure.SubscriptionID)

return azureMonitor
}

slog.Info("No CSP is explicitly enabled in the configuration (GCP, AWS, or Azure).")

return nil
}
Expand Down
10 changes: 8 additions & 2 deletions health-monitors/csp-health-monitor/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ toolchain go1.25.3
require (
cloud.google.com/go/compute v1.38.0
cloud.google.com/go/logging v1.13.1
github.com/Azure/azure-sdk-for-go/sdk/azcore v1.20.0
github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.0
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/maintenance/armmaintenance v1.3.0
github.com/BurntSushi/toml v1.5.0
github.com/aws/aws-sdk-go-v2 v1.40.0
github.com/aws/aws-sdk-go-v2/config v1.32.1
Expand All @@ -31,13 +34,14 @@ require (
)

require (
cel.dev/expr v0.25.1 // indirect
cloud.google.com/go v0.121.6 // indirect
cloud.google.com/go/auth v0.17.0 // indirect
cloud.google.com/go/auth/oauth2adapt v0.2.8 // indirect
cloud.google.com/go/compute/metadata v0.9.0 // indirect
cloud.google.com/go/iam v1.5.2 // indirect
cloud.google.com/go/longrunning v0.6.7 // indirect
github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2 // indirect
github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0 // indirect
github.com/Masterminds/semver/v3 v3.4.0 // indirect
github.com/aws/aws-sdk-go-v2/credentials v1.19.1 // indirect
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.14 // indirect
Expand Down Expand Up @@ -75,6 +79,7 @@ require (
github.com/go-openapi/swag/typeutils v0.25.1 // indirect
github.com/go-openapi/swag/yamlutils v0.25.1 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/golang-jwt/jwt/v5 v5.3.0 // indirect
github.com/golang/snappy v1.0.0 // indirect
github.com/google/gnostic-models v0.7.0 // indirect
github.com/google/go-cmp v0.7.0 // indirect
Expand All @@ -85,11 +90,13 @@ require (
github.com/hashicorp/errwrap v1.1.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/klauspost/compress v1.18.0 // indirect
github.com/kylelemons/godebug v1.1.0 // indirect
github.com/lib/pq v1.10.9 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect
github.com/montanaflynn/stats v0.7.1 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/prometheus/client_model v0.6.2 // indirect
github.com/prometheus/common v0.67.2 // indirect
Expand Down Expand Up @@ -118,7 +125,6 @@ require (
golang.org/x/term v0.37.0 // indirect
golang.org/x/text v0.31.0 // indirect
golang.org/x/time v0.14.0 // indirect
gomodules.xyz/jsonpatch/v2 v2.5.0 // indirect
gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
Expand Down
29 changes: 25 additions & 4 deletions health-monitors/csp-health-monitor/go.sum
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
cel.dev/expr v0.25.1 h1:1KrZg61W6TWSxuNZ37Xy49ps13NUovb66QLprthtwi4=
cel.dev/expr v0.25.1/go.mod h1:hrXvqGP6G6gyx8UAHSHJ5RGk//1Oj5nXQ2NI02Nrsg4=
cel.dev/expr v0.24.0 h1:56OvJKSH3hDGL0ml5uSxZmz3/3Pq4tJ+fb1unVLAFcY=
cel.dev/expr v0.24.0/go.mod h1:hLPLo1W4QUmuYdA72RBX06QTs6MXw941piREPl3Yfiw=
cloud.google.com/go v0.121.6 h1:waZiuajrI28iAf40cWgycWNgaXPO06dupuS+sgibK6c=
cloud.google.com/go v0.121.6/go.mod h1:coChdst4Ea5vUpiALcYKXEpR1S9ZgXbhEzzMcMR66vI=
cloud.google.com/go/auth v0.17.0 h1:74yCm7hCj2rUyyAocqnFzsAYXgJhrG26XCFimrc/Kz4=
Expand All @@ -20,6 +20,20 @@ cloud.google.com/go/monitoring v1.24.2 h1:5OTsoJ1dXYIiMiuL+sYscLc9BumrL3CarVLL7d
cloud.google.com/go/monitoring v1.24.2/go.mod h1:x7yzPWcgDRnPEv3sI+jJGBkwl5qINf+6qY4eq0I9B4U=
cloud.google.com/go/storage v1.56.0 h1:iixmq2Fse2tqxMbWhLWC9HfBj1qdxqAmiK8/eqtsLxI=
cloud.google.com/go/storage v1.56.0/go.mod h1:Tpuj6t4NweCLzlNbw9Z9iwxEkrSem20AetIeH/shgVU=
github.com/Azure/azure-sdk-for-go/sdk/azcore v1.20.0 h1:JXg2dwJUmPB9JmtVmdEB16APJ7jurfbY5jnfXpJoRMc=
github.com/Azure/azure-sdk-for-go/sdk/azcore v1.20.0/go.mod h1:YD5h/ldMsG0XiIw7PdyNhLxaM317eFh5yNLccNfGdyw=
github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.0 h1:KpMC6LFL7mqpExyMC9jVOYRiVhLmamjeZfRsUpB7l4s=
github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.0/go.mod h1:J7MUC/wtRpfGVbQ5sIItY5/FuVWmvzlY21WAOfQnq/I=
github.com/Azure/azure-sdk-for-go/sdk/azidentity/cache v0.3.2 h1:yz1bePFlP5Vws5+8ez6T3HWXPmwOK7Yvq8QxDBD3SKY=
github.com/Azure/azure-sdk-for-go/sdk/azidentity/cache v0.3.2/go.mod h1:Pa9ZNPuoNu/GztvBSKk9J1cDJW6vk/n0zLtV4mgd8N8=
github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2 h1:9iefClla7iYpfYWdzPCRDozdmndjTm8DXdpCzPajMgA=
github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2/go.mod h1:XtLgD3ZD34DAaVIIAyG3objl5DynM3CQ/vMcbBNJZGI=
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/maintenance/armmaintenance v1.3.0 h1:rx/pIYQIlCjb+n7TzMyFUzIJYb+d0Gi7Vh+ozA0fSJA=
github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/maintenance/armmaintenance v1.3.0/go.mod h1:o8YD+BbSeK8ANH4SpxQFCiz5OIFKgHxV1uwF2FrQJYY=
github.com/AzureAD/microsoft-authentication-extensions-for-go/cache v0.1.1 h1:WJTmL004Abzc5wDB5VtZG2PJk5ndYDgVacGqfirKxjM=
github.com/AzureAD/microsoft-authentication-extensions-for-go/cache v0.1.1/go.mod h1:tCcJZ0uHAmvjsVYzEFivsRTN00oz5BEsRgQHu5JZ9WE=
github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0 h1:XRzhVemXdgvJqCH0sFfrBUTnUJSBrBf7++ypk+twtRs=
github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0/go.mod h1:HKpQxkWaGLJ+D/5H8QRpyQXA1eKjxkFlOMwck5+33Jk=
github.com/BurntSushi/toml v1.5.0 h1:W5quZX/G/csjUnuI8SUYlsHs9M38FC7znL0lIO+DvMg=
github.com/BurntSushi/toml v1.5.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho=
github.com/DATA-DOG/go-sqlmock v1.5.2 h1:OcvFkGmslmlZibjAjaHm3L//6LiuBgolP7OputlJIzU=
Expand Down Expand Up @@ -132,6 +146,8 @@ github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1v
github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8=
github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q=
github.com/golang-jwt/jwt/v5 v5.3.0 h1:pv4AsKCKKZuqlgs5sUmn4x8UlGa0kEVt/puTpKx9vvo=
github.com/golang-jwt/jwt/v5 v5.3.0/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE=
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE=
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek=
Expand Down Expand Up @@ -160,6 +176,8 @@ github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+l
github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM=
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
github.com/keybase/go-keychain v0.0.1 h1:way+bWYa6lDppZoZcgMbYsvC7GxljxrskdNInRtuthU=
github.com/keybase/go-keychain v0.0.1/go.mod h1:PdEILRW3i9D8JcdM+FmY6RwkHGnhHxXwkPPMeUgOK1k=
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo=
Expand All @@ -186,6 +204,8 @@ github.com/onsi/ginkgo/v2 v2.26.0 h1:1J4Wut1IlYZNEAWIV3ALrT9NfiaGW2cDCJQSFQMs/gE
github.com/onsi/ginkgo/v2 v2.26.0/go.mod h1:qhEywmzWTBUY88kfO0BRvX4py7scov9yR+Az2oavUzw=
github.com/onsi/gomega v1.38.2 h1:eZCjf2xjZAqe+LeWvKb5weQ+NcPwX84kqJ0cZNxok2A=
github.com/onsi/gomega v1.38.2/go.mod h1:W2MJcYxRGV63b418Ai34Ud0hEdTVXq9NW9+Sx6uXf3k=
github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c h1:+mdjkGKdHQG3305AYmdv1U2eRNDiU2ErMBj1gwrq8eQ=
github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c/go.mod h1:7rwL4CYBLnjLxUqIJNnCWiEdr3bn6IUYi15bNlnbCCU=
github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 h1:GFCKgmp0tecUJ0sJuv4pzYCqS9+RGSn52M3FUwPs+uo=
github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10/go.mod h1:t/avpk3KcrXxUnYOhZhMXJlSEyie6gQbtLq5NM3loB8=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
Expand Down Expand Up @@ -294,6 +314,7 @@ golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7w
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc=
golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
Expand All @@ -319,8 +340,8 @@ golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8T
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
gomodules.xyz/jsonpatch/v2 v2.5.0 h1:JELs8RLM12qJGXU4u/TO3V25KW8GreMKl9pdkk14RM0=
gomodules.xyz/jsonpatch/v2 v2.5.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY=
gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw=
gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY=
gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk=
gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E=
google.golang.org/api v0.256.0 h1:u6Khm8+F9sxbCTYNoBHg6/Hwv0N/i+V94MvkOSor6oI=
Expand Down
47 changes: 37 additions & 10 deletions health-monitors/csp-health-monitor/pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,14 @@ const (
)

type Config struct {
MaintenanceEventPollIntervalSeconds int `toml:"maintenanceEventPollIntervalSeconds"`
TriggerQuarantineWorkflowTimeLimitMinutes int `toml:"triggerQuarantineWorkflowTimeLimitMinutes"`
PostMaintenanceHealthyDelayMinutes int `toml:"postMaintenanceHealthyDelayMinutes"`
NodeReadinessTimeoutMinutes int `toml:"nodeReadinessTimeoutMinutes"`
ClusterName string `toml:"clusterName"`
GCP GCPConfig `toml:"gcp"`
AWS AWSConfig `toml:"aws"`
MaintenanceEventPollIntervalSeconds int `toml:"maintenanceEventPollIntervalSeconds"`
TriggerQuarantineWorkflowTimeLimitMinutes int `toml:"triggerQuarantineWorkflowTimeLimitMinutes"`
PostMaintenanceHealthyDelayMinutes int `toml:"postMaintenanceHealthyDelayMinutes"`
NodeReadinessTimeoutMinutes int `toml:"nodeReadinessTimeoutMinutes"`
ClusterName string `toml:"clusterName"`
GCP GCPConfig `toml:"gcp"`
AWS AWSConfig `toml:"aws"`
Azure AzureConfig `toml:"azure"`
}

// GCPConfig holds GCP specific configuration.
Expand All @@ -62,6 +63,13 @@ type AWSConfig struct {
Region string `toml:"region"`
}

// AzureConfig holds Azure specific configuration.
type AzureConfig struct {
Enabled bool `toml:"enabled"`
SubscriptionID string `toml:"subscriptionId"`
PollingIntervalSeconds int `toml:"pollingIntervalSeconds"`
}

// LoadConfig reads the configuration from a TOML file.
func LoadConfig(filePath string) (*Config, error) {
var cfg Config
Expand Down Expand Up @@ -174,7 +182,7 @@ func validateGeneralConfig(cfg *Config) error {
return nil
}

// validateCSPConfig checks GCP/AWS polling intervals and ensures only one CSP is enabled.
// validateCSPConfig checks GCP/AWS/Azure polling intervals and ensures only one CSP is enabled.
func validateCSPConfig(cfg *Config) error {
// Validate GCP polling interval
if cfg.GCP.Enabled && cfg.GCP.APIPollingIntervalSeconds < minCSPSpecificPollingIntervalSeconds {
Expand All @@ -194,9 +202,28 @@ func validateCSPConfig(cfg *Config) error {
)
}

// Validate Azure polling interval
if cfg.Azure.Enabled && cfg.Azure.PollingIntervalSeconds < minCSPSpecificPollingIntervalSeconds {
return fmt.Errorf(
"azure.pollingIntervalSeconds must be at least %d seconds (got %d)",
minCSPSpecificPollingIntervalSeconds,
cfg.Azure.PollingIntervalSeconds,
)
}

// Ensure only one CSP is enabled
if cfg.GCP.Enabled && cfg.AWS.Enabled {
return fmt.Errorf("multiple CSPs enabled: only one of GCP or AWS can be enabled at a time in the configuration")
count := 0

for _, csp := range []bool{cfg.GCP.Enabled, cfg.AWS.Enabled, cfg.Azure.Enabled} {
if csp {
count++
}
}

if count > 1 {
return fmt.Errorf(
"multiple CSPs enabled: only one of GCP, AWS, or Azure can be enabled at a time in the configuration",
)
}

return nil
Expand Down
Loading
Loading