-
Notifications
You must be signed in to change notification settings - Fork 20
/
Copy pathcomponents.go
160 lines (131 loc) · 4.94 KB
/
components.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
// Package components defines the common interfaces for the components.
package components
import (
"context"
"database/sql"
"fmt"
"sync"
"time"
"github.com/leptonai/gpud/components/common"
components_metrics_state "github.com/leptonai/gpud/components/metrics/state"
"github.com/leptonai/gpud/errdefs"
"github.com/prometheus/client_golang/prometheus"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
// WatchableComponent wraps the component with a watchable interface.
// Useful to intercept the component states method calls to track metrics.
//
// It embeds Component and declares no additional methods, so its method set
// is identical to Component's.
type WatchableComponent interface {
Component
}
// Component represents an individual component of the system.
//
// Each component check is independent of the others.
// But the underlying implementations may share the same data sources
// in order to minimize the querying overhead (e.g., nvidia-smi calls).
//
// Each component implements its own output format inside the State struct.
// It is recommended to use a consistent name for its HTTP handler,
// and to define const keys for the State extra information field.
type Component interface {
// Name returns the component name,
// which is also used for the HTTP handler registration path.
// Must be globally unique.
Name() string
// Start is called upon server start.
// Implements component-specific poller start logic.
Start() error
// States returns the current states of the component.
States(ctx context.Context) ([]State, error)
// Events returns all the events from "since".
Events(ctx context.Context, since time.Time) ([]Event, error)
// Metrics returns all the metrics from the component from "since".
Metrics(ctx context.Context, since time.Time) ([]Metric, error)
// Close is called upon server close.
// Implements component-specific poller cleanup logic.
Close() error
}
// SettableComponent declares setters that push states and events into a
// component.
//
// NOTE(review): presumably an optional interface implemented only by
// components that accept externally supplied states/events — confirm
// against the implementations.
type SettableComponent interface {
// SetStates sets the given states on the component.
SetStates(ctx context.Context, states ...State) error
// SetEvents sets the given events on the component.
SetEvents(ctx context.Context, events ...Event) error
}
// OutputProvider is an optional component interface that returns the
// underlying output data.
type OutputProvider interface {
// Output returns the component's underlying output data; the concrete
// type behind "any" is implementation-defined.
Output() (any, error)
}
// PromRegisterer is an optional component interface that supports
// Prometheus metrics.
type PromRegisterer interface {
// RegisterCollectors registers the component's collectors with reg.
//
// NOTE(review): dbRW and dbRO are presumably read-write and read-only
// handles to the same metrics database, with tableName naming the table
// used for this component — confirm against the implementations.
RegisterCollectors(reg *prometheus.Registry, dbRW *sql.DB, dbRO *sql.DB, tableName string) error
}
// State describes the health of a component at a point in time, as returned
// by Component.States.
//
// Every field is tagged "omitempty", so zero values are dropped from the JSON
// encoding. In particular, Healthy == false is omitted rather than emitted as
// "healthy": false.
type State struct {
Name string `json:"name,omitempty"`
Healthy bool `json:"healthy,omitempty"`
Health string `json:"health,omitempty"` // Healthy, Degraded, Unhealthy
Reason string `json:"reason,omitempty"` // a detailed and processed reason on why the component is not healthy
Error string `json:"error,omitempty"` // the unprocessed error returned from the component
ExtraInfo map[string]string `json:"extra_info,omitempty"` // any extra information the component may want to expose
SuggestedActions *common.SuggestedActions `json:"suggested_actions,omitempty"`
}
// Health values matching the ones listed for the State.Health field
// (Healthy, Degraded, Unhealthy).
const (
StateHealthy = "Healthy"
StateUnhealthy = "Unhealthy"
StateDegraded = "Degraded"
)
// Event describes a single occurrence reported by a component, as returned
// by Component.Events.
//
// Time is always encoded; the remaining fields are tagged "omitempty" and
// are dropped from the JSON encoding when they hold their zero value.
type Event struct {
Time metav1.Time `json:"time"`
Name string `json:"name,omitempty"`
Type common.EventType `json:"type,omitempty"`
Message string `json:"message,omitempty"` // detailed message of the event
ExtraInfo map[string]string `json:"extra_info,omitempty"` // any extra information the component may want to expose
SuggestedActions *common.SuggestedActions `json:"suggested_actions,omitempty"`
}
// Metric is a single metric entry returned by Component.Metrics.
//
// It embeds components_metrics_state.Metric (promoting its fields) and adds
// an optional free-form ExtraInfo map.
type Metric struct {
components_metrics_state.Metric
ExtraInfo map[string]string `json:"extra_info,omitempty"` // any extra information the component may want to expose
}
// Info bundles the states, events, and metrics of a component.
//
// None of the fields use "omitempty", so all three keys are always present
// in the JSON encoding (a nil slice encodes as null).
type Info struct {
States []State `json:"states"`
Events []Event `json:"events"`
Metrics []Metric `json:"metrics"`
}
// Package-level component registry.
var (
// defaultSetMu guards every access to defaultSet.
defaultSetMu sync.RWMutex
// defaultSet maps a component name to its registered Component.
defaultSet = make(map[string]Component)
)
// IsComponentRegistered reports whether a component has been registered
// under name in the default component set.
func IsComponentRegistered(name string) bool {
	defaultSetMu.RLock()
	// A map lookup cannot panic (even on a nil map), so the lock is released
	// explicitly instead of via defer.
	_, registered := defaultSet[name]
	defaultSetMu.RUnlock()

	return registered
}
// RegisterComponent adds comp to the default component set under name.
//
// It returns an error wrapping errdefs.ErrUnavailable when the set has not
// been initialized, and an error wrapping errdefs.ErrAlreadyExists when a
// component is already registered under name. On success it returns nil.
func RegisterComponent(name string, comp Component) error {
	defaultSetMu.Lock()
	defer defaultSetMu.Unlock()

	// Reading from a nil map is safe, so the lookup can precede the nil check;
	// the nil case is still reported first.
	_, taken := defaultSet[name]
	switch {
	case defaultSet == nil:
		return fmt.Errorf("component set not initialized: %w", errdefs.ErrUnavailable)
	case taken:
		return fmt.Errorf("component %s already registered: %w", name, errdefs.ErrAlreadyExists)
	}

	defaultSet[name] = comp
	return nil
}
// GetComponent looks up the component registered under name in the default
// component set, holding the read lock for the duration of the lookup.
//
// Errors are those produced by getComponent.
func GetComponent(name string) (Component, error) {
	defaultSetMu.RLock()
	defer defaultSetMu.RUnlock()

	comp, err := getComponent(defaultSet, name)
	return comp, err
}
// getComponent returns the component stored under name in set.
//
// It returns an error wrapping errdefs.ErrUnavailable when set is nil, and an
// error wrapping errdefs.ErrNotFound when set has no entry for name.
// It performs no locking itself.
func getComponent(set map[string]Component, name string) (Component, error) {
	if set == nil {
		return nil, fmt.Errorf("component set not initialized: %w", errdefs.ErrUnavailable)
	}

	if comp, found := set[name]; found {
		return comp, nil
	}
	return nil, fmt.Errorf("component %s not found: %w", name, errdefs.ErrNotFound)
}
// GetAllComponents returns a snapshot of every component registered in the
// default component set, keyed by component name.
//
// The returned map is a copy made while holding the read lock, so callers may
// range over it (or modify it) without racing against concurrent
// RegisterComponent calls. Changes to the returned map do not affect the
// registry. The Component values themselves are shared, not copied.
func GetAllComponents() map[string]Component {
	defaultSetMu.RLock()
	defer defaultSetMu.RUnlock()

	// Handing out defaultSet itself would let callers touch the map after the
	// lock is released, which is a data race with writers.
	snapshot := make(map[string]Component, len(defaultSet))
	for name, comp := range defaultSet {
		snapshot[name] = comp
	}
	return snapshot
}