diff --git a/api/v1alpha1/bmc_types.go b/api/v1alpha1/bmc_types.go index 5c2d04d60..e033132fc 100644 --- a/api/v1alpha1/bmc_types.go +++ b/api/v1alpha1/bmc_types.go @@ -206,6 +206,14 @@ type BMCStatus struct { // +optional LastResetTime *metav1.Time `json:"lastResetTime,omitempty"` + // MetricsReportSubscriptionLink is the link to the metrics report subscription of the bmc. + // +optional + MetricsReportSubscriptionLink string `json:"metricsReportSubscriptionLink,omitempty"` + + // EventsSubscriptionLink is the link to the events subscription of the bmc. + // +optional + EventsSubscriptionLink string `json:"eventsSubscriptionLink,omitempty"` + // Conditions represents the latest available observations of the BMC's current state. // +patchStrategy=merge // +patchMergeKey=type diff --git a/bmc/bmc.go b/bmc/bmc.go index d9f6634b7..e1e90ad85 100644 --- a/bmc/bmc.go +++ b/bmc/bmc.go @@ -109,6 +109,12 @@ type BMC interface { // GetBMCUpgradeTask retrieves the task for the BMC upgrade. GetBMCUpgradeTask(ctx context.Context, manufacturer string, taskURI string) (*schemas.Task, error) + // CreateEventSubscription creates an event subscription for the manager. + CreateEventSubscription(ctx context.Context, destination string, eventType schemas.EventFormatType, protocol schemas.DeliveryRetryPolicy) (string, error) + + // DeleteEventSubscription deletes an event subscription for the manager. + DeleteEventSubscription(ctx context.Context, uri string) error + // CreateOrUpdateAccount creates or updates a BMC user account. CreateOrUpdateAccount(ctx context.Context, userName, role, password string, enabled bool) error diff --git a/bmc/mock/server/data/Managers/BMC/index.json b/bmc/mock/server/data/Managers/BMC/index.json index be410bbe8..c99534fac 100644 --- a/bmc/mock/server/data/Managers/BMC/index.json +++ b/bmc/mock/server/data/Managers/BMC/index.json @@ -4,6 +4,7 @@ "Name": "Manager", "ManagerType": "BMC", "Description": "Contoso BMC", + "Manufacturer": "Contoso", "ServiceEntryPointUUID": "92384634-2938-2342-8820-489239905423", "UUID": "58893887-8974-2487-2389-841168418919", "Model": "Joo Janta 200", @@ -96,4 +97,4 @@ }, "@odata.id": "/redfish/v1/Managers/BMC", "@Redfish.Copyright": "Copyright 2014-2023 DMTF. For the full DMTF copyright policy, see http://www.dmtf.org/about/policies/copyright." -} \ No newline at end of file +} diff --git a/bmc/mock/server/server.go b/bmc/mock/server/server.go index 9e95f4081..128bef06a 100644 --- a/bmc/mock/server/server.go +++ b/bmc/mock/server/server.go @@ -26,6 +26,14 @@ var ( dataFS embed.FS ) +type Collection struct { + Members []Member `json:"Members"` +} + +type Member struct { + OdataID string `json:"@odata.id"` +} + const ( PowerOffState = "Off" PowerOnState = "On" @@ -116,6 +124,7 @@ func (s *MockServer) handleGet(w http.ResponseWriter, r *http.Request) { if _, err := w.Write(content); err != nil { s.log.Error(err, "Failed to write response") } + } func (s *MockServer) handlePost(w http.ResponseWriter, r *http.Request) { @@ -135,6 +144,66 @@ func (s *MockServer) handlePost(w http.ResponseWriter, r *http.Request) { case strings.Contains(urlPath, "UpdateService/Actions/UpdateService.SimpleUpdate"): s.writeJSON(w, http.StatusAccepted, map[string]string{"status": "Accepted"}) default: + // + urlPath := resolvePath(r.URL.Path) + var update map[string]any + if err := json.Unmarshal(body, &update); err != nil { + http.Error(w, "Invalid JSON", http.StatusBadRequest) + return + } + // Handle resource creation in collections + s.mu.Lock() + defer s.mu.Unlock() + cached, hasOverride := s.overrides[urlPath] + var base Collection + if hasOverride { + s.log.Info("Using overridden data for POST", "path", urlPath) + var ok bool + base, ok = cached.(Collection) + if !ok { + http.Error(w, "Corrupt overridden JSON", http.StatusInternalServerError) + return + } + } else { + s.log.Info("Using embedded data for POST", "path", urlPath) + data, err := dataFS.ReadFile(urlPath) + if err != nil { + s.log.Error(err, "Failed to read embedded data for POST", "path", urlPath) + http.NotFound(w, r) + return + } + if err := json.Unmarshal(data, &base); err != nil { + http.Error(w, "Corrupt embedded JSON", http.StatusInternalServerError) + return + } + } + // If resource collection (has "Members"), add a new member + if len(base.Members) > 0 { + newID := fmt.Sprintf("%d", len(base.Members)+1) + location := path.Join(r.URL.Path, newID) + newMemberPath := resolvePath(location) + base.Members = append(base.Members, Member{ + OdataID: location, + }) + s.log.Info("Adding new member", "id", newID, "location", location, "memberPath", newMemberPath) + if strings.HasSuffix(r.URL.Path, "/Subscriptions") { + w.Header().Set("Location", location) + } + s.overrides[urlPath] = base + s.overrides[newMemberPath] = update + } else { + base.Members = make([]Member, 0) + location := r.URL.JoinPath("1").String() + base.Members = []Member{ + { + OdataID: r.URL.JoinPath("1").String(), + }, + } + s.overrides[urlPath] = base + if strings.HasSuffix(r.URL.Path, "/Subscriptions") { + w.Header().Set("Location", location) + } + } s.writeJSON(w, http.StatusCreated, map[string]string{"status": "created"}) } } @@ -178,22 +247,53 @@ func (s *MockServer) handlePatch(w http.ResponseWriter, r *http.Request) { func (s *MockServer) handleDelete(w http.ResponseWriter, r *http.Request) { filePath := resolvePath(r.URL.Path) - base, err := s.loadResource(filePath) if err != nil { s.handleError(w, r, err) return } - if _, isCollection := base["Members"]; isCollection { http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed) return } - s.mu.Lock() delete(s.overrides, filePath) s.mu.Unlock() + // get collection of the resource + collectionPath := path.Dir(filePath) + cached, hasOverride := s.overrides[collectionPath] + var collection Collection + if hasOverride { + var ok bool + collection, ok = cached.(Collection) + if !ok { + http.Error(w, "Corrupt embedded JSON", http.StatusInternalServerError) + return + } + } else { + data, err := dataFS.ReadFile(collectionPath + "/index.json") + if err != nil { + http.NotFound(w, r) + return + } + if err := json.Unmarshal(data, &collection); err != nil { + http.Error(w, "Corrupt embedded JSON", http.StatusInternalServerError) + return + } + } + // remove member from collection + newMembers := make([]Member, 0) + for _, member := range collection.Members { + if member.OdataID != r.URL.Path { + newMembers = append(newMembers, member) + } + } + s.log.Info("Removing member from collection", "members", newMembers, "collection", collectionPath) + collection.Members = newMembers + s.mu.Lock() + s.overrides[collectionPath] = collection + s.mu.Unlock() w.WriteHeader(http.StatusNoContent) } diff --git a/bmc/redfish.go b/bmc/redfish.go index 08d3d195e..bea4a5cb5 100644 --- a/bmc/redfish.go +++ b/bmc/redfish.go @@ -12,6 +12,7 @@ import ( "io" "maps" "math/big" + "net/url" "slices" "strings" "time" @@ -975,3 +976,89 @@ func shuffleRunes(a []rune) error { } return nil } + +type subscriptionPayload struct { + Destination string `json:"Destination,omitempty"` + EventTypes []schemas.EventType `json:"EventTypes,omitempty"` + EventFormatType schemas.EventFormatType `json:"EventFormatType,omitempty"` + RegistryPrefixes []string `json:"RegistryPrefixes,omitempty"` + ResourceTypes []string `json:"ResourceTypes,omitempty"` + DeliveryRetryPolicy schemas.DeliveryRetryPolicy `json:"DeliveryRetryPolicy,omitempty"` + HTTPHeaders map[string]string `json:"HttpHeaders,omitempty"` + Oem any `json:"Oem,omitempty"` + Protocol schemas.EventDestinationProtocol `json:"Protocol,omitempty"` + Context string `json:"Context,omitempty"` +} + +func (r *RedfishBaseBMC) CreateEventSubscription( + ctx context.Context, + destination string, + eventFormatType schemas.EventFormatType, + retry schemas.DeliveryRetryPolicy, +) (string, error) { + service := r.client.GetService() + ev, err := service.EventService() + if err != nil { + return "", fmt.Errorf("failed to get event service: %w", err) + } + if !ev.ServiceEnabled { + return "", fmt.Errorf("event service is not enabled") + } + payload := &subscriptionPayload{ + Destination: destination, + EventFormatType: eventFormatType, // event or metricreport + Protocol: schemas.RedfishEventDestinationProtocol, + DeliveryRetryPolicy: retry, + Context: "metal-operator", + } + client := ev.GetClient() + // some implementations (like Dell) do not support ResourceTypes and RegistryPrefixes + if len(ev.ResourceTypes) == 0 { + payload.EventTypes = []schemas.EventType{} + } else { + payload.RegistryPrefixes = []string{""} // Filters by the prefix of the event's MessageId, which points to a Message Registry: [Base, ResourceEvent, iLOEvents] + payload.ResourceTypes = []string{""} // Filters by the schema name (Resource Type) of the event's OriginOfCondition: [Chassis, ComputerSystem, Power] + } + resp, err := client.Post(ev.SubscriptionsLink, payload) + if err != nil { + return "", err + } + defer func() { + _ = resp.Body.Close() + }() + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + return "", fmt.Errorf("failed to create event subscription status code: %d", resp.StatusCode) + } + // return subscription link from returned location + subscriptionLink := resp.Header.Get("Location") + if subscriptionLink == "" { + return "", fmt.Errorf("failed to get subscription link from response header") + } + urlParser, err := url.ParseRequestURI(subscriptionLink) + if err == nil { + subscriptionLink = urlParser.RequestURI() + } + return subscriptionLink, nil +} + +func (r *RedfishBaseBMC) DeleteEventSubscription(ctx context.Context, uri string) error { + service := r.client.GetService() + ev, err := service.EventService() + if err != nil { + return fmt.Errorf("failed to get event service: %w", err) + } + if !ev.ServiceEnabled { + return fmt.Errorf("event service is not enabled") + } + event, err := ev.GetEventSubscription(uri) + if err != nil { + return fmt.Errorf("failed to get event subscription: %w", err) + } + if event == nil { + return nil + } + if err := ev.DeleteEventSubscription(uri); err != nil { + return fmt.Errorf("failed to delete event subscription: %w", err) + } + return nil +} diff --git a/cmd/main.go b/cmd/main.go index cf162bb63..7d32172f5 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -14,6 +14,7 @@ import ( "github.com/ironcore-dev/controller-utils/conditionutils" "github.com/ironcore-dev/metal-operator/internal/cmd/dns" + "github.com/ironcore-dev/metal-operator/internal/serverevents" webhookv1alpha1 "github.com/ironcore-dev/metal-operator/internal/webhook/v1alpha1" "sigs.k8s.io/controller-runtime/pkg/manager" @@ -76,6 +77,9 @@ func main() { // nolint: gocyclo registryPort int registryProtocol string registryURL string + eventPort int + eventURL string + eventProtocol string registryResyncInterval time.Duration webhookPort int enforceFirstBoot bool @@ -125,6 +129,10 @@ func main() { // nolint: gocyclo flag.StringVar(®istryURL, "registry-url", "", "The URL of the registry.") flag.StringVar(®istryProtocol, "registry-protocol", "http", "The protocol to use for the registry.") flag.IntVar(®istryPort, "registry-port", 10000, "The port to use for the registry.") + flag.StringVar(&eventURL, "event-url", "", "The URL of the server events endpoint for alerts and metrics.") + flag.IntVar(&eventPort, "event-port", 10001, "The port to use for the server events endpoint for alerts and metrics.") + flag.StringVar(&eventProtocol, "event-protocol", "http", + "The protocol to use for the server events endpoint for alerts and metrics.") flag.StringVar(&probeImage, "probe-image", "", "Image for the first boot probing of a Server.") flag.StringVar(&probeOSImage, "probe-os-image", "", "OS image for the first boot probing of a Server.") flag.StringVar(&managerNamespace, "manager-namespace", "default", "Namespace the manager is running in.") @@ -210,6 +218,17 @@ func main() { // nolint: gocyclo registryURL = fmt.Sprintf("%s://%s:%d", registryProtocol, registryAddr, registryPort) } + // set the correct event URL by getting the address from the environment + var eventAddr string + if eventURL == "" { + eventAddr = os.Getenv("EVENT_ADDRESS") + if eventAddr == "" { + setupLog.Error(nil, "failed to set the event URL as no address is provided") + } else { + eventURL = fmt.Sprintf("%s://%s:%d", eventProtocol, eventAddr, eventPort) + } + } + // if the enable-http2 flag is false (the default), http/2 should be disabled // due to its vulnerabilities. More specifically, disabling http/2 will // prevent from being vulnerable to the HTTP/2 Stream Cancelation and @@ -355,6 +374,7 @@ func main() { // nolint: gocyclo BMCResetWaitTime: bmcResetWaitingInterval, BMCClientRetryInterval: bmcResetResyncInterval, ManagerNamespace: managerNamespace, + EventURL: eventURL, DNSRecordTemplate: dnsRecordTemplate, Conditions: conditionutils.NewAccessor(conditionutils.AccessorOptions{}), BMCOptions: bmc.Options{ @@ -615,6 +635,21 @@ func main() { // nolint: gocyclo os.Exit(1) } + if eventURL != "" { + if err := mgr.Add(manager.RunnableFunc(func(ctx context.Context) error { + setupLog.Info("starting event server for alerts and metrics", "EventURL", eventURL) + eventServer := serverevents.NewServer(setupLog, fmt.Sprintf(":%d", eventPort)) + if err := eventServer.Start(ctx); err != nil { + return fmt.Errorf("unable to start event server: %w", err) + } + <-ctx.Done() + return nil + })); err != nil { + setupLog.Error(err, "unable to add event runnable to manager") + os.Exit(1) + } + } + setupLog.Info("Starting manager") if err := mgr.Start(ctx); err != nil { setupLog.Error(err, "Failed to run manager") diff --git a/config/crd/bases/metal.ironcore.dev_bmcs.yaml b/config/crd/bases/metal.ironcore.dev_bmcs.yaml index efdefb8fa..7857c9a4d 100644 --- a/config/crd/bases/metal.ironcore.dev_bmcs.yaml +++ b/config/crd/bases/metal.ironcore.dev_bmcs.yaml @@ -246,6 +246,10 @@ spec: - type type: object type: array + eventsSubscriptionLink: + description: EventsSubscriptionLink is the link to the events subscription + of the bmc. + type: string firmwareVersion: description: FirmwareVersion is the version of the firmware currently running on the BMC. @@ -265,6 +269,10 @@ spec: manufacturer: description: Manufacturer is the name of the BMC manufacturer. type: string + metricsReportSubscriptionLink: + description: MetricsReportSubscriptionLink is the link to the metrics + report subscription of the bmc. + type: string model: description: Model is the model number or name of the BMC. type: string diff --git a/dist/chart/templates/crd/metal.ironcore.dev_bmcs.yaml b/dist/chart/templates/crd/metal.ironcore.dev_bmcs.yaml index 6c5de8ec9..c328f965f 100755 --- a/dist/chart/templates/crd/metal.ironcore.dev_bmcs.yaml +++ b/dist/chart/templates/crd/metal.ironcore.dev_bmcs.yaml @@ -252,6 +252,10 @@ spec: - type type: object type: array + eventsSubscriptionLink: + description: EventsSubscriptionLink is the link to the events subscription + of the bmc. + type: string firmwareVersion: description: FirmwareVersion is the version of the firmware currently running on the BMC. @@ -271,6 +275,10 @@ spec: manufacturer: description: Manufacturer is the name of the BMC manufacturer. type: string + metricsReportSubscriptionLink: + description: MetricsReportSubscriptionLink is the link to the metrics + report subscription of the bmc. + type: string model: description: Model is the model number or name of the BMC. type: string diff --git a/docs/api-reference/api.md b/docs/api-reference/api.md index 0a09802c8..5f8678fe7 100644 --- a/docs/api-reference/api.md +++ b/docs/api-reference/api.md @@ -670,6 +670,8 @@ _Appears in:_ | `state` _[BMCState](#bmcstate)_ | State represents the current state of the BMC.
kubebuilder:validation:Enum=Enabled;Error;Pending | Pending | | | `powerState` _[BMCPowerState](#bmcpowerstate)_ | PowerState represents the current power state of the BMC. | | | | `lastResetTime` _[Time](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.35/#time-v1-meta)_ | LastResetTime is the timestamp of the last reset operation performed on the BMC. | | | +| `metricsReportSubscriptionLink` _string_ | MetricsReportSubscriptionLink is the link to the metrics report subscription of the bmc. | | | +| `eventsSubscriptionLink` _string_ | EventsSubscriptionLink is the link to the events subscription of the bmc. | | | | `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.35/#condition-v1-meta) array_ | Conditions represents the latest available observations of the BMC's current state. | | | diff --git a/go.mod b/go.mod index 91e34533b..0f2ed5ae6 100644 --- a/go.mod +++ b/go.mod @@ -73,6 +73,7 @@ require ( github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect + github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/common v0.67.1 // indirect github.com/prometheus/procfs v0.19.1 // indirect github.com/spf13/pflag v1.0.10 // indirect diff --git a/internal/controller/bmc_controller.go b/internal/controller/bmc_controller.go index a87d43a66..5bdd28771 100644 --- a/internal/controller/bmc_controller.go +++ b/internal/controller/bmc_controller.go @@ -24,6 +24,7 @@ import ( metalv1alpha1 "github.com/ironcore-dev/metal-operator/api/v1alpha1" "github.com/ironcore-dev/metal-operator/bmc" "github.com/ironcore-dev/metal-operator/internal/bmcutils" + "github.com/ironcore-dev/metal-operator/internal/serverevents" "github.com/stmcginnis/gofish/schemas" corev1 "k8s.io/api/core/v1" @@ -61,6 +62,7 @@ type BMCReconciler struct { BMCFailureResetDelay time.Duration BMCOptions bmc.Options ManagerNamespace string + EventURL string // BMCResetWaitTime defines the duration to wait after a BMC reset before attempting reconciliation again. BMCResetWaitTime time.Duration // BMCClientRetryInterval defines the duration to requeue reconciliation after a BMC client error/reset/unavailablility. @@ -107,6 +109,14 @@ func (r *BMCReconciler) delete(ctx context.Context, bmcObj *metalv1alpha1.BMC) ( } } + bmcClient, err := bmcutils.GetBMCClientFromBMC(ctx, r.Client, bmcObj, r.Insecure, r.BMCOptions) + if err == nil { + defer bmcClient.Logout() + if err := r.deleteEventSubscription(ctx, bmcClient, bmcObj); err != nil { + return ctrl.Result{}, fmt.Errorf("failed to delete event subscriptions: %w", err) + } + } + if _, err := clientutils.PatchEnsureNoFinalizer(ctx, r.Client, bmcObj, BMCFinalizer); err != nil { return ctrl.Result{}, err } @@ -184,6 +194,10 @@ func (r *BMCReconciler) reconcile(ctx context.Context, bmcObj *metalv1alpha1.BMC } log.V(1).Info("Discovered servers") + if modified, err := r.handleEventSubscriptions(ctx, bmcClient, bmcObj); err != nil || modified { + return ctrl.Result{}, err + } + log.V(1).Info("Reconciled BMC") return ctrl.Result{}, nil } @@ -530,6 +544,61 @@ func (r *BMCReconciler) updateConditions(ctx context.Context, bmcObj *metalv1alp return nil } +func (r *BMCReconciler) handleEventSubscriptions(ctx context.Context, bmcClient bmc.BMC, bmcObj *metalv1alpha1.BMC) (bool, error) { + log := ctrl.LoggerFrom(ctx) + if r.EventURL == "" { + return false, nil + } + log.V(1).Info("Handling event subscriptions for BMC") + modified := false + + if bmcObj.Status.MetricsReportSubscriptionLink == "" { + link, err := serverevents.SubscribeMetricsReport(ctx, r.EventURL, bmcObj.Name, bmcClient) + if err != nil { + return false, fmt.Errorf("failed to subscribe to server metrics report: %w", err) + } + bmcBase := bmcObj.DeepCopy() + bmcObj.Status.MetricsReportSubscriptionLink = link + modified = true + if err := r.Status().Patch(ctx, bmcObj, client.MergeFrom(bmcBase)); err != nil { + return false, fmt.Errorf("failed to patch server status with subscription links: %w", err) + } + } + if bmcObj.Status.EventsSubscriptionLink == "" { + link, err := serverevents.SubscribeEvents(ctx, r.EventURL, bmcObj.Name, bmcClient) + if err != nil { + return false, fmt.Errorf("failed to subscribe to server alerts: %w", err) + } + bmcBase := bmcObj.DeepCopy() + bmcObj.Status.EventsSubscriptionLink = link + modified = true + if err := r.Status().Patch(ctx, bmcObj, client.MergeFrom(bmcBase)); err != nil { + return false, fmt.Errorf("failed to patch server status with subscription links: %w", err) + } + } + return modified, nil +} + +func (r *BMCReconciler) deleteEventSubscription(ctx context.Context, bmcClient bmc.BMC, bmcObj *metalv1alpha1.BMC) error { + log := ctrl.LoggerFrom(ctx) + if r.EventURL == "" { + return nil + } + if bmcObj.Status.MetricsReportSubscriptionLink != "" { + if err := bmcClient.DeleteEventSubscription(ctx, bmcObj.Status.MetricsReportSubscriptionLink); err != nil { + return fmt.Errorf("failed to unsubscribe from server metrics report: %w", err) + } + log.V(1).Info("Unsubscribed from server metrics report") + } + if bmcObj.Status.EventsSubscriptionLink != "" { + if err := bmcClient.DeleteEventSubscription(ctx, bmcObj.Status.EventsSubscriptionLink); err != nil { + return fmt.Errorf("failed to unsubscribe from server events: %w", err) + } + log.V(1).Info("Unsubscribed from server events") + } + return nil +} + func (r *BMCReconciler) enqueueBMCByEndpoint(ctx context.Context, obj client.Object) []ctrl.Request { return []ctrl.Request{ { diff --git a/internal/controller/bmc_controller_test.go b/internal/controller/bmc_controller_test.go index 40b15bf0b..22aab6641 100644 --- a/internal/controller/bmc_controller_test.go +++ b/internal/controller/bmc_controller_test.go @@ -61,6 +61,8 @@ var _ = Describe("BMC Controller", func() { HaveField("Status.State", metalv1alpha1.BMCStateEnabled), HaveField("Status.PowerState", metalv1alpha1.OnPowerState), HaveField("Status.FirmwareVersion", "1.45.455b66-rev4"), + HaveField("Status.MetricsReportSubscriptionLink", Equal("/redfish/v1/EventService/Subscriptions/5")), + HaveField("Status.EventsSubscriptionLink", Equal("/redfish/v1/EventService/Subscriptions/6")), )) By("Ensuring that the Server resource will be created") @@ -143,6 +145,8 @@ var _ = Describe("BMC Controller", func() { HaveField("Status.State", metalv1alpha1.BMCStateEnabled), HaveField("Status.PowerState", metalv1alpha1.OnPowerState), HaveField("Status.FirmwareVersion", "1.45.455b66-rev4"), + HaveField("Status.MetricsReportSubscriptionLink", Equal("/redfish/v1/EventService/Subscriptions/5")), + HaveField("Status.EventsSubscriptionLink", Equal("/redfish/v1/EventService/Subscriptions/6")), )) By("Ensuring that the Server resource has been created") diff --git a/internal/controller/server_controller.go b/internal/controller/server_controller.go index a71cb06d2..04047389f 100644 --- a/internal/controller/server_controller.go +++ b/internal/controller/server_controller.go @@ -186,7 +186,6 @@ func (r *ServerReconciler) delete(ctx context.Context, server *metalv1alpha1.Ser return ctrl.Result{}, err } log.V(1).Info("Ensured that the finalizer has been removed") - log.V(1).Info("Deleted server") return ctrl.Result{}, nil } diff --git a/internal/controller/suite_test.go b/internal/controller/suite_test.go index 9db3050f7..919fc1e45 100644 --- a/internal/controller/suite_test.go +++ b/internal/controller/suite_test.go @@ -14,6 +14,7 @@ import ( "github.com/ironcore-dev/controller-utils/conditionutils" metalv1alpha1 "github.com/ironcore-dev/metal-operator/api/v1alpha1" + "github.com/ironcore-dev/metal-operator/bmc" "github.com/ironcore-dev/metal-operator/bmc/mock/server" "github.com/ironcore-dev/metal-operator/internal/api/macdb" @@ -176,6 +177,7 @@ func SetupTest(redfishMockServers []netip.AddrPort) *corev1.Namespace { ManagerNamespace: ns.Name, BMCResetWaitTime: 400 * time.Millisecond, BMCClientRetryInterval: 25 * time.Millisecond, + EventURL: "http://localhost:8008", DNSRecordTemplate: dnsTemplate, Conditions: accessor, }).SetupWithManager(k8sManager)).To(Succeed()) diff --git a/internal/serverevents/metrics.go b/internal/serverevents/metrics.go new file mode 100644 index 000000000..560598d19 --- /dev/null +++ b/internal/serverevents/metrics.go @@ -0,0 +1,159 @@ +// SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and IronCore contributors +// SPDX-License-Identifier: Apache-2.0 + +package serverevents + +import ( + "strconv" + "strings" + "sync" + "time" + + "github.com/prometheus/client_golang/prometheus" + "sigs.k8s.io/controller-runtime/pkg/metrics" +) + +type MetricEntry struct { + MetricID string + Value float64 + Type string + Unit string + OriginContext string + Source string + Timestamp time.Time +} + +type RedfishEventCollector struct { + lastReadings map[string]MetricEntry + alertCounts map[EventKey]uint64 + mux sync.RWMutex + sensorDesc *prometheus.Desc + alertDesc *prometheus.Desc +} + +type EventKey struct { + Source string + Severity string + EventID string + Component string +} + +// NewRedfishEventCollector initializes a new RedfishEventCollector and registers it with Prometheus. +func NewRedfishEventCollector() *RedfishEventCollector { + c := &RedfishEventCollector{ + lastReadings: make(map[string]MetricEntry), + alertCounts: make(map[EventKey]uint64), + sensorDesc: prometheus.NewDesc( + "redfish_monitor_reading", + "Latest value pushed via Redfish MetricReport event", + []string{"hostname", "metric_id", "type", "unit", "origin_context"}, + nil, + ), + alertDesc: prometheus.NewDesc( + "redfish_event_alert_total", + "Total count of Redfish alerts/events received", + []string{"hostname", "severity", "message_id", "component"}, + nil, + ), + } + metrics.Registry.MustRegister(c) + return c +} + +// UpdateFromMetricsReport processes incoming MetricReport events and updates the internal state. +func (c *RedfishEventCollector) UpdateFromMetricsReport(hostname string, report MetricsReport) { + c.mux.Lock() + defer c.mux.Unlock() + + for _, entry := range report.MetricsValues { + unit := entry.Units + if unit == "" { + unit = "seconds" + } + mType := entry.MetricValueKind + if mType == "" { + // Fallback: Try to guess from the ID + if strings.Contains(strings.ToLower(entry.MetricID), "temp") { + mType = "Temperature" + } else { + mType = "Gauge" + } + } + val, err := strconv.ParseFloat(entry.MetricValue, 64) + if err != nil { + continue + } + key := entry.MetricID + entry.MetricProperty + c.lastReadings[key] = MetricEntry{ + Value: val, + Type: mType, + Unit: unit, + MetricID: entry.MetricID, + OriginContext: entry.MetricProperty, + Source: hostname, + Timestamp: time.Now(), + } + } +} + +// UpdateFromEvent processes incoming Redfish events and updates the alert counts. +func (c *RedfishEventCollector) UpdateFromEvent(hostname string, data EventData) { + c.mux.Lock() + defer c.mux.Unlock() + + for _, event := range data.Events { + // Determine the component from the URI (e.g., .../Sensors/Fan1 -> Fan1) + component := "system" + if event.OriginOfCondition != "" { + parts := strings.Split(strings.TrimRight(event.OriginOfCondition, "/"), "/") + component = parts[len(parts)-1] + } + event.OriginOfCondition = component + key := EventKey{ + Source: hostname, + Severity: event.Severity, + EventID: event.EventID, + Component: component, + } + c.alertCounts[key]++ + } + +} + +// Describe and Collect implement the prometheus.Collector interface to expose metrics. +func (c *RedfishEventCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- c.sensorDesc +} + +// Collect gathers the latest metrics and sends them to Prometheus. +func (c *RedfishEventCollector) Collect(ch chan<- prometheus.Metric) { + c.mux.RLock() + defer c.mux.RUnlock() + + for _, data := range c.lastReadings { + if time.Since(data.Timestamp) > 10*time.Minute { + continue + } + ch <- prometheus.MustNewConstMetric( + c.sensorDesc, + prometheus.GaugeValue, + data.Value, + data.Source, + data.MetricID, + data.Type, + data.Unit, + data.OriginContext, + ) + } + for key, count := range c.alertCounts { + ch <- prometheus.MustNewConstMetric( + c.alertDesc, + prometheus.CounterValue, + float64(count), + key.Source, + key.Severity, + key.EventID, + key.Component, + ) + } +} diff --git a/internal/serverevents/server.go b/internal/serverevents/server.go new file mode 100644 index 000000000..7d1b78749 --- /dev/null +++ b/internal/serverevents/server.go @@ -0,0 +1,132 @@ +// SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and IronCore contributors +// SPDX-License-Identifier: Apache-2.0 + +package serverevents + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "net/http" + "path" + "time" + + "github.com/go-logr/logr" +) + +type Server struct { + addr string + mux *http.ServeMux + log logr.Logger + collector *RedfishEventCollector +} + +type MetricsReport struct { + MetricsValues []MetricsValue `json:"MetricsValues"` +} + +type MetricsValue struct { + MetricID string `json:"MetricId"` + MetricProperty string `json:"MetricProperty"` + MetricValue string `json:"MetricValue"` + Units string `json:"Units"` + MetricValueKind string `json:"MetricValueKind"` + Timestamp string `json:"Timestamp"` + Oem any `json:"Oem"` +} + +type EventData struct { + Events []Event `json:"Alerts"` + Name string `json:"Name"` +} + +type Event struct { + EventID string `json:"EventId"` + Message string `json:"Message"` + Severity string `json:"Severity"` + EventTimestamp string `json:"EventTimestamp"` + OriginOfCondition string `json:"OriginOfCondition"` +} + +func NewServer(log logr.Logger, addr string) *Server { + mux := http.NewServeMux() + server := &Server{ + addr: addr, + mux: mux, + log: log, + collector: NewRedfishEventCollector(), + } + server.routes() + return server +} + +func (s *Server) routes() { + s.mux.HandleFunc("/serverevents/alerts", s.alertHandler) + s.mux.HandleFunc("/serverevents/metricsreport", s.metricsreportHandler) +} + +func (s *Server) alertHandler(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + http.Error(w, "Only POST method is allowed", http.StatusMethodNotAllowed) + return + } + s.log.Info("Received alert data") + // expected path: /serverevents/alerts/{hostname} + hostname := path.Base(r.URL.Path) + eventData := EventData{} + if err := json.NewDecoder(r.Body).Decode(&eventData); err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + s.collector.UpdateFromEvent(hostname, eventData) + w.WriteHeader(http.StatusOK) +} + +func (s *Server) metricsreportHandler(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + http.Error(w, "Only POST method is allowed", http.StatusMethodNotAllowed) + return + } + // expected path: /serverevents/metricsreport/{hostname} + hostname := path.Base(r.URL.Path) + s.log.Info("received metrics report", "hostname", hostname) + metricsReport := MetricsReport{} + if err := json.NewDecoder(r.Body).Decode(&metricsReport); err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + s.collector.UpdateFromMetricsReport(hostname, metricsReport) + w.WriteHeader(http.StatusOK) +} + +// Start starts the server on the specified address and adds logging for key events. +func (s *Server) Start(ctx context.Context) error { + s.log.Info("Starting registry server", "address", s.addr) + server := &http.Server{Addr: s.addr, Handler: s.mux} + + errChan := make(chan error, 1) + go func() { + if err := server.ListenAndServe(); !errors.Is(err, http.ErrServerClosed) { + errChan <- fmt.Errorf("HTTP registry server ListenAndServe: %w", err) + } + }() + select { + case <-ctx.Done(): + s.log.Info("Shutting down registry server...") + shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + if err := server.Shutdown(shutdownCtx); err != nil { + return fmt.Errorf("HTTP server Shutdown: %w", err) + } + s.log.Info("Registry server graciously stopped") + return nil + case err := <-errChan: + shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + if shutdownErr := server.Shutdown(shutdownCtx); shutdownErr != nil { + s.log.Error(shutdownErr, "Error shutting down registry server") + } + return err + } +} diff --git a/internal/serverevents/subscription.go b/internal/serverevents/subscription.go new file mode 100644 index 000000000..cddc3d563 --- /dev/null +++ b/internal/serverevents/subscription.go @@ -0,0 +1,40 @@ +// SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and IronCore contributors +// SPDX-License-Identifier: Apache-2.0 + +package serverevents + +import ( + "context" + "fmt" + + "github.com/ironcore-dev/metal-operator/bmc" + "github.com/stmcginnis/gofish/schemas" +) + +// SubscribeMetricsReport subscribes to Redfish metric reporting events for the given hostname and callback URL. +func SubscribeMetricsReport(ctx context.Context, url, hostname string, bmcClient bmc.BMC) (string, error) { + link, err := bmcClient.CreateEventSubscription( + ctx, + fmt.Sprintf("%s/serverevents/metricsreport/%s", url, hostname), + schemas.MetricReportEventFormatType, + schemas.TerminateAfterRetriesDeliveryRetryPolicy, + ) + if err != nil { + return link, fmt.Errorf("failed to create event subscription: %w", err) + } + return link, nil +} + +// SubscribeEvents creates a Redfish event subscription for events. +func SubscribeEvents(ctx context.Context, url, hostname string, bmcClient bmc.BMC) (string, error) { + link, err := bmcClient.CreateEventSubscription( + ctx, + fmt.Sprintf("%s/serverevents/alerts/%s", url, hostname), + schemas.EventEventFormatType, + schemas.TerminateAfterRetriesDeliveryRetryPolicy, + ) + if err != nil { + return link, fmt.Errorf("failed to create alert subscription: %w", err) + } + return link, nil +} diff --git a/test/serverevents/main.go b/test/serverevents/main.go new file mode 100644 index 000000000..a9c6b05fa --- /dev/null +++ b/test/serverevents/main.go @@ -0,0 +1,37 @@ +// SPDX-FileCopyrightText: 2024 SAP SE or an SAP affiliate company and IronCore contributors +// SPDX-License-Identifier: Apache-2.0 + +package main + +import ( + "flag" + "os" + + "github.com/ironcore-dev/metal-operator/internal/serverevents" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/log/zap" +) + +var ( + setupLog = ctrl.Log.WithName("setup") +) + +func main() { + opts := zap.Options{ + Development: true, + } + opts.BindFlags(flag.CommandLine) + flag.Parse() + + ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) + + ctx := ctrl.SetupSignalHandler() + + setupLog.Info("starting serverevent agent") + server := serverevents.NewServer(setupLog, ":8888") + + if err := server.Start(ctx); err != nil { + setupLog.Error(err, "problem running telemetry server") + os.Exit(1) + } +}