Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 19 additions & 8 deletions example.scrutiny.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
# lowercased automatically. As such, Configuration keys are case-insensitive,
# and should be lowercase in this file to be consistent with usage.


######################################################################
# Version
#
Expand All @@ -26,7 +25,7 @@ web:
# see docs/TROUBLESHOOTING_REVERSE_PROXY.md
# basepath: `/scrutiny`
# leave empty unless behind a path prefixed proxy
basepath: ''
basepath: ""
database:
# can also set absolute path here
location: /opt/scrutiny/config/scrutiny.db
Expand All @@ -40,12 +39,12 @@ web:
# and store the information in the config file. If you're re-using an existing influxdb installation, you'll need to provide
# the `token`
influxdb:
# scheme: 'http'
# scheme: 'http'
host: 0.0.0.0
port: 8086
# token: 'my-token'
# org: 'my-org'
# bucket: 'bucket'
# token: 'my-token'
# org: 'my-org'
# bucket: 'bucket'
retention_policy: true
# if you wish to disable TLS certificate verification,
# when using self-signed certificates for example,
Expand All @@ -54,9 +53,21 @@ web:
# insecure_skip_verify: false

log:
file: '' #absolute or relative paths allowed, eg. web.log
file: "" #absolute or relative paths allowed, eg. web.log
level: INFO

# Optional: ignore or force specific SMART attributes at the host level.
# smart:
# attribute_overrides:
# - protocol: ATA # ATA | NVMe | SCSI
# attribute_id: "187" # string ID as shown in the UI/SMART table
# wwn: "0x5000c5002df89099" # optional: limit to a specific device WWN
# action: ignore # ignore | force_status
# warn_above: 5 # optional: warn when the value exceeds this threshold
# fail_above: 10 # optional: fail when the value exceeds this threshold (takes precedence over warn_above)
# - protocol: NVMe
# attribute_id: "media_errors"
# action: force_status
# status: passed # passed | warn | failed

# Notification "urls" look like the following. For more information about service specific configuration see
# Shoutrrr's documentation: https://containrrr.dev/shoutrrr/services/overview/
Expand Down
3 changes: 3 additions & 0 deletions webapp/backend/pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@ func (c *configuration) Init() error {
c.SetDefault("web.influxdb.tls.insecure_skip_verify", false)
c.SetDefault("web.influxdb.retention_policy", true)

// SMART handling overrides
c.SetDefault("smart.attribute_overrides", []map[string]interface{}{})

//c.SetDefault("disks.include", []string{})
//c.SetDefault("disks.exclude", []string{})

Expand Down
6 changes: 4 additions & 2 deletions webapp/backend/pkg/database/scrutiny_repository_device.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,10 @@ func (sr *scrutinyRepository) UpdateDeviceStatus(ctx context.Context, wwn string
return device, fmt.Errorf("Could not get device from DB: %v", err)
}

device.DeviceStatus = pkg.DeviceStatusSet(device.DeviceStatus, status)
return device, sr.gormClient.Model(&device).Updates(device).Error
// Overwrite with the latest evaluated status so old failure bits do not linger.
device.DeviceStatus = status
// Use map update so status=0 (passed) is persisted; gorm skips zero values in struct Updates.
return device, sr.gormClient.Model(&device).Update("device_status", status).Error
}

func (sr *scrutinyRepository) GetDeviceDetails(ctx context.Context, wwn string) (models.Device, error) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,24 +6,32 @@ import (
"strings"
"time"

"github.com/analogj/scrutiny/webapp/backend/pkg"
"github.com/analogj/scrutiny/webapp/backend/pkg/models"
"github.com/analogj/scrutiny/webapp/backend/pkg/models/collector"
"github.com/analogj/scrutiny/webapp/backend/pkg/models/measurements"
influxdb2 "github.com/influxdata/influxdb-client-go/v2"
"github.com/influxdata/influxdb-client-go/v2/api"
"github.com/mitchellh/mapstructure"
log "github.com/sirupsen/logrus"
)

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// SMART
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
func (sr *scrutinyRepository) SaveSmartAttributes(ctx context.Context, wwn string, collectorSmartData collector.SmartInfo) (measurements.Smart, error) {
sr.logger.Infof("SaveSmartAttributes called for wwn=%s", wwn)
Copy link

Copilot AI Dec 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] The Infof log at this line will log on every SMART attribute save operation, which could be quite frequent in production. Consider using Debugf instead for this type of operational logging to avoid log verbosity.

Suggested change
sr.logger.Infof("SaveSmartAttributes called for wwn=%s", wwn)
sr.logger.Debugf("SaveSmartAttributes called for wwn=%s", wwn)

Copilot uses AI. Check for mistakes.
deviceSmartData := measurements.Smart{}
err := deviceSmartData.FromCollectorSmartInfo(wwn, collectorSmartData)
if err != nil {
sr.logger.Errorln("Could not process SMART metrics", err)
return measurements.Smart{}, err
}

// apply host-level attribute overrides before persisting or notifying
attributeOverrides := sr.loadAttributeOverrides()
sr.applyAttributeOverrides(&deviceSmartData, wwn, attributeOverrides)
Comment on lines +31 to +33
Copy link

Copilot AI Dec 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The loadAttributeOverrides() function is called on every SMART data save operation, causing repeated configuration parsing and map allocation. Consider caching the parsed overrides (e.g., loading once during repository initialization or using a cached value with a reload mechanism) to improve performance, especially for systems with many devices reporting frequently.

Copilot uses AI. Check for mistakes.

tags, fields := deviceSmartData.Flatten()

// write point immediately
Expand Down Expand Up @@ -196,11 +204,195 @@ func (sr *scrutinyRepository) generateSmartAttributesSubquery(wwn string, durati
}

partialQueryStr = append(partialQueryStr, `|> aggregateWindow(every: 1d, fn: last, createEmpty: false)`)

if selectEntries > 0 {
partialQueryStr = append(partialQueryStr, fmt.Sprintf(`|> tail(n: %d, offset: %d)`, selectEntries, selectEntriesOffset))
}
partialQueryStr = append(partialQueryStr, "|> schema.fieldsAsCols()")

return strings.Join(partialQueryStr, "\n")
}

// loadAttributeOverrides retrieves the user-provided overrides from configuration.
//
// It reads the "smart.attribute_overrides" key through sr.appConfig and decodes
// each entry into a models.AttributeOverride. If the key cannot be unmarshalled,
// the failure is logged at debug level and an empty slice is returned. Entries
// that fail to decode are logged and skipped rather than aborting the load.
func (sr *scrutinyRepository) loadAttributeOverrides() []models.AttributeOverride {
// Load raw maps so we can detect presence of threshold keys even when value is zero.
rawOverrides := []map[string]interface{}{}
if err := sr.appConfig.UnmarshalKey("smart.attribute_overrides", &rawOverrides); err != nil {
sr.logger.Debugf("failed to parse smart.attribute_overrides: %v", err)
return []models.AttributeOverride{}
}

overrides := make([]models.AttributeOverride, 0, len(rawOverrides))
for _, raw := range rawOverrides {
var ao models.AttributeOverride
if err := mapstructure.Decode(raw, &ao); err != nil {
sr.logger.Debugf("failed to decode attribute override entry: %v", err)
continue
}
// Key presence, not the decoded value, marks a threshold as configured,
// so an explicit threshold of 0 is still treated as set.
if _, ok := raw["warn_above"]; ok {
ao.WarnAboveSet = true
}
if _, ok := raw["fail_above"]; ok {
ao.FailAboveSet = true
}
overrides = append(overrides, ao)
}

sr.logger.Infof("Loaded %d attribute overrides from config", len(overrides))
Copy link

Copilot AI Dec 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] This Infof log is called on every SMART data upload and will execute the loop iteration even when there are no overrides configured (most common case). The logging should be moved inside the if len(overrides) == 0 check or reduced to Debugf level to avoid excessive logging in production.

Suggested change
sr.logger.Infof("Loaded %d attribute overrides from config", len(overrides))
sr.logger.Debugf("Loaded %d attribute overrides from config", len(overrides))

Copilot uses AI. Check for mistakes.
// Dump every decoded override, including the "set" flags, at debug level.
for i, o := range overrides {
sr.logger.Debugf(" Override %d: protocol=%s attributeId=%s wwn=%s action=%s warnAbove=%d warnSet=%t failAbove=%d failSet=%t", i, o.Protocol, o.AttributeId, o.WWN, o.Action, o.WarnAbove, o.WarnAboveSet, o.FailAbove, o.FailAboveSet)
}
return overrides
}

// applyAttributeOverrides adjusts attribute statuses according to configured overrides
// and recomputes the device status accordingly (while preserving SMART failure bits).
//
// smart is modified in place: matching entries of smart.Attributes are replaced
// and smart.Status is rebuilt. wwn identifies the device for WWN-scoped
// overrides. When overrides is empty the function returns without touching smart.
func (sr *scrutinyRepository) applyAttributeOverrides(smart *measurements.Smart, wwn string, overrides []models.AttributeOverride) {
if len(overrides) == 0 {
return
}

sr.logger.Debugf("Applying attribute overrides to device wwn=%s protocol=%s with %d attributes", wwn, smart.DeviceProtocol, len(smart.Attributes))

// Remember whether the SMART failure flag was set before the status is reset.
failedSmart := pkg.DeviceStatusHas(smart.Status, pkg.DeviceStatusFailedSmart)
// reset and rebuild device status; keep SMART failure flag intact.
smart.Status = pkg.DeviceStatusPassed
if failedSmart {
smart.Status = pkg.DeviceStatusFailedSmart
}

for attrKey, attrData := range smart.Attributes {
// Overrides are looked up using the attribute's map key as its identifier.
override := sr.matchingOverride(smart.DeviceProtocol, wwn, attrKey, overrides)
if override != nil {
sr.logger.Infof("Applying override to attribute %s: action=%s", attrKey, override.Action)
Copy link

Copilot AI Dec 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] This Infof log is inside a loop over all attributes and will generate a log entry for every matched override on every SMART upload. Consider using Debugf instead to reduce log verbosity in production.

Suggested change
sr.logger.Infof("Applying override to attribute %s: action=%s", attrKey, override.Action)
sr.logger.Debugf("Applying override to attribute %s: action=%s", attrKey, override.Action)

Copilot uses AI. Check for mistakes.
// Store the overridden attribute back under the same key.
attrData = sr.applyOverrideToAttribute(attrData, *override)
smart.Attributes[attrKey] = attrData
}

// rebuild device status from attribute statuses after overrides are applied
// (this check runs for every attribute, whether or not an override matched).
if pkg.AttributeStatusHas(attrData.GetStatus(), pkg.AttributeStatusFailedScrutiny) {
smart.Status = pkg.DeviceStatusSet(smart.Status, pkg.DeviceStatusFailedScrutiny)
}
}
}

// matchingOverride returns the first entry of overrides that applies to the
// given protocol, device wwn and attributeId, or nil when none applies.
//
// Matching rules, checked in order for each entry:
//   - Protocol must equal protocol, ignoring case.
//   - AttributeId, when non-empty, must equal attributeId exactly; an empty
//     AttributeId matches every attribute.
//   - WWN, when non-empty, must equal wwn ignoring case; an empty WWN matches
//     every device.
//
// The returned pointer refers to a local copy of the matched entry, not to the
// element inside overrides.
func (sr *scrutinyRepository) matchingOverride(protocol string, wwn string, attributeId string, overrides []models.AttributeOverride) *models.AttributeOverride {
for ndx := range overrides {
// Copy the entry; every check below reads from this copy.
o := overrides[ndx]
sr.logger.Debugf("Checking override %d: seeking protocol=%s attributeId=%s wwn=%s against override protocol=%s attributeId=%s wwn=%s", ndx, protocol, attributeId, wwn, o.Protocol, o.AttributeId, o.WWN)
if !strings.EqualFold(o.Protocol, protocol) {
sr.logger.Debugf(" Protocol mismatch")
continue
}
if o.AttributeId != "" && o.AttributeId != attributeId {
sr.logger.Debugf(" AttributeId mismatch: '%s' != '%s'", o.AttributeId, attributeId)
continue
}
if o.WWN != "" && !strings.EqualFold(o.WWN, wwn) {
sr.logger.Debugf(" WWN mismatch: '%s' != '%s'", o.WWN, wwn)
continue
}

sr.logger.Debugf(" MATCH!")
return &o
Copy link

Copilot AI Dec 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Returning a pointer to the loop variable o can lead to unexpected behavior. The loop variable is reused in each iteration, so all returned pointers could point to the same memory location with the last iteration's value. Copy the value or return by index: return &overrides[ndx] instead of return &o.

Suggested change
return &o
return &overrides[ndx]

Copilot uses AI. Check for mistakes.
}
return nil
}

// applyOverrideToAttribute applies a single override to attr and returns the
// resulting attribute.
//
// The action is trimmed and lower-cased, then evaluated in this order:
//  1. "ignore" marks the attribute as passed.
//  2. If a warn or fail threshold is configured, the result of
//     applyThresholdOverride is returned, whatever the action is.
//  3. "force_status" sets the status named by override.Status: "warn"/"warning"
//     give a warning, "failed"/"fail"/"error" give a failure, and any other
//     value (including empty) gives passed.
//  4. Any other action returns attr unchanged.
func (sr *scrutinyRepository) applyOverrideToAttribute(attr measurements.SmartAttribute, override models.AttributeOverride) measurements.SmartAttribute {
action := strings.ToLower(strings.TrimSpace(override.Action))
// threshold overrides take precedence over generic force_status/pass logic (unless ignored)
if action == "ignore" {
return setAttributeStatus(attr, pkg.AttributeStatusPassed, "Ignored by attribute override")
}

if override.FailAboveSet || override.WarnAboveSet {
return applyThresholdOverride(attr, override)
}

switch action {
// This case cannot be reached: "ignore" already returned above.
case "ignore":
return setAttributeStatus(attr, pkg.AttributeStatusPassed, "Ignored by attribute override")
Comment on lines +314 to +315
Copy link

Copilot AI Dec 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The "ignore" action is handled twice: once at the beginning (lines 305-307) and again in the switch statement (lines 314-315). The first check makes the switch case unreachable. Remove the duplicate case from the switch statement.

Suggested change
case "ignore":
return setAttributeStatus(attr, pkg.AttributeStatusPassed, "Ignored by attribute override")

Copilot uses AI. Check for mistakes.
case "force_status":
// The requested status is normalised the same way as the action.
status := strings.ToLower(strings.TrimSpace(override.Status))
switch status {
case "warn", "warning":
return setAttributeStatus(attr, pkg.AttributeStatusWarningScrutiny, "Status forced to warning by attribute override")
case "failed", "fail", "error":
return setAttributeStatus(attr, pkg.AttributeStatusFailedScrutiny, "Status forced to failed by attribute override")
default:
return setAttributeStatus(attr, pkg.AttributeStatusPassed, "Status forced to passed by attribute override")
}
default:
return attr
}
}

// applyThresholdOverride evaluates attr against the warn/fail thresholds carried
// by override and returns the attribute with its status and reason updated.
//
// The comparison value comes from currentAttributeValue. A threshold only
// counts when its "Set" flag is true, and the comparison is strictly
// greater-than. The fail threshold is checked first, so it wins when both are
// exceeded. When neither threshold is exceeded the attribute is marked passed.
func applyThresholdOverride(attr measurements.SmartAttribute, override models.AttributeOverride) measurements.SmartAttribute {
	current := currentAttributeValue(attr)

	switch {
	case override.FailAboveSet && current > override.FailAbove:
		// Checked before the warn threshold so that fail takes priority.
		return setAttributeStatus(
			attr,
			pkg.AttributeStatusFailedScrutiny,
			fmt.Sprintf("Value %d exceeded fail threshold %d", current, override.FailAbove),
		)
	case override.WarnAboveSet && current > override.WarnAbove:
		return setAttributeStatus(
			attr,
			pkg.AttributeStatusWarningScrutiny,
			fmt.Sprintf("Value %d exceeded warn threshold %d", current, override.WarnAbove),
		)
	default:
		// No configured threshold was exceeded.
		return setAttributeStatus(attr, pkg.AttributeStatusPassed, "Status forced to passed by attribute override")
	}
}

// currentAttributeValue returns the numeric value that override thresholds are
// compared against.
//
//   - ATA: the raw count when it is non-zero, otherwise the transformed value
//     (which is 0 when no transformed value is present).
//   - NVMe/SCSI: the transformed value when it is non-zero, otherwise Value.
//   - Any other attribute type: 0.
func currentAttributeValue(attr measurements.SmartAttribute) int64 {
	switch v := attr.(type) {
	case *measurements.SmartAtaAttribute:
		if v.RawValue != 0 {
			return v.RawValue
		}
		// Never fall back to v.Value for ATA. A raw count of 0 must compare as 0
		// against warn_above/fail_above.
		// NOTE(review): v.Value is presumably the normalised SMART value (often
		// around 100 on a healthy drive); returning it here would make an
		// attribute with zero raw errors exceed small thresholds. Confirm
		// against the SmartAtaAttribute definition.
		return v.TransformedValue
	case *measurements.SmartNvmeAttribute:
		if v.TransformedValue != 0 {
			return v.TransformedValue
		}
		return v.Value
	case *measurements.SmartScsiAttribute:
		if v.TransformedValue != 0 {
			return v.TransformedValue
		}
		return v.Value
	default:
		// Unknown attribute implementation: there is nothing to compare.
		return 0
	}
}

// setAttributeStatus writes status and reason onto the concrete attribute behind
// attr and resets its FailureRate to 0.
//
// The ATA, NVMe and SCSI attribute types are handled through their pointer
// forms, so the change is made in place on the value the caller passed in. The
// same pointer is returned. Any other implementation is returned untouched.
func setAttributeStatus(attr measurements.SmartAttribute, status pkg.AttributeStatus, reason string) measurements.SmartAttribute {
	if ata, ok := attr.(*measurements.SmartAtaAttribute); ok {
		ata.Status, ata.StatusReason, ata.FailureRate = status, reason, 0
		return ata
	}
	if nvme, ok := attr.(*measurements.SmartNvmeAttribute); ok {
		nvme.Status, nvme.StatusReason, nvme.FailureRate = status, reason, 0
		return nvme
	}
	if scsi, ok := attr.(*measurements.SmartScsiAttribute); ok {
		scsi.Status, scsi.StatusReason, scsi.FailureRate = status, reason, 0
		return scsi
	}
	// Unrecognised attribute type: nothing to update.
	return attr
}
Loading