Skip to content

Commit c179545

Browse files
ADometrius and claude committed
fix: export domain info metrics for all domain states
Previously, libvirt_domain_info_state and related info metrics were only exported for running domains. This was because DomainGetInfo() fails for inactive domains, causing the CollectDomain function to exit early before emitting any metrics.

This change ensures that domain info and state metrics are always exported regardless of domain state by:

1. Moving domain info metric emission before the DomainGetInfo() call
2. Adding a graceful fallback to DomainGetState() when DomainGetInfo() fails
3. Always emitting libvirt_domain_info_state for complete domain visibility

Now users can monitor all domains (running, stopped, paused, crashed, etc.) and get proper domain state information for inventory and alerting purposes.

Fixes missing metrics for inactive domains while preserving existing behavior for performance metrics that only apply to running domains.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
1 parent 2b03995 commit c179545

File tree

1 file changed

+71
-34
lines changed

1 file changed

+71
-34
lines changed

pkg/exporter/prometheus-libvirt-exporter.go

Lines changed: 71 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -609,34 +609,11 @@ func CollectDomain(ch chan<- prometheus.Metric, l *libvirt.Libvirt, domain domai
609609
rState uint8
610610
rvirCpu uint16
611611
rmaxmem, rmemory, rcputime uint64
612+
domainInfoAvailable bool = true
612613
)
613-
type rDomainStatsState struct {
614-
rState uint8
615-
rvirCpu uint16
616-
rmaxmem, rmemory, rcputime uint64
617-
err error
618-
}
619-
620-
chDomainStats := make(chan rDomainStatsState, 1)
621-
go func() {
622-
var data rDomainStatsState
623-
data.rState, data.rmaxmem, data.rmemory, data.rvirCpu, data.rcputime, data.err = l.DomainGetInfo(domain.libvirtDomain)
624-
chDomainStats <- data
625-
}()
626614

627-
select {
628-
case res := <-chDomainStats:
629-
if res.err != nil {
630-
return res.err, false
631-
}
632-
633-
rState = res.rState
634-
rvirCpu = res.rvirCpu
635-
rmaxmem = res.rmaxmem
636-
rmemory = res.rmemory
637-
rcputime = res.rcputime
638-
case <-time.After(time.Duration(timeout) * time.Second):
639-
return fmt.Errorf("call to DomainGetInfo has timed out"), true
615+
promLabels := []string{
616+
domain.domainName,
640617
}
641618

642619
openstackInfoLabels := []string{
@@ -657,19 +634,79 @@ func CollectDomain(ch chan<- prometheus.Metric, l *libvirt.Libvirt, domain domai
657634
domain.os_type_machine,
658635
}
659636

660-
promLabels := []string{
661-
domain.domainName,
662-
}
663-
637+
// Always emit these info metrics for all domains, regardless of state
664638
ch <- prometheus.MustNewConstMetric(libvirtDomainInfoDesc, prometheus.GaugeValue, 1.0, infoLabels...)
665639
ch <- prometheus.MustNewConstMetric(libvirtDomainOpenstackInfoDesc, prometheus.GaugeValue, 1.0, openstackInfoLabels...)
666640

641+
// Try to get domain info, but don't fail completely if it doesn't work
642+
type rDomainStatsState struct {
643+
rState uint8
644+
rvirCpu uint16
645+
rmaxmem, rmemory, rcputime uint64
646+
err error
647+
}
648+
649+
chDomainStats := make(chan rDomainStatsState, 1)
650+
go func() {
651+
var data rDomainStatsState
652+
data.rState, data.rmaxmem, data.rmemory, data.rvirCpu, data.rcputime, data.err = l.DomainGetInfo(domain.libvirtDomain)
653+
chDomainStats <- data
654+
}()
655+
656+
select {
657+
case res := <-chDomainStats:
658+
if res.err != nil {
659+
logger.Debug("DomainGetInfo failed, will attempt to get domain state separately", "domain", domain.domainName, "error", res.err)
660+
domainInfoAvailable = false
661+
} else {
662+
rState = res.rState
663+
rvirCpu = res.rvirCpu
664+
rmaxmem = res.rmaxmem
665+
rmemory = res.rmemory
666+
rcputime = res.rcputime
667+
}
668+
case <-time.After(time.Duration(timeout) * time.Second):
669+
logger.Debug("DomainGetInfo timed out, will attempt to get domain state separately", "domain", domain.domainName)
670+
domainInfoAvailable = false
671+
}
672+
673+
// If DomainGetInfo failed, try to get just the domain state
674+
if !domainInfoAvailable {
675+
type rDomainState struct {
676+
state int32
677+
err error
678+
}
679+
chDomainState := make(chan rDomainState, 1)
680+
go func() {
681+
var data rDomainState
682+
var reason int32
683+
data.state, reason, data.err = l.DomainGetState(domain.libvirtDomain, 0)
684+
_ = reason // ignore reason for now
685+
chDomainState <- data
686+
}()
687+
688+
select {
689+
case res := <-chDomainState:
690+
if res.err != nil {
691+
logger.Error("failed to get domain state", "domain", domain.domainName, "error", res.err)
692+
return res.err, false
693+
}
694+
rState = uint8(res.state)
695+
case <-time.After(time.Duration(timeout) * time.Second):
696+
return fmt.Errorf("call to DomainGetState has timed out"), true
697+
}
698+
}
699+
700+
// Always emit domain state - this is the key metric for all domains
667701
ch <- prometheus.MustNewConstMetric(libvirtDomainState, prometheus.GaugeValue, float64(rState), append(promLabels, domainState[libvirt_schema.DomainState(rState)])...)
668702

669-
ch <- prometheus.MustNewConstMetric(libvirtDomainInfoMaxMemDesc, prometheus.GaugeValue, float64(rmaxmem)*1024, promLabels...)
670-
ch <- prometheus.MustNewConstMetric(libvirtDomainInfoMemoryDesc, prometheus.GaugeValue, float64(rmemory)*1024, promLabels...)
671-
ch <- prometheus.MustNewConstMetric(libvirtDomainInfoNrVirtCpuDesc, prometheus.GaugeValue, float64(rvirCpu), promLabels...)
672-
ch <- prometheus.MustNewConstMetric(libvirtDomainInfoCpuTimeDesc, prometheus.CounterValue, float64(rcputime)/1e9, promLabels...)
703+
// Only emit detailed domain info metrics if DomainGetInfo succeeded
704+
if domainInfoAvailable {
705+
ch <- prometheus.MustNewConstMetric(libvirtDomainInfoMaxMemDesc, prometheus.GaugeValue, float64(rmaxmem)*1024, promLabels...)
706+
ch <- prometheus.MustNewConstMetric(libvirtDomainInfoMemoryDesc, prometheus.GaugeValue, float64(rmemory)*1024, promLabels...)
707+
ch <- prometheus.MustNewConstMetric(libvirtDomainInfoNrVirtCpuDesc, prometheus.GaugeValue, float64(rvirCpu), promLabels...)
708+
ch <- prometheus.MustNewConstMetric(libvirtDomainInfoCpuTimeDesc, prometheus.CounterValue, float64(rcputime)/1e9, promLabels...)
709+
}
673710

674711
var isActive int32
675712
if isActive, err = l.DomainIsActive(domain.libvirtDomain); err != nil {

0 commit comments

Comments
 (0)