@@ -609,34 +609,11 @@ func CollectDomain(ch chan<- prometheus.Metric, l *libvirt.Libvirt, domain domai
609609 rState uint8
610610 rvirCpu uint16
611611 rmaxmem , rmemory , rcputime uint64
612+ domainInfoAvailable bool = true
612613 )
613- type rDomainStatsState struct {
614- rState uint8
615- rvirCpu uint16
616- rmaxmem , rmemory , rcputime uint64
617- err error
618- }
619-
620- chDomainStats := make (chan rDomainStatsState , 1 )
621- go func () {
622- var data rDomainStatsState
623- data .rState , data .rmaxmem , data .rmemory , data .rvirCpu , data .rcputime , data .err = l .DomainGetInfo (domain .libvirtDomain )
624- chDomainStats <- data
625- }()
626614
627- select {
628- case res := <- chDomainStats :
629- if res .err != nil {
630- return res .err , false
631- }
632-
633- rState = res .rState
634- rvirCpu = res .rvirCpu
635- rmaxmem = res .rmaxmem
636- rmemory = res .rmemory
637- rcputime = res .rcputime
638- case <- time .After (time .Duration (timeout ) * time .Second ):
639- return fmt .Errorf ("call to DomainGetInfo has timed out" ), true
615+ promLabels := []string {
616+ domain .domainName ,
640617 }
641618
642619 openstackInfoLabels := []string {
@@ -657,19 +634,79 @@ func CollectDomain(ch chan<- prometheus.Metric, l *libvirt.Libvirt, domain domai
657634 domain .os_type_machine ,
658635 }
659636
660- promLabels := []string {
661- domain .domainName ,
662- }
663-
637+ // Always emit these info metrics for all domains, regardless of state
664638 ch <- prometheus .MustNewConstMetric (libvirtDomainInfoDesc , prometheus .GaugeValue , 1.0 , infoLabels ... )
665639 ch <- prometheus .MustNewConstMetric (libvirtDomainOpenstackInfoDesc , prometheus .GaugeValue , 1.0 , openstackInfoLabels ... )
666640
641+ // Try to get domain info, but don't fail completely if it doesn't work
642+ type rDomainStatsState struct {
643+ rState uint8
644+ rvirCpu uint16
645+ rmaxmem , rmemory , rcputime uint64
646+ err error
647+ }
648+
649+ chDomainStats := make (chan rDomainStatsState , 1 )
650+ go func () {
651+ var data rDomainStatsState
652+ data .rState , data .rmaxmem , data .rmemory , data .rvirCpu , data .rcputime , data .err = l .DomainGetInfo (domain .libvirtDomain )
653+ chDomainStats <- data
654+ }()
655+
656+ select {
657+ case res := <- chDomainStats :
658+ if res .err != nil {
659+ logger .Debug ("DomainGetInfo failed, will attempt to get domain state separately" , "domain" , domain .domainName , "error" , res .err )
660+ domainInfoAvailable = false
661+ } else {
662+ rState = res .rState
663+ rvirCpu = res .rvirCpu
664+ rmaxmem = res .rmaxmem
665+ rmemory = res .rmemory
666+ rcputime = res .rcputime
667+ }
668+ case <- time .After (time .Duration (timeout ) * time .Second ):
669+ logger .Debug ("DomainGetInfo timed out, will attempt to get domain state separately" , "domain" , domain .domainName )
670+ domainInfoAvailable = false
671+ }
672+
673+ // If DomainGetInfo failed, try to get just the domain state
674+ if ! domainInfoAvailable {
675+ type rDomainState struct {
676+ state int32
677+ err error
678+ }
679+ chDomainState := make (chan rDomainState , 1 )
680+ go func () {
681+ var data rDomainState
682+ var reason int32
683+ data .state , reason , data .err = l .DomainGetState (domain .libvirtDomain , 0 )
684+ _ = reason // ignore reason for now
685+ chDomainState <- data
686+ }()
687+
688+ select {
689+ case res := <- chDomainState :
690+ if res .err != nil {
691+ logger .Error ("failed to get domain state" , "domain" , domain .domainName , "error" , res .err )
692+ return res .err , false
693+ }
694+ rState = uint8 (res .state )
695+ case <- time .After (time .Duration (timeout ) * time .Second ):
696+ return fmt .Errorf ("call to DomainGetState has timed out" ), true
697+ }
698+ }
699+
700+ // Always emit domain state - this is the key metric for all domains
667701 ch <- prometheus .MustNewConstMetric (libvirtDomainState , prometheus .GaugeValue , float64 (rState ), append (promLabels , domainState [libvirt_schema .DomainState (rState )])... )
668702
669- ch <- prometheus .MustNewConstMetric (libvirtDomainInfoMaxMemDesc , prometheus .GaugeValue , float64 (rmaxmem )* 1024 , promLabels ... )
670- ch <- prometheus .MustNewConstMetric (libvirtDomainInfoMemoryDesc , prometheus .GaugeValue , float64 (rmemory )* 1024 , promLabels ... )
671- ch <- prometheus .MustNewConstMetric (libvirtDomainInfoNrVirtCpuDesc , prometheus .GaugeValue , float64 (rvirCpu ), promLabels ... )
672- ch <- prometheus .MustNewConstMetric (libvirtDomainInfoCpuTimeDesc , prometheus .CounterValue , float64 (rcputime )/ 1e9 , promLabels ... )
703+ // Only emit detailed domain info metrics if DomainGetInfo succeeded
704+ if domainInfoAvailable {
705+ ch <- prometheus .MustNewConstMetric (libvirtDomainInfoMaxMemDesc , prometheus .GaugeValue , float64 (rmaxmem )* 1024 , promLabels ... )
706+ ch <- prometheus .MustNewConstMetric (libvirtDomainInfoMemoryDesc , prometheus .GaugeValue , float64 (rmemory )* 1024 , promLabels ... )
707+ ch <- prometheus .MustNewConstMetric (libvirtDomainInfoNrVirtCpuDesc , prometheus .GaugeValue , float64 (rvirCpu ), promLabels ... )
708+ ch <- prometheus .MustNewConstMetric (libvirtDomainInfoCpuTimeDesc , prometheus .CounterValue , float64 (rcputime )/ 1e9 , promLabels ... )
709+ }
673710
674711 var isActive int32
675712 if isActive , err = l .DomainIsActive (domain .libvirtDomain ); err != nil {
0 commit comments