66
77import argparse , openstack , logging , time , yaml
88from dataclasses import dataclass
9- from prometheus_client import start_http_server , Gauge
9+ from prometheus_client import start_http_server , Gauge , Counter
1010
1111log = logging .getLogger ("zuul-capacity" )
1212
@@ -49,16 +49,24 @@ def get_providers(nodepool_yaml):
4949 providers [provider ["name" ]] = Provider .from_nodepool (provider )
5050 return providers
5151
def update_provider_metric(metrics, name, provider):
    """Publish the current usage figures of a single provider.

    Fetches the provider's resources with ``get_resources(provider.cloud)``
    and writes three values, each labelled with ``cloud=name``:

    * ``metrics["instances"]`` -- number of resources returned,
    * ``metrics["cpu"]`` -- sum of every resource's ``cpu`` attribute,
    * ``metrics["mem"]`` -- sum of every resource's ``mem`` attribute.

    Args:
        metrics: mapping holding the "instances", "cpu" and "mem" metric
            objects; each must support ``.labels(cloud=...).set(value)``.
        name: provider name, used as the ``cloud`` label value.
        provider: object exposing a ``cloud`` attribute that is handed to
            ``get_resources``.

    Any exception raised while fetching or summing propagates to the caller.
    """
    resources = get_resources(provider.cloud)
    metrics["instances"].labels(cloud=name).set(len(resources))

    # Compute both totals before publishing either one, so a resource with
    # a missing attribute leaves the cpu/mem metrics untouched.
    total_cpu = sum(item.cpu for item in resources)
    total_mem = sum(item.mem for item in resources)

    metrics["cpu"].labels(cloud=name).set(total_cpu)
    metrics["mem"].labels(cloud=name).set(total_mem)
61+
def update_providers_metric(metrics, providers):
    """Refresh the metrics of every provider, isolating per-provider failures.

    Calls ``update_provider_metric`` once for each entry of ``providers``.
    When that call raises, the failure is logged with its traceback, the
    ``metrics["error"]`` counter labelled with ``cloud=name`` is incremented,
    and processing continues with the next provider.

    Args:
        metrics: mapping of metric objects; must contain an "error" entry
            supporting ``.labels(cloud=...).inc()`` in addition to whatever
            ``update_provider_metric`` uses.
        providers: mapping of provider name to provider object.
    """
    for name, provider in providers.items():
        try:
            update_provider_metric(metrics, name, provider)
        except Exception:
            # Deliberately broad: this is the per-provider boundary, and one
            # failing provider must not prevent the others from updating.
            # The name goes through a %s placeholder; passing extra
            # positional args without placeholders makes the logging module
            # fail to format the record. log.exception already records the
            # active exception and its traceback.
            log.exception("Couldn't get provider %s", name)
            metrics["error"].labels(cloud=name).inc()
69+
6270
6371def usage ():
6472 parser = argparse .ArgumentParser ()
@@ -74,6 +82,7 @@ def main():
7482 instances = Gauge ('zuul_instances_total' , 'Instance count' , ['cloud' ]),
7583 mem = Gauge ('zuul_instances_mem' , 'Memory usage' , ['cloud' ]),
7684 cpu = Gauge ('zuul_instances_cpu' , 'VCPU usage' , ['cloud' ]),
85+ error = Counter ("zuul_provider_error" , 'API call error' , ['cloud' ])
7786 )
7887
7988 providers = get_providers (args .nodepool )
0 commit comments