Skip to content

Commit 0be6cd0

Browse files
authored
Use prometheus instead of kube (ansible#198)
* Use prometheus instead of kube * Create unit tests * Fix kubernetes tests * Simplify generated code * SonarCloud fixes * use integer for the test * Add details in the log * Get the data for 59m59s and not 1h * Update test because 59m:59s collection rather than 1h * test current_hour_start * Rename METRICS_UTILITY_USAGE_BASED_BILLING_ENABLED * Add timeline in logs * Fix return value * As per comments * Add ca_cert_path as a prometheus_client attribute * Remove commented code --------- Signed-off-by: itdove <dvernier@redhat.com>
1 parent c0644e3 commit 0be6cd0

10 files changed

Lines changed: 1338 additions & 294 deletions

File tree

Makefile

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
CONTAINER_ENGINE ?= docker
2+
13
help:
24
@echo help sync test coverage lint fix compose clean psql
35

@@ -19,12 +21,12 @@ fix:
1921
uv run ruff format
2022

2123
compose:
22-
docker compose -f tools/docker/docker-compose.yaml up
24+
${CONTAINER_ENGINE} compose -f tools/docker/docker-compose.yaml up
2325

2426
clean:
25-
docker compose -f tools/docker/docker-compose.yaml down -v
27+
${CONTAINER_ENGINE} compose -f tools/docker/docker-compose.yaml down -v
2628

2729
psql:
28-
docker compose -f tools/docker/docker-compose.yaml exec postgres psql -U awx
30+
${CONTAINER_ENGINE} compose -f tools/docker/docker-compose.yaml exec postgres psql -U awx
2931

3032
.PHONY: help sync test coverage lint fix compose clean psql

metrics_utility/automation_controller_billing/collectors.py

Lines changed: 88 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import platform
55

66
from datetime import datetime, timezone
7+
from typing import Tuple
78

89
import distro
910

@@ -14,14 +15,14 @@
1415
from django.db.utils import ProgrammingError
1516
from django.utils.timezone import now, timedelta
1617
from django.utils.translation import gettext_lazy as _
17-
from kubernetes import client
18-
from kubernetes import config as kube_config
1918

2019
from metrics_utility.base import CsvFileSplitter, register
2120
from metrics_utility.base.utils import get_max_gather_period_days, get_optional_collectors
2221
from metrics_utility.exceptions import MetricsException, MissingRequiredEnvVar
2322
from metrics_utility.logger import logger, logger_info_level
2423

24+
from .prometheus_client import PrometheusClient
25+
2526

2627
"""
2728
This module is used to define metrics collected by
@@ -551,10 +552,17 @@ def total_workers_vcpu(since, full_path, until, **kwargs):
551552
raise MissingRequiredEnvVar('environment variable METRICS_UTILITY_CLUSTER_NAME is not set')
552553

553554
now = datetime.now(timezone.utc)
554-
555-
info = {'cluster_name': cluster_name, 'timestamp': now.isoformat(), 'nodes': []}
556-
# If METRICS_UTILITY_USAGE_BASED_BILLING_ENABLED is not set or set to false then it returns 1
557-
usage_based_billing_enabled_str = os.getenv('METRICS_UTILITY_USAGE_BASED_BILLING_ENABLED')
555+
current_ts = now.timestamp()
556+
prev_hour_start, prev_hour_end = get_hour_boundaries(current_ts)
557+
558+
info = {
559+
'cluster_name': cluster_name,
560+
'collection_timestamp': datetime.fromtimestamp(current_ts).isoformat(),
561+
'start_timestamp': datetime.fromtimestamp(prev_hour_start).isoformat(),
562+
'end_timestamp': datetime.fromtimestamp(prev_hour_end).isoformat(),
563+
}
564+
# If METRICS_UTILITY_USAGE_BASED_METERING_ENABLED is not set or set to false then it returns 1
565+
usage_based_billing_enabled_str = os.getenv('METRICS_UTILITY_USAGE_BASED_METERING_ENABLED')
558566
usage_based_billing_enabled = False
559567
if usage_based_billing_enabled_str and (usage_based_billing_enabled_str.lower() == 'true'):
560568
usage_based_billing_enabled = True
@@ -563,42 +571,89 @@ def total_workers_vcpu(since, full_path, until, **kwargs):
563571
info['total_workers_vcpu'] = 1
564572
# This message must always appear in the log regardless of the log level.
565573
logger_info_level.info(json.dumps(info, indent=2))
566-
return {'cluster_name': info['cluster_name'], 'total_workers_vcpu': info['total_workers_vcpu']}
574+
return {'timestamp': info['end_timestamp'], 'cluster_name': info['cluster_name'], 'total_workers_vcpu': info['total_workers_vcpu']}
575+
576+
url = os.getenv('METRICS_UTILITY_PROMETHEUS_URL')
577+
if not url:
578+
prometheus_default_url = 'https://prometheus-k8s.openshift-monitoring.svc.cluster.local:9091'
579+
logger.info(
580+
f'environment variable METRICS_UTILITY_PROMETHEUS_URL is not set, \
581+
default {prometheus_default_url} will be assigned'
582+
)
583+
url = prometheus_default_url
567584

568585
try:
569-
kube_config.load_incluster_config()
570-
except kube_config.ConfigException:
571-
try:
572-
kube_config.load_kube_config()
573-
except kube_config.ConfigException as e:
574-
logger.error(f'Could not configure Kubernetes Python client ERROR: {e}')
575-
raise MetricsException(f'Could not configure Kubernetes Python client ERROR: {e}')
576-
577-
# Create a CoreV1Api client
578-
api_instance = client.CoreV1Api()
579-
if not api_instance:
580-
raise MetricsException('Could not get a Kube CoreV1Api client')
586+
prom = PrometheusClient(url=url)
587+
except Exception as e:
588+
raise MetricsException(f'Can not create a prometheus api client ERROR: {e}')
581589

582590
try:
583-
nodes = api_instance.list_node()
584-
except Exception as e:
591+
total_workers_vcpu, promql_query = get_total_workers_cpu(prom, prev_hour_start)
592+
timeline = get_cpu_timeline(prom, prev_hour_start, prev_hour_end)
593+
except MetricsException as e:
585594
raise MetricsException(f'Unexpected error when retrieving nodes: {e}')
586595

587-
if nodes is None:
588-
raise MetricsException('No nodes found')
596+
info['promql_query'] = promql_query
597+
info['timeline'] = timeline
598+
599+
logger.debug(f'total_workers_vcpu: {total_workers_vcpu}')
589600

590-
total_workers_vcpu = 0
591-
# In SaaS case we have only Worker nodes and so we don't need to filter out the control plan.
592-
# If it used for other environement, we might need to implement the filtering.
593-
for node_info in nodes.items:
594-
for resource, value in node_info.status.capacity.items():
595-
if resource == 'cpu':
596-
info['nodes'].append({node_info.metadata.name: int(value)})
597-
total_workers_vcpu += int(value)
601+
# This can happen when the prev_hour_start doesn't have data, it could be when the cluster just started or
602+
# if for some reason prometheus lost some data.
603+
if total_workers_vcpu is None:
604+
logger.warning('No data availble yet, the cluster is probably running for less than an hour')
605+
raise MetricsException('No data availble yet, the cluster is probably running for less than an hour')
598606

599-
info['total_workers_vcpu'] = total_workers_vcpu
607+
info['total_workers_vcpu'] = int(total_workers_vcpu)
600608

601609
# This message must always appear in the log regardless of the log level.
602610
logger_info_level.info(json.dumps(info, indent=2))
603611

604-
return {'cluster_name': info['cluster_name'], 'total_workers_vcpu': info['total_workers_vcpu']}
612+
return {'timestamp': info['end_timestamp'], 'cluster_name': info['cluster_name'], 'total_workers_vcpu': info['total_workers_vcpu']}
613+
614+
615+
def get_hour_boundaries(current_timestamp: float) -> Tuple[float, float]:
    """Return the (start, end) Unix timestamps of the hour preceding *current_timestamp*.

    The end boundary is the last whole second of the previous hour
    (one second before the current hour starts), which matches the
    59m59s collection window used by the billing query.

    Args:
        current_timestamp: Current time as a Unix timestamp (seconds).

    Returns:
        Tuple of (previous_hour_start, previous_hour_end) Unix timestamps.
        Note: the return annotation previously claimed a 3-tuple; the
        function has always returned exactly two values.
    """
    # Floor to the top of the current hour.
    current_hour_start = (current_timestamp // 3600) * 3600
    previous_hour_start = current_hour_start - 3600
    # Last second of the previous hour, not the exact hour boundary.
    previous_hour_end = current_hour_start - 1
    return previous_hour_start, previous_hour_end
620+
621+
622+
def get_total_workers_cpu(prom: PrometheusClient, base_timestamp: float) -> Tuple[float, str]:
    """Query Prometheus for the peak total worker vCPU count in the hour before *base_timestamp*.

    Uses a 59m59s subquery window (rather than a full 1h) so the sample at
    the exact hour boundary is not double-counted across adjacent windows.

    Args:
        prom: Prometheus API client.
        base_timestamp: Unix timestamp anchoring the query (`@` modifier).

    Returns:
        Tuple of (total vCPU count — ``None`` when Prometheus has no data
        for the window — and the PromQL query string that was executed).

    Raises:
        MetricsException: If the Prometheus query fails.
    """
    promql_query = f'max_over_time(sum(machine_cpu_cores)[59m59s:5m] @ {base_timestamp})'

    try:
        total_workers_vcpu = prom.get_current_value(promql_query)
    except Exception as e:
        # The previous message said "retrieving nodes" — a leftover from the
        # removed Kubernetes implementation; name the actual data source.
        raise MetricsException(f'Unexpected error when querying prometheus: {e}') from e

    return total_workers_vcpu, promql_query
631+
632+
633+
def get_cpu_timeline(prom: PrometheusClient, previous_hour_start: float, previous_hour_end: float) -> list:
    """
    Get array of timestamp/CPU pairs for the hour leading up to previous_hour_end.

    Args:
        prom: Prometheus API client.
        previous_hour_start: Range start (Unix timestamp).
        previous_hour_end: Range end (Unix timestamp).

    Returns:
        List of dicts with 'timestamp' (ISO format, UTC) and 'cpu_sum' keys,
        sorted chronologically. Empty when Prometheus returned no data.

    Raises:
        MetricsException: If the range query or result parsing fails.
    """
    # Use instant query - query_range will handle the time range
    query = 'sum(machine_cpu_cores)'

    try:
        response = prom.query_range(query=query, start_time=previous_hour_start, end_time=previous_hour_end, step='5m')

        result = []
        if response and 'data' in response and 'result' in response['data']:
            for series in response['data']['result']:
                if 'values' in series:
                    for timestamp_val, cpu_val in series['values']:
                        result.append(
                            {'timestamp': datetime.fromtimestamp(float(timestamp_val), timezone.utc).isoformat(), 'cpu_sum': float(cpu_val)}
                        )

        # ISO-8601 strings in the same timezone sort chronologically.
        result.sort(key=lambda x: x['timestamp'])
        return result

    except Exception as e:
        raise MetricsException(f'Error querying CPU timeline: {e}') from e
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
import os
2+
3+
from metrics_utility.exceptions import MetricsException
4+
from metrics_utility.logger import logger
5+
6+
7+
TOKEN_PATH = '/var/run/secrets/kubernetes.io/serviceaccount/token'
8+
CA_CERT_PATH = '/var/run/secrets/kubernetes.io/serviceaccount/service-ca.crt'
9+
10+
11+
class KubernetesClient:
    """
    Simplified Kubernetes client for service account token operations.

    This class assumes running in a Kubernetes pod with standard service account
    files mounted at /var/run/secrets/kubernetes.io/serviceaccount/
    """

    def __init__(self):
        """Initialize the client and validate service account files are available."""
        self._validate_service_account_files()

    def _validate_service_account_files(self):
        """Validate that required service account files exist.

        Raises:
            MetricsException: If the service account token file is missing.
        """
        if not os.path.exists(TOKEN_PATH):
            # Build the message from the module constant so it cannot drift
            # from the path actually being checked.
            raise MetricsException(f'Service account token not found at {TOKEN_PATH}')

        logger.info('Service account files validated')

    def get_current_token(self) -> str:
        """
        Get the current pod's service account token from the mounted file.

        Returns:
            Current service account token

        Raises:
            MetricsException: If token cannot be read
        """
        try:
            with open(TOKEN_PATH, 'r') as f:
                token = f.read().strip()
            logger.info("Retrieved current pod's mounted token")
            # Log only the length, never the token value itself.
            logger.info(f' Token Length: {len(token)} characters')
            return token
        except Exception as e:
            raise MetricsException(f'Error reading token: {e}') from e

    def get_ca_cert_path(self) -> str:
        """
        Get the current pod's service account ca_cert from the mounted file.

        Returns:
            Current service account ca_cert
        """
        return CA_CERT_PATH
Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
import os
2+
3+
from typing import Optional
4+
5+
import requests
6+
7+
from metrics_utility.exceptions import MetricsException
8+
from metrics_utility.logger import logger
9+
10+
from .kubernetes_client import KubernetesClient
11+
12+
13+
class PrometheusClient:
    """
    Prometheus client with Kubernetes service account authentication support.

    This class handles:
    - Service account token retrieval from Kubernetes
    - Prometheus connection management
    - Query execution with proper error handling
    """

    def __init__(self, url: str, timeout: int = 30):
        """
        Initialize Prometheus client.

        Args:
            url: Prometheus server URL
            timeout: Request timeout in seconds (default: 30)

        Raises:
            MetricsException: If the service account token or CA certificate
                cannot be obtained.
        """
        self.url = url.rstrip('/')  # Remove trailing slash
        self.timeout = timeout
        self.token = None
        self.ca_cert_path = None
        self.session = requests.Session()

        kube_client = KubernetesClient()
        self.token = kube_client.get_current_token()
        if not self.token:
            raise MetricsException('Unable to retrieve the token for the current service account')

        self.ca_cert_path = kube_client.get_ca_cert_path()

        # Setup session
        self._setup_session()

    def _setup_session(self):
        """Setup HTTP session with authentication headers and CA certificate"""
        if self.token:
            logger.info('Creating authenticated Prometheus client')
            logger.info(f' URL: {self.url}')

            self.session.headers.update({'Authorization': f'Bearer {self.token}', 'Content-Type': 'application/x-www-form-urlencoded'})
        else:
            logger.info('Creating unauthenticated Prometheus client')
            logger.info(f' URL: {self.url}')

        # Use service CA certificate for SSL verification
        if os.path.exists(self.ca_cert_path):
            self.session.verify = self.ca_cert_path
            logger.info(f'Using service CA certificate: {self.ca_cert_path}')
        else:
            raise MetricsException(f'CA_CERT not found at {self.ca_cert_path}')

    def query(self, query: str, time_param: Optional[float] = None) -> Optional[list]:
        """
        Execute instant PromQL query.

        Args:
            query: PromQL query string
            time_param: Optional timestamp for the query

        Returns:
            Query results as list, or raise MetricsException if failed
        """
        try:
            url = f'{self.url}/api/v1/query'
            params = {'query': query}

            # `is not None` so an explicit epoch timestamp of 0 is honored
            # (a bare truthiness check silently dropped it).
            if time_param is not None:
                params['time'] = time_param

            response = self.session.get(url, params=params, timeout=self.timeout)

            logger.debug(f'response: {response}')
            if response.status_code == 200:
                data = response.json()
                logger.debug(f'data: {data}')
                if data.get('status') == 'success':
                    return data.get('data', {}).get('result', [])
                else:
                    raise MetricsException(f'Prometheus API error: {data.get("error", "Unknown error")}')
            else:
                raise MetricsException(f'HTTP error {response.status_code}: {response.text}')

        except MetricsException:
            # Already descriptive; re-raise without wrapping it a second
            # time as "Query failed: ...".
            raise
        except Exception as e:
            raise MetricsException(f'Query failed: {e}') from e

    def get_current_value(self, query: str) -> Optional[float]:
        """
        Get current value from an instant query.

        Args:
            query: PromQL query string

        Returns:
            Current value as float, or None if result is empty
        """
        result = self.query(query)
        if result and len(result) > 0:
            # Prometheus instant vectors are [timestamp, value-as-string].
            return float(result[0]['value'][1])
        return None

    def query_range(self, query: str, start_time: float, end_time: float, step: str = '5m') -> Optional[dict]:
        """
        Execute a range query against Prometheus.

        Args:
            query: PromQL instant query (not range query)
            start_time: Start time (Unix timestamp)
            end_time: End time (Unix timestamp)
            step: Query resolution step (e.g., '1m', '5m')

        Returns:
            The full Prometheus response dict on success, or None when the
            API reports a query failure.

        Raises:
            MetricsException: If the HTTP request fails.
        """
        params = {'query': query, 'start': start_time, 'end': end_time, 'step': step}

        try:
            url = f'{self.url}/api/v1/query_range'
            logger.debug(f'Range query URL: {url}')
            logger.debug(f'Range query params: {params}')

            response = self.session.get(url, params=params, timeout=self.timeout)
            response.raise_for_status()

            data = response.json()
            if data.get('status') == 'success':
                return data
            else:
                logger.error(f'Prometheus range query failed: {data.get("error", "Unknown error")}')
                return None

        except Exception as e:
            logger.error(f'Range query failed: {e}')
            # Wrap with a message, consistent with the other methods,
            # instead of passing the raw exception object.
            raise MetricsException(f'Range query failed: {e}') from e

metrics_utility/base/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ An example can be found in [Test package](tests/classes/package.py)
9292
### Environment variables for total_workers_vcpu collector:
9393

9494
- `METRICS_UTILITY_CLUSTER_NAME`: Contains the cluster name which is part of the collection payload.
95-
- `METRICS_UTILITY_USAGE_BASED_BILLING_ENABLED`: [true/false] In case of true, the payload will contain the actual number of total vcpu accross all workers otherwise the total will be set to 1.
95+
- `METRICS_UTILITY_USAGE_BASED_METERING_ENABLED`: [true/false] In case of true, the payload will contain the actual number of total vcpu across all workers, otherwise the total will be set to 1.
9696

9797
N.B.: The SaaS solution runs on ROSA HCP so all nodes are workers, if this collector is used for another solution then the filtering must be implemented.
9898

0 commit comments

Comments
 (0)