diff --git a/.github/images.yml b/.github/images.yml index fb7f3c07f..93ca6d3b9 100644 --- a/.github/images.yml +++ b/.github/images.yml @@ -30,7 +30,7 @@ images: image: ghcr.io/spack/cache-indexer:0.0.6 - path: ./analytics - image: ghcr.io/spack/django:0.5.18 + image: ghcr.io/spack/django:0.5.19 - path: ./images/ci-prune-buildcache image: ghcr.io/spack/ci-prune-buildcache:0.0.5 diff --git a/analytics/analytics/core/management/commands/backfill_jobs.py b/analytics/analytics/core/management/commands/backfill_jobs.py index f4aab0174..c724821da 100644 --- a/analytics/analytics/core/management/commands/backfill_jobs.py +++ b/analytics/analytics/core/management/commands/backfill_jobs.py @@ -234,8 +234,9 @@ def backfill_jobs(start: datetime, end: datetime, dry_run: bool) -> None: f"Found {len(build_ids)} unprocessed jobs between {start} and {end}." ) + pbar = tqdm(total=len(build_ids)) for i, ids_chunk in enumerate(batched(build_ids, BATCH_SIZE)): - click.echo( + pbar.set_description( f"Querying jobs {i * BATCH_SIZE} - {i * BATCH_SIZE + len(ids_chunk)}..." ) @@ -254,6 +255,10 @@ def backfill_jobs(start: datetime, end: datetime, dry_run: bool) -> None: if result.get("build_started_at") is None: result["build_started_at"] = result["build_created_at"] + pbar.set_description( + f"Processing records for {result['build_started_at'].replace(second=0, microsecond=0)}" + ) + # The Gitlab DB returns a nullable integer, but the webhooks we # receive use a string from the enum. result["build_failure_reason"] = FAILURE_REASON_MAP[ @@ -279,13 +284,11 @@ def backfill_jobs(start: datetime, end: datetime, dry_run: bool) -> None: click.echo( f"[Dry Run] Would process records {i * BATCH_SIZE} - {i * BATCH_SIZE + len(ids_chunk)}" ) + pbar.update(len(results)) continue - pbar = tqdm(results, total=len(build_ids)) - for webhook_dict in pbar: - pbar.set_description( - f"Processing records {i * BATCH_SIZE} - {i * BATCH_SIZE + len(ids_chunk)}" - ) + for webhook_dict in results: process_job(json.dumps(webhook_dict)) + pbar.update(1) click.echo(f"Total records processed: {len(build_ids)}") diff --git a/analytics/analytics/job_processor/__init__.py b/analytics/analytics/job_processor/__init__.py index 5c1feb49b..216692dca 100644 --- a/analytics/analytics/job_processor/__init__.py +++ b/analytics/analytics/job_processor/__init__.py @@ -1,11 +1,12 @@ +from datetime import timedelta import json +import logging import re -from datetime import timedelta -import gitlab -import gitlab.exceptions from celery import shared_task from django.db import transaction +import gitlab +import gitlab.exceptions from gitlab.v4.objects import ProjectJob from requests.exceptions import RequestException @@ -38,6 +39,8 @@ get_gitlab_project, ) +logger = logging.getLogger(__name__) + def calculate_job_cost(info: JobInfo, duration: float) -> float | None: if info.node is None or info.pod is None: @@ -155,8 +158,13 @@ def process_job(job_input_data_json: str): gl = get_gitlab_handle() gl_project = get_gitlab_project(job_input_data["project_id"]) gl_job = get_gitlab_job(gl_project, job_input_data["build_id"]) - job_trace: str = gl_job.trace().decode() # type: ignore + # In this case, don't bother processing the job, as it likely never started. + if gl_job.started_at is None: + logger.info("Build found with no start time. Skipping...") + return + + job_trace: str = gl_job.trace().decode() # type: ignore with transaction.atomic(): job = create_job_fact(gl, gl_job, job_input_data, job_trace) diff --git a/analytics/analytics/job_processor/prometheus.py b/analytics/analytics/job_processor/prometheus.py index 4be6ae0c2..c311f82f4 100644 --- a/analytics/analytics/job_processor/prometheus.py +++ b/analytics/analytics/job_processor/prometheus.py @@ -84,6 +84,13 @@ def calculate_node_occupancy(data: list[dict], step: int): This is achieved by summing the number of pods present for all of the selected samples, multiplied by the step size (to normalize result across step size), and divided by the duration, to return a fraction. + + This seems counter-intuitive, as no individual pod is mentioned in this function. + This is because node occupancy is a "shared" metric. It computes the occupancy of any one + pod on this node, over the course of a pod's lifetime. The only thing that tailors this + function to a specific pod is that the "data" argument conforms to the lifetime of that + one pod. Two pods on the same node with the same lifetime would return identical node + occupancy values. """ # Key is the timestamp, value is the number of jobs timeline = {} @@ -99,6 +106,11 @@ def calculate_node_occupancy(data: list[dict], step: int): start = min(timeline.keys()) end = max(timeline.keys()) + # There is an edge case where all data points are at the same point in time. + # In this case, the node occupancy is just one over the number of pods present at that time. + if start == end: + return 1 / timeline[start] + # Remove the first data point, as otherwise we'd be counting an extra time step towards the numerator timeline.pop(start) @@ -248,6 +260,14 @@ def get_pod_usage_and_occupancy( step=step, ) + # Require more than one timeline value for the node, as we + # need a range of values, not just a single point in time. + if len(results) == 1: + raise UnexpectedPrometheusResult( + message=f"Node {node} only returned 1 timeline value", + query=cpu_seconds_query, + ) + # First, get the cpu utlization by the pod we care about # To do this, just get the last value from the response, since that'll be the total of the counter pod_results = next((res for res in results if res["metric"]["pod"] == pod), None) diff --git a/k8s/production/custom/webhook-handler/deployments.yaml b/k8s/production/custom/webhook-handler/deployments.yaml index e88c4e46d..857a6fd44 100644 --- a/k8s/production/custom/webhook-handler/deployments.yaml +++ b/k8s/production/custom/webhook-handler/deployments.yaml @@ -23,7 +23,7 @@ spec: serviceAccountName: webhook-handler containers: - name: webhook-handler - image: ghcr.io/spack/django:0.5.18 + image: ghcr.io/spack/django:0.5.19 imagePullPolicy: Always resources: requests: @@ -146,7 +146,7 @@ spec: serviceAccountName: webhook-handler containers: - name: webhook-handler-worker - image: ghcr.io/spack/django:0.5.18 + image: ghcr.io/spack/django:0.5.19 command: [ "celery",