Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/images.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ images:
image: ghcr.io/spack/cache-indexer:0.0.6

- path: ./analytics
image: ghcr.io/spack/django:0.5.18
image: ghcr.io/spack/django:0.5.19

- path: ./images/ci-prune-buildcache
image: ghcr.io/spack/ci-prune-buildcache:0.0.5
Expand Down
15 changes: 9 additions & 6 deletions analytics/analytics/core/management/commands/backfill_jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,15 +234,20 @@ def backfill_jobs(start: datetime, end: datetime, dry_run: bool) -> None:
f"Found {len(build_ids)} unprocessed jobs between {start} and {end}."
)

pbar = tqdm(total=len(build_ids))
for i, ids_chunk in enumerate(batched(build_ids, BATCH_SIZE)):
click.echo(
pbar.set_description(
f"Querying jobs {i * BATCH_SIZE} - {i * BATCH_SIZE + len(ids_chunk)}..."
)

# Make query that formulates the webhook shape using the relevant database tables
cursor.execute(WEBHOOK_QUERY, {"job_ids": ids_chunk})
results = dict_fetchall(cursor)

pbar.set_description(
f"Processing records {i * BATCH_SIZE} - {i * BATCH_SIZE + len(ids_chunk)}"
)

# Process each result
for result in results:
# If the "build_started_at" field is None, set it to the created_at value. This seems
Expand Down Expand Up @@ -279,13 +284,11 @@ def backfill_jobs(start: datetime, end: datetime, dry_run: bool) -> None:
click.echo(
f"[Dry Run] Would process records {i * BATCH_SIZE} - {i * BATCH_SIZE + len(ids_chunk)}"
)
pbar.update(len(results))
continue

pbar = tqdm(results, total=len(build_ids))
for webhook_dict in pbar:
pbar.set_description(
f"Processing records {i * BATCH_SIZE} - {i * BATCH_SIZE + len(ids_chunk)}"
)
for webhook_dict in results:
process_job(json.dumps(webhook_dict))
pbar.update(1)

click.echo(f"Total records processed: {len(build_ids)}")
20 changes: 20 additions & 0 deletions analytics/analytics/job_processor/prometheus.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,13 @@ def calculate_node_occupancy(data: list[dict], step: int):
This is achieved by summing the number of pods present for all of the selected samples,
multiplied by the step size (to normalize result across step size), and divided by the
duration, to return a fraction.

This seems counter-intuitive, as no individual pod is mentioned in this function.
This is because node occupancy is a "shared" metric. It computes the occupancy of any one
pod on this node, over the course of a pod's lifetime. The only thing that tailors this
function to a specific pod is that the "data" argument conforms to the lifetime of that
one pod. Two pods on the same node with the same lifetime would return identical node
occupancy values.
"""
# Key is the timestamp, value is the number of jobs
timeline = {}
Expand All @@ -99,6 +106,11 @@ def calculate_node_occupancy(data: list[dict], step: int):
start = min(timeline.keys())
end = max(timeline.keys())

# There is an edge case where all data points are at the same point in time.
# In this case, the node occupancy is just one over the number of pods present at that time.
if start == end:
return 1 / timeline[start]

# Remove the first data point, as otherwise we'd be counting an extra time step towards the numerator
timeline.pop(start)

Expand Down Expand Up @@ -248,6 +260,14 @@ def get_pod_usage_and_occupancy(
step=step,
)

# Require more than one timeline value for the node, as we
# need a range of values, not just a single point in time.
if len(results) == 1:
raise UnexpectedPrometheusResult(
message=f"Node {node} only returned 1 timeline value",
query=cpu_seconds_query,
)

# First, get the cpu utilization by the pod we care about
# To do this, just get the last value from the response, since that'll be the total of the counter
pod_results = next((res for res in results if res["metric"]["pod"] == pod), None)
Expand Down
4 changes: 2 additions & 2 deletions k8s/production/custom/webhook-handler/deployments.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ spec:
serviceAccountName: webhook-handler
containers:
- name: webhook-handler
image: ghcr.io/spack/django:0.5.18
image: ghcr.io/spack/django:0.5.19
imagePullPolicy: Always
resources:
requests:
Expand Down Expand Up @@ -146,7 +146,7 @@ spec:
serviceAccountName: webhook-handler
containers:
- name: webhook-handler-worker
image: ghcr.io/spack/django:0.5.18
image: ghcr.io/spack/django:0.5.19
command:
[
"celery",
Expand Down
Loading