Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions docs/environment.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ All known metrics-utility environment variables:
AWX_PATH
KUBERNETES_SERVICE_PORT
METRICS_UTILITY_BILLING_ACCOUNT_ID
METRICS_UTILITY_GATHER_BATCH_SIZE
METRICS_UTILITY_GATHER_INTERVAL_HOURS
METRICS_UTILITY_BILLING_PROVIDER
METRICS_UTILITY_BUCKET_ACCESS_KEY
METRICS_UTILITY_BUCKET_ENDPOINT
Expand Down Expand Up @@ -93,6 +95,8 @@ container
* `METRICS_UTILITY_COLLECTOR_LOCK_SUFFIX` - `total_workers_vcpu` collector custom lock name
* `METRICS_UTILITY_DISABLE_JOB_HOST_SUMMARY_COLLECTOR` - disable `job_host_summary` collector (use together with `METRICS_UTILITY_OPTIONAL_COLLECTORS`)
* `METRICS_UTILITY_DISABLE_SAVE_LAST_GATHERED_ENTRIES` - skip updating last gather info from controller settings
* `METRICS_UTILITY_GATHER_BATCH_SIZE` - row-count batch size for collectors that support ID-range batching (`job_host_summary`, `main_jobevent`, `job_host_summary_service`, `main_host_daily`, `main_indirectmanagednodeaudit`, `unified_jobs`); default 0 (disabled). When set (e.g. 100000), each COPY query is limited to approximately that many rows using keyset pagination on the primary key — no OFFSET/LIMIT overhead. Combine with `METRICS_UTILITY_GATHER_INTERVAL_HOURS` for high-scale deployments.
* `METRICS_UTILITY_GATHER_INTERVAL_HOURS` - time-window size per gather slice in hours; default 24 (one slice per calendar day). Reducing this (e.g. to 4) divides each day into N smaller COPY queries, each producing its own tarball. Applies to all `daily_slicing` collectors. Does not affect `until_slicing` collectors (config, main_host, execution_environments, etc.).
* `METRICS_UTILITY_MAX_GATHER_PERIOD_DAYS` - maximum length of the collection interval in days, default 28; `get_max_gather_period_days`
* `METRICS_UTILITY_OPTIONAL_COLLECTORS` - optional collectors, comma-separated list
* `METRICS_UTILITY_PROMETHEUS_URL` - Prometheus base url
Expand Down
30 changes: 19 additions & 11 deletions metrics_utility/automation_controller_billing/collectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from metrics_utility.automation_controller_billing.helpers import get_last_entries_from_db
from metrics_utility.base import register
from metrics_utility.base.utils import bool_from_env, get_max_gather_period_days, get_optional_collectors
from metrics_utility.base.utils import bool_from_env, get_gather_interval_hours, get_max_gather_period_days, get_optional_collectors
from metrics_utility.exceptions import MetricsException, MissingRequiredEnvVar
from metrics_utility.library.collectors.controller import (
config,
Expand Down Expand Up @@ -56,7 +56,15 @@ def cli_something(since, until, output):


def daily_slicing(key, last_gather, **kwargs):
"""Yield (since, until) time slices for a collector, bounded by METRICS_UTILITY_GATHER_INTERVAL_HOURS.

Slices never cross a calendar-day boundary so tarballs stay day-aligned in storage.
When METRICS_UTILITY_GATHER_INTERVAL_HOURS < 24, multiple slices are emitted per day.
The default of 24 h reproduces the original one-slice-per-day behaviour.
"""
since, until = kwargs.get('since', None), kwargs.get('until', now())
interval = timedelta(hours=get_gather_interval_hours())

if since is not None:
last_entry = since
else:
Expand All @@ -67,22 +75,22 @@ def daily_slicing(key, last_gather, **kwargs):
except TypeError: # last_entries has a stale non-datetime entry for this collector
last_entry = max(last_gather, horizon)

start, end = last_entry, None
start_beginning_of_next_day = start.replace(hour=0, minute=0, second=0, microsecond=0) + timedelta(days=1)

# If the date range is over one day, we want first interval to contain the rest of the day
# then we'll cycle by full days
if until > start_beginning_of_next_day:
yield (start, start_beginning_of_next_day)
start = start_beginning_of_next_day

Comment thread
cursor[bot] marked this conversation as resolved.
start = last_entry
while start < until:
end = min(start + timedelta(days=1), until)
# Never cross a calendar-day boundary so tarballs stay day-aligned in storage
day_end = start.replace(hour=0, minute=0, second=0, microsecond=0) + timedelta(days=1)
end = min(start + interval, day_end, until)
yield (start, end)
start = end


def until_slicing(_key, _last_gather, **kwargs):
"""Yield a single point-in-time slice for snapshot collectors that do a full-table scan.

Used for collectors whose data does not have a meaningful time range (e.g. config,
execution_environments). The slice is always (until - 1 s, until - 1 s) so the
snapshot lands in the last second of the current gather window.
"""
# For tables where we always need to do a table full scan, ignoring since & until
# Always store the inventory snapshot into the last daily partition (until - 1 second)
until = kwargs.get('until', now())
Expand Down
17 changes: 17 additions & 0 deletions metrics_utility/base/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,23 @@ def get_max_gather_period_days():
raise


def get_gather_interval_hours():
    """
    Get the gather interval in hours from the environment.

    Reads METRICS_UTILITY_GATHER_INTERVAL_HOURS; defaults to 24 (one slice
    per calendar day). Set to a smaller value (e.g. 4) to break large daily
    gather queries into sub-day batches.

    Returns:
        int: interval in hours, guaranteed >= 1.

    Raises:
        ValueError: if the variable cannot be parsed as an integer, or is
            < 1 — a zero or negative value would cause daily_slicing to
            loop forever.
    """
    try:
        value = int(os.getenv('METRICS_UTILITY_GATHER_INTERVAL_HOURS', '24'))
    except (ValueError, TypeError):
        logger.error('METRICS_UTILITY_GATHER_INTERVAL_HOURS cannot be converted to an integer')
        raise
    if value < 1:
        raise ValueError(f'METRICS_UTILITY_GATHER_INTERVAL_HOURS must be >= 1, got {value}')
    return value


def get_optional_collectors():
"""
Get the list of optional collectors from environment variable.
Expand Down
159 changes: 90 additions & 69 deletions metrics_utility/library/collectors/controller/job_host_summary.py
Original file line number Diff line number Diff line change
@@ -1,78 +1,99 @@
from ..util import DataframeOutput, collector, date_where, ensure_functions
from ..util import DataframeOutput, collector, date_where, ensure_functions, get_batch_size


@collector
def job_host_summary(*, db=None, since=None, until=None, output=DataframeOutput()):
    """Collect job-host summary rows from the Controller DB for the given time window.

    Joins main_jobhostsummary with host variables, job metadata, inventory, and
    organization. When METRICS_UTILITY_GATHER_BATCH_SIZE is set, executes the query
    in ID-range batches so the ID filter is pushed into the filtered_hosts CTE and
    the final WHERE clause, keeping each batch cheap.
    """
    where = date_where('main_jobhostsummary.modified', since, until)

    # ensure_functions writes to DB, cannot be used in service (readonly DB)
    ensure_functions(db)

    def build_query(batch_filter='TRUE'):
        # TODO: controller needs to have an index on main_jobhostsummary.modified
        return f"""
            WITH
            filtered_hosts AS (
                SELECT DISTINCT main_jobhostsummary.host_id
                FROM main_jobhostsummary
                WHERE {where} AND ({batch_filter})
            ),
            hosts_variables AS (
                SELECT
                    filtered_hosts.host_id,
                    CASE
                        WHEN (metrics_utility_is_valid_json(main_host.variables))
                        THEN main_host.variables::jsonb->>'ansible_host'
                        ELSE metrics_utility_parse_yaml_field(main_host.variables, 'ansible_host' )
                    END AS ansible_host_variable,
                    CASE
                        WHEN (metrics_utility_is_valid_json(main_host.variables))
                        THEN main_host.variables::jsonb->>'ansible_connection'
                        ELSE metrics_utility_parse_yaml_field(main_host.variables, 'ansible_connection' )
                    END AS ansible_connection_variable
                FROM filtered_hosts
                LEFT JOIN main_host ON main_host.id = filtered_hosts.host_id
            )
            SELECT
                main_jobhostsummary.id,
                main_jobhostsummary.created,
                main_jobhostsummary.modified,
                main_jobhostsummary.host_name,
                main_jobhostsummary.host_id as host_remote_id,
                hosts_variables.ansible_host_variable,
                hosts_variables.ansible_connection_variable,
                main_jobhostsummary.changed,
                main_jobhostsummary.dark,
                main_jobhostsummary.failures,
                main_jobhostsummary.ok,
                main_jobhostsummary.processed,
                main_jobhostsummary.skipped,
                main_jobhostsummary.failed,
                main_jobhostsummary.ignored,
                main_jobhostsummary.rescued,
                main_unifiedjob.created AS job_created,
                main_jobhostsummary.job_id AS job_remote_id,
                main_unifiedjob.unified_job_template_id AS job_template_remote_id,
                main_unifiedjob.name AS job_template_name,
                main_inventory.id AS inventory_remote_id,
                main_inventory.name AS inventory_name,
                main_organization.id AS organization_remote_id,
                main_organization.name AS organization_name,
                main_unifiedjobtemplate_project.id AS project_remote_id,
                main_unifiedjobtemplate_project.name AS project_name
            FROM main_jobhostsummary
            -- connect to main_job, that has connections into inventory and project
            LEFT JOIN main_job ON main_jobhostsummary.job_id = main_job.unifiedjob_ptr_id
            -- get project name from project_options
            LEFT JOIN main_unifiedjobtemplate AS main_unifiedjobtemplate_project ON main_unifiedjobtemplate_project.id = main_job.project_id
            -- get inventory name from main_inventory
            LEFT JOIN main_inventory ON main_inventory.id = main_job.inventory_id
            -- get job name from main_unifiedjob
            LEFT JOIN main_unifiedjob ON main_unifiedjob.id = main_jobhostsummary.job_id
            -- get organization name from main_organization
            LEFT JOIN main_organization ON main_organization.id = main_unifiedjob.organization_id
            -- get variables from precomputed hosts_variables
            LEFT JOIN hosts_variables ON hosts_variables.host_id = main_jobhostsummary.host_id
            WHERE {where} AND ({batch_filter})
            ORDER BY main_jobhostsummary.id ASC
        """

    batch_size = get_batch_size()
    if batch_size:
        # ID-range batching: filter is pushed into filtered_hosts CTE and the
        # final WHERE so each batch only scans its share of rows.
        min_max_query = f'SELECT MIN(id), MAX(id) FROM main_jobhostsummary WHERE {where}'
        return output.batch_sql(
            db,
            query_fn=lambda s, e: build_query(f'main_jobhostsummary.id >= {s} AND main_jobhostsummary.id < {e}'),
            min_max_query=min_max_query,
            batch_size=batch_size,
        )

    return output.sql(db, build_query())
Loading