Skip to content

Commit d941ecd

Browse files
Milan fix json (#330)
* Test for json serialization * Sanitize json after prepare * Test json * Convert ids to strings * Delete duplicated file * Ruff
1 parent c225bdf commit d941ecd

10 files changed

Lines changed: 326 additions & 49 deletions

metrics_utility/anonymized_rollups/base_anonymized_rollup.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,23 @@ def merge(self, dataframe_all, dataframe_new):
2424

2525
return pd.concat([dataframe_all, dataframe_new], ignore_index=True)
2626

27+
def _convert_id_columns_to_strings(self, dataframe):
28+
"""Convert ID columns to strings at the beginning of prepare().
29+
30+
Converts numeric ID columns (id, job_id, host_id, job_remote_id) to strings
31+
to ensure consistent JSON serialization.
32+
"""
33+
if dataframe.empty:
34+
return dataframe
35+
36+
id_columns = ['id', 'job_id', 'host_id', 'job_remote_id']
37+
for col in id_columns:
38+
if col in dataframe.columns:
39+
# Convert numeric IDs to strings, preserving NaN values
40+
dataframe[col] = dataframe[col].apply(lambda x: str(int(x)) if pd.notna(x) and isinstance(x, (int, float)) and x == int(x) else x)
41+
42+
return dataframe
43+
2744
# takes raw data and computes aggregation
2845
# this works in batches, for example we are collecting every hour
2946
# this hourly data arrives in prepare, then it gets merged with the partial rollup (initially empty)

metrics_utility/anonymized_rollups/controller_version_anonymized_rollup.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import pandas as pd
22

33
from metrics_utility.anonymized_rollups.base_anonymized_rollup import BaseAnonymizedRollup
4+
from metrics_utility.anonymized_rollups.helpers import sanitize_json
45

56

67
class ControllerVersionAnonymizedRollup(BaseAnonymizedRollup):
@@ -16,13 +17,18 @@ def prepare(self, dataframe):
1617
"""
1718
Transform dataframe to JSON structure with controller versions list.
1819
"""
20+
# Convert ID columns to strings at the beginning
21+
if dataframe is not None and not dataframe.empty:
22+
dataframe = self._convert_id_columns_to_strings(dataframe)
23+
1924
# Handle None or empty dataframe
2025
if dataframe is None or dataframe.empty:
21-
return []
26+
return sanitize_json([])
2227

2328
controller_versions = dataframe['controller_version'].tolist() if 'controller_version' in dataframe.columns else []
2429

25-
return controller_versions
30+
# Sanitize to convert NumPy types to native Python types for JSON serialization
31+
return sanitize_json(controller_versions)
2632

2733
def merge(self, data_all, data_new):
2834
"""

metrics_utility/anonymized_rollups/credentials_anonymized_rollup.py

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from metrics_utility.anonymized_rollups.base_anonymized_rollup import BaseAnonymizedRollup
2+
from metrics_utility.anonymized_rollups.helpers import sanitize_json
23

34

45
class CredentialsAnonymizedRollup(BaseAnonymizedRollup):
@@ -23,26 +24,36 @@ def prepare(self, dataframe):
2324
Batch processing that extracts unique credential types in this batch.
2425
Returns a dictionary with a list of unique credential types.
2526
"""
27+
# Convert ID columns to strings at the beginning
28+
dataframe = self._convert_id_columns_to_strings(dataframe)
29+
2630
if dataframe.empty:
27-
return {
28-
'credential_types': [],
29-
}
31+
return sanitize_json(
32+
{
33+
'credential_types': [],
34+
}
35+
)
3036

3137
# Check if credential_type column exists (required for processing)
3238
if 'credential_type' not in dataframe.columns:
33-
return {
34-
'credential_types': [],
35-
}
39+
return sanitize_json(
40+
{
41+
'credential_types': [],
42+
}
43+
)
3644

3745
# Get unique credential types in this batch
3846
unique_credential_types = dataframe['credential_type'].dropna().unique()
3947
# Convert to sorted list of strings
4048
credential_types_list = sorted([str(ct) for ct in unique_credential_types])
4149

42-
return {
50+
result = {
4351
'credential_types': credential_types_list,
4452
}
4553

54+
# Sanitize to convert NumPy types to native Python types for JSON serialization
55+
return sanitize_json(result)
56+
4657
def merge(self, data_all, data_new):
4758
"""
4859
Merge two credential type dictionaries by unioning the credential_types lists.

metrics_utility/anonymized_rollups/events_modules_anonymized_rollup.py

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import pandas as pd
66

77
from metrics_utility.anonymized_rollups.base_anonymized_rollup import BaseAnonymizedRollup
8+
from metrics_utility.anonymized_rollups.helpers import sanitize_json
89
from metrics_utility.automation_controller_billing.dataframe_engine.dataframe_content_usage import DataframeContentUsage
910

1011

@@ -567,6 +568,9 @@ def _compute_unique_metadata(self, task_summary):
567568
# as default, merging is done by concatenating dataframes (defined in base class)
568569
def prepare(self, dataframe):
569570
"""Prepare dataframe for aggregation by filtering, transforming, and computing statistics."""
571+
# Convert ID columns to strings at the beginning
572+
dataframe = self._convert_id_columns_to_strings(dataframe)
573+
570574
collected_events_total, warnings_total, deprecations_total = self._count_initial_statistics(dataframe)
571575

572576
event_lists = self._get_event_lists()
@@ -582,17 +586,19 @@ def prepare(self, dataframe):
582586
task_summary = self._aggregate_task_summary(task_summary)
583587

584588
if task_summary.empty:
585-
return {
586-
'collected_events_total': collected_events_total,
587-
'warnings_total': warnings_total,
588-
'deprecations_total': deprecations_total,
589-
'module_stats': [],
590-
'collection_stats': [],
591-
'role_stats': [],
592-
'unique_modules': [],
593-
'modules_per_playbook': {},
594-
'unique_hosts': [],
595-
}
589+
return sanitize_json(
590+
{
591+
'collected_events_total': collected_events_total,
592+
'warnings_total': warnings_total,
593+
'deprecations_total': deprecations_total,
594+
'module_stats': [],
595+
'collection_stats': [],
596+
'role_stats': [],
597+
'unique_modules': [],
598+
'modules_per_playbook': {},
599+
'unique_hosts': [],
600+
}
601+
)
596602

597603
task_summary = task_summary.assign(
598604
jobs_successful_duration_total_seconds=lambda x: x['job_duration_seconds'].where(~x['job_failed'], 0),
@@ -603,7 +609,7 @@ def prepare(self, dataframe):
603609
module_stats_json, collection_stats_json, role_stats_json = self._convert_stats_to_json(module_stats, collection_stats, role_stats)
604610
unique_modules, modules_per_playbook, unique_hosts = self._compute_unique_metadata(task_summary)
605611

606-
return {
612+
result = {
607613
'collected_events_total': collected_events_total,
608614
'warnings_total': warnings_total,
609615
'deprecations_total': deprecations_total,
@@ -615,6 +621,9 @@ def prepare(self, dataframe):
615621
'unique_hosts': unique_hosts,
616622
}
617623

624+
# Sanitize to convert NumPy types to native Python types for JSON serialization
625+
return sanitize_json(result)
626+
618627
def base(self, data):
619628
"""
620629
*Failure/Success rate of modules

metrics_utility/anonymized_rollups/execution_environments_anonymized_rollup.py

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import pandas as pd
22

33
from metrics_utility.anonymized_rollups.base_anonymized_rollup import BaseAnonymizedRollup
4+
from metrics_utility.anonymized_rollups.helpers import sanitize_json
45

56

67
class ExecutionEnvironmentsAnonymizedRollup(BaseAnonymizedRollup):
@@ -16,25 +17,34 @@ def prepare(self, dataframe):
1617
"""
1718
Transform dataframe to JSON structure with totals for managed, unmanaged and all EE.
1819
"""
20+
# Convert ID columns to strings at the beginning
21+
if dataframe is not None and not dataframe.empty:
22+
dataframe = self._convert_id_columns_to_strings(dataframe)
23+
1924
# Handle None or empty dataframe
2025
if dataframe is None or dataframe.empty:
21-
return {
22-
'execution_environments_total': 0,
23-
'execution_environments_default_total': 0,
24-
'execution_environments_custom_total': 0,
25-
}
26+
return sanitize_json(
27+
{
28+
'execution_environments_total': 0,
29+
'execution_environments_default_total': 0,
30+
'execution_environments_custom_total': 0,
31+
}
32+
)
2633

2734
execution_environments_total = int(len(dataframe))
2835
dataframe['managed'] = dataframe['managed'].map({'t': True, 'f': False, True: True, False: False})
2936
execution_environments_default_total = int(dataframe['managed'].sum())
3037
execution_environments_custom_total = execution_environments_total - execution_environments_default_total
3138

32-
return {
39+
result = {
3340
'execution_environments_total': execution_environments_total,
3441
'execution_environments_default_total': execution_environments_default_total,
3542
'execution_environments_custom_total': execution_environments_custom_total,
3643
}
3744

45+
# Sanitize to convert NumPy types to native Python types for JSON serialization
46+
return sanitize_json(result)
47+
3848
def merge(self, data_all, data_new):
3949
"""
4050
For snapshot collectors, always pick new data (no merging needed).

metrics_utility/anonymized_rollups/helpers.py

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,18 +5,29 @@
55
import math
66

77

8+
try:
9+
import numpy as np
10+
11+
HAS_NUMPY = True
12+
except ImportError:
13+
HAS_NUMPY = False
14+
15+
816
def sanitize_json(obj):
917
"""
10-
Sanitize a Python object to be JSON-serializable by replacing NaN and infinity values.
18+
Sanitize a Python object to be JSON-serializable by replacing NaN and infinity values
19+
and converting NumPy types to native Python types.
1120
1221
This function recursively traverses dictionaries, lists, and other data structures
13-
and replaces any NaN or infinity values with None (which becomes null in JSON).
22+
and replaces any NaN or infinity values with None (which becomes null in JSON),
23+
and converts NumPy types (int64, float64, etc.) to native Python types.
1424
1525
Args:
1626
obj: The object to sanitize (can be dict, list, float, int, str, etc.)
1727
1828
Returns:
1929
The sanitized object with all NaN and infinity values replaced with None
30+
and NumPy types converted to native Python types
2031
2132
Examples:
2233
>>> sanitize_json({'value': float('nan')})
@@ -27,6 +38,10 @@ def sanitize_json(obj):
2738
2839
>>> sanitize_json({'nested': {'value': float('-inf')}})
2940
{'nested': {'value': None}}
41+
42+
>>> import numpy as np
43+
>>> sanitize_json({'value': np.int64(42)})
44+
{'value': 42}
3045
"""
3146
if isinstance(obj, dict):
3247
# Recursively sanitize dictionary values
@@ -37,6 +52,15 @@ def sanitize_json(obj):
3752
elif isinstance(obj, tuple):
3853
# Recursively sanitize tuple items (convert to list for JSON)
3954
return [sanitize_json(item) for item in obj]
55+
elif HAS_NUMPY and isinstance(obj, (np.integer, np.floating)):
56+
# Convert NumPy integer and float types to native Python types
57+
# Check for NaN or infinity first
58+
if isinstance(obj, np.floating) and (math.isnan(obj) or math.isinf(obj)):
59+
return None
60+
return obj.item() # Convert NumPy scalar to native Python type
61+
elif HAS_NUMPY and isinstance(obj, np.ndarray):
62+
# Convert NumPy arrays to lists
63+
return sanitize_json(obj.tolist())
4064
elif isinstance(obj, float):
4165
# Check for NaN or infinity
4266
if math.isnan(obj) or math.isinf(obj):

metrics_utility/anonymized_rollups/jobhostsummary_anonymized_rollup.py

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import pandas as pd
22

33
from metrics_utility.anonymized_rollups.base_anonymized_rollup import BaseAnonymizedRollup
4+
from metrics_utility.anonymized_rollups.helpers import sanitize_json
45

56

67
class JobHostSummaryAnonymizedRollup(BaseAnonymizedRollup):
@@ -267,17 +268,22 @@ def _aggregate_by_ansible_version(self, aggregated_by_job, common_aggregations):
267268
return aggregations_by_ansible_version
268269

269270
def prepare(self, dataframe):
271+
# Convert ID columns to strings at the beginning
272+
dataframe = self._convert_id_columns_to_strings(dataframe)
273+
270274
# Count all records before processing
271275
job_host_pairs_total = len(dataframe)
272276

273277
# Handle empty dataframe
274278
if dataframe.empty:
275-
return {
276-
'by_job_type': [],
277-
'by_launch_type': [],
278-
'by_ansible_version': [],
279-
'job_host_pairs_total': job_host_pairs_total,
280-
}
279+
return sanitize_json(
280+
{
281+
'by_job_type': [],
282+
'by_launch_type': [],
283+
'by_ansible_version': [],
284+
'job_host_pairs_total': job_host_pairs_total,
285+
}
286+
)
281287

282288
# Normalize dataframe columns
283289
self._normalize_dataframe(dataframe)
@@ -298,13 +304,16 @@ def prepare(self, dataframe):
298304
by_launch_type = aggregations_by_launch_type.to_dict(orient='records')
299305
by_ansible_version = aggregations_by_ansible_version.to_dict(orient='records')
300306

301-
return {
307+
result = {
302308
'by_job_type': by_job_type,
303309
'by_launch_type': by_launch_type,
304310
'by_ansible_version': by_ansible_version,
305311
'job_host_pairs_total': job_host_pairs_total,
306312
}
307313

314+
# Sanitize to convert NumPy types to native Python types for JSON serialization
315+
return sanitize_json(result)
316+
308317
def base(self, data):
309318
"""
310319
Returns the already-aggregated JSON data from prepare() and merge().

metrics_utility/anonymized_rollups/jobs_anonymized_rollup.py

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import pandas as pd
66

77
from metrics_utility.anonymized_rollups.base_anonymized_rollup import BaseAnonymizedRollup
8+
from metrics_utility.anonymized_rollups.helpers import sanitize_json
89

910

1011
class JobsAnonymizedRollup(BaseAnonymizedRollup):
@@ -146,21 +147,26 @@ def _extract_metadata(self, dataframe):
146147
return organizations, job_ids, forks_total, scm_types
147148

148149
def prepare(self, dataframe):
150+
# Convert ID columns to strings at the beginning
151+
dataframe = self._convert_id_columns_to_strings(dataframe)
152+
149153
# Filter out jobs that are not finished
150154
dataframe = dataframe[dataframe['finished'].notna()]
151155

152156
# Handle empty dataframe
153157
if dataframe.empty:
154-
return {
155-
'by_job_type': [],
156-
'by_launch_type': [],
157-
'by_ansible_version': [],
158-
'organizations': [],
159-
'forks_total': 0,
160-
'job_ids': [],
161-
'scm_types': [],
162-
'installed_collections': [],
163-
}
158+
return sanitize_json(
159+
{
160+
'by_job_type': [],
161+
'by_launch_type': [],
162+
'by_ansible_version': [],
163+
'organizations': [],
164+
'forks_total': 0,
165+
'job_ids': [],
166+
'scm_types': [],
167+
'installed_collections': [],
168+
}
169+
)
164170

165171
# Preprocess dataframe
166172
dataframe = self._preprocess_dataframe(dataframe)
@@ -185,7 +191,7 @@ def prepare(self, dataframe):
185191
# Process collections statistics
186192
collections_stats = self._process_collections_from_jobs(dataframe)
187193

188-
return {
194+
result = {
189195
'by_job_type': by_job_type,
190196
'by_launch_type': by_launch_type,
191197
'by_ansible_version': by_ansible_version,
@@ -196,6 +202,9 @@ def prepare(self, dataframe):
196202
'installed_collections': collections_stats,
197203
}
198204

205+
# Sanitize to convert NumPy types to native Python types for JSON serialization
206+
return sanitize_json(result)
207+
199208
def _merge_stats_json(self, stats_all, stats_new, groupby_col):
200209
"""Merge two stats JSON lists by summing numeric columns and unioning lists."""
201210
if not stats_all:

0 commit comments

Comments
 (0)