Milan fix json (#330)

MilanPospisil · web-flow · commit d941ecd00ca2 · 2026-02-23T14:58:54.000+01:00
* Test for json serialization

* Sanitize json after prepare

* Test json

* Convert ids to strings

* Delete duplicated file

* Ruff
diff --git a/metrics_utility/anonymized_rollups/base_anonymized_rollup.py b/metrics_utility/anonymized_rollups/base_anonymized_rollup.py
@@ -24,6 +24,23 @@ def merge(self, dataframe_all, dataframe_new):
 
         return pd.concat([dataframe_all, dataframe_new], ignore_index=True)
 
+    def _convert_id_columns_to_strings(self, dataframe):
+        """Convert ID columns to strings at the beginning of prepare().
+
+        Converts whole-number numeric values in the ID columns (id, job_id, host_id,
+        job_remote_id) to strings, modifying the dataframe in place, for consistent JSON serialization.
+        """
+        if dataframe.empty:
+            return dataframe
+
+        id_columns = ['id', 'job_id', 'host_id', 'job_remote_id']
+        for col in id_columns:
+            if col in dataframe.columns:
+                # Stringify whole-number numeric IDs (5.0 -> '5'); NaN and all other values pass through unchanged
+                dataframe[col] = dataframe[col].apply(lambda x: str(int(x)) if pd.notna(x) and isinstance(x, (int, float)) and x == int(x) else x)
+
+        return dataframe
+
     # takes raw data and computes aggregation
     # this works in batches, for example we are collecting every hour
     # this hourly data arrive into prepare, then it gets merged with partial rollup (initaly empty)
diff --git a/metrics_utility/anonymized_rollups/controller_version_anonymized_rollup.py b/metrics_utility/anonymized_rollups/controller_version_anonymized_rollup.py
@@ -1,6 +1,7 @@
 import pandas as pd
 
 from metrics_utility.anonymized_rollups.base_anonymized_rollup import BaseAnonymizedRollup
+from metrics_utility.anonymized_rollups.helpers import sanitize_json
 
 
 class ControllerVersionAnonymizedRollup(BaseAnonymizedRollup):
@@ -16,13 +17,18 @@ def prepare(self, dataframe):
         """
         Transform dataframe to JSON structure with controller versions list.
         """
+        # Convert ID columns to strings at the beginning
+        if dataframe is not None and not dataframe.empty:
+            dataframe = self._convert_id_columns_to_strings(dataframe)
+
         # Handle None or empty dataframe
         if dataframe is None or dataframe.empty:
-            return []
+            return sanitize_json([])
 
         controller_versions = dataframe['controller_version'].tolist() if 'controller_version' in dataframe.columns else []
 
-        return controller_versions
+        # Sanitize to convert NumPy types to native Python types for JSON serialization
+        return sanitize_json(controller_versions)
 
     def merge(self, data_all, data_new):
         """
diff --git a/metrics_utility/anonymized_rollups/credentials_anonymized_rollup.py b/metrics_utility/anonymized_rollups/credentials_anonymized_rollup.py
@@ -1,4 +1,5 @@
 from metrics_utility.anonymized_rollups.base_anonymized_rollup import BaseAnonymizedRollup
+from metrics_utility.anonymized_rollups.helpers import sanitize_json
 
 
 class CredentialsAnonymizedRollup(BaseAnonymizedRollup):
@@ -23,26 +24,36 @@ def prepare(self, dataframe):
         Batch processing that extracts unique credential types in this batch.
         Returns a dictionary with a list of unique credential types.
         """
+        # Convert ID columns to strings at the beginning
+        dataframe = self._convert_id_columns_to_strings(dataframe)
+
         if dataframe.empty:
-            return {
-                'credential_types': [],
-            }
+            return sanitize_json(
+                {
+                    'credential_types': [],
+                }
+            )
 
         # Check if credential_type column exists (required for processing)
         if 'credential_type' not in dataframe.columns:
-            return {
-                'credential_types': [],
-            }
+            return sanitize_json(
+                {
+                    'credential_types': [],
+                }
+            )
 
         # Get unique credential types in this batch
         unique_credential_types = dataframe['credential_type'].dropna().unique()
         # Convert to sorted list of strings
         credential_types_list = sorted([str(ct) for ct in unique_credential_types])
 
-        return {
+        result = {
             'credential_types': credential_types_list,
         }
 
+        # Sanitize to convert NumPy types to native Python types for JSON serialization
+        return sanitize_json(result)
+
     def merge(self, data_all, data_new):
         """
         Merge two credential type dictionaries by unioning the credential_types lists.
diff --git a/metrics_utility/anonymized_rollups/events_modules_anonymized_rollup.py b/metrics_utility/anonymized_rollups/events_modules_anonymized_rollup.py
@@ -5,6 +5,7 @@
 import pandas as pd
 
 from metrics_utility.anonymized_rollups.base_anonymized_rollup import BaseAnonymizedRollup
+from metrics_utility.anonymized_rollups.helpers import sanitize_json
 from metrics_utility.automation_controller_billing.dataframe_engine.dataframe_content_usage import DataframeContentUsage
 
 
@@ -567,6 +568,9 @@ def _compute_unique_metadata(self, task_summary):
     # as default, merging is done by concatenating dataframes (defined in base class)
     def prepare(self, dataframe):
         """Prepare dataframe for aggregation by filtering, transforming, and computing statistics."""
+        # Convert ID columns to strings at the beginning
+        dataframe = self._convert_id_columns_to_strings(dataframe)
+
         collected_events_total, warnings_total, deprecations_total = self._count_initial_statistics(dataframe)
 
         event_lists = self._get_event_lists()
@@ -582,17 +586,19 @@ def prepare(self, dataframe):
         task_summary = self._aggregate_task_summary(task_summary)
 
         if task_summary.empty:
-            return {
-                'collected_events_total': collected_events_total,
-                'warnings_total': warnings_total,
-                'deprecations_total': deprecations_total,
-                'module_stats': [],
-                'collection_stats': [],
-                'role_stats': [],
-                'unique_modules': [],
-                'modules_per_playbook': {},
-                'unique_hosts': [],
-            }
+            return sanitize_json(
+                {
+                    'collected_events_total': collected_events_total,
+                    'warnings_total': warnings_total,
+                    'deprecations_total': deprecations_total,
+                    'module_stats': [],
+                    'collection_stats': [],
+                    'role_stats': [],
+                    'unique_modules': [],
+                    'modules_per_playbook': {},
+                    'unique_hosts': [],
+                }
+            )
 
         task_summary = task_summary.assign(
             jobs_successful_duration_total_seconds=lambda x: x['job_duration_seconds'].where(~x['job_failed'], 0),
@@ -603,7 +609,7 @@ def prepare(self, dataframe):
         module_stats_json, collection_stats_json, role_stats_json = self._convert_stats_to_json(module_stats, collection_stats, role_stats)
         unique_modules, modules_per_playbook, unique_hosts = self._compute_unique_metadata(task_summary)
 
-        return {
+        result = {
             'collected_events_total': collected_events_total,
             'warnings_total': warnings_total,
             'deprecations_total': deprecations_total,
@@ -615,6 +621,9 @@ def prepare(self, dataframe):
             'unique_hosts': unique_hosts,
         }
 
+        # Sanitize to convert NumPy types to native Python types for JSON serialization
+        return sanitize_json(result)
+
     def base(self, data):
         """
         *Failure/Success rate of modules
diff --git a/metrics_utility/anonymized_rollups/execution_environments_anonymized_rollup.py b/metrics_utility/anonymized_rollups/execution_environments_anonymized_rollup.py
@@ -1,6 +1,7 @@
 import pandas as pd
 
 from metrics_utility.anonymized_rollups.base_anonymized_rollup import BaseAnonymizedRollup
+from metrics_utility.anonymized_rollups.helpers import sanitize_json
 
 
 class ExecutionEnvironmentsAnonymizedRollup(BaseAnonymizedRollup):
@@ -16,25 +17,34 @@ def prepare(self, dataframe):
         """
         Transform dataframe to JSON structure with totals for managed, unmanaged and all EE.
         """
+        # Convert ID columns to strings at the beginning
+        if dataframe is not None and not dataframe.empty:
+            dataframe = self._convert_id_columns_to_strings(dataframe)
+
         # Handle None or empty dataframe
         if dataframe is None or dataframe.empty:
-            return {
-                'execution_environments_total': 0,
-                'execution_environments_default_total': 0,
-                'execution_environments_custom_total': 0,
-            }
+            return sanitize_json(
+                {
+                    'execution_environments_total': 0,
+                    'execution_environments_default_total': 0,
+                    'execution_environments_custom_total': 0,
+                }
+            )
 
         execution_environments_total = int(len(dataframe))
         dataframe['managed'] = dataframe['managed'].map({'t': True, 'f': False, True: True, False: False})
         execution_environments_default_total = int(dataframe['managed'].sum())
         execution_environments_custom_total = execution_environments_total - execution_environments_default_total
 
-        return {
+        result = {
             'execution_environments_total': execution_environments_total,
             'execution_environments_default_total': execution_environments_default_total,
             'execution_environments_custom_total': execution_environments_custom_total,
         }
 
+        # Sanitize to convert NumPy types to native Python types for JSON serialization
+        return sanitize_json(result)
+
     def merge(self, data_all, data_new):
         """
         For snapshot collectors, always pick new data (no merging needed).
diff --git a/metrics_utility/anonymized_rollups/helpers.py b/metrics_utility/anonymized_rollups/helpers.py
@@ -5,18 +5,29 @@
 import math
 
 
+try:
+    import numpy as np
+
+    HAS_NUMPY = True
+except ImportError:
+    HAS_NUMPY = False
+
+
 def sanitize_json(obj):
     """
-    Sanitize a Python object to be JSON-serializable by replacing NaN and infinity values.
+    Sanitize a Python object to be JSON-serializable by replacing NaN and infinity values
+    and converting NumPy types to native Python types.
 
     This function recursively traverses dictionaries, lists, and other data structures
-    and replaces any NaN or infinity values with None (which becomes null in JSON).
+    and replaces any NaN or infinity values with None (which becomes null in JSON),
+    and converts NumPy types (int64, float64, etc.) to native Python types.
 
     Args:
         obj: The object to sanitize (can be dict, list, float, int, str, etc.)
 
     Returns:
         The sanitized object with all NaN and infinity values replaced with None
+        and NumPy types converted to native Python types
 
     Examples:
         >>> sanitize_json({'value': float('nan')})
@@ -27,6 +38,10 @@ def sanitize_json(obj):
 
         >>> sanitize_json({'nested': {'value': float('-inf')}})
         {'nested': {'value': None}}
+
+        >>> import numpy as np
+        >>> sanitize_json({'value': np.int64(42)})
+        {'value': 42}
     """
     if isinstance(obj, dict):
         # Recursively sanitize dictionary values
@@ -37,6 +52,15 @@ def sanitize_json(obj):
     elif isinstance(obj, tuple):
         # Recursively sanitize tuple items (convert to list for JSON)
         return [sanitize_json(item) for item in obj]
+    elif HAS_NUMPY and isinstance(obj, (np.integer, np.floating)):
+        # Convert NumPy integer and float types to native Python types
+        # Check for NaN or infinity first
+        if isinstance(obj, np.floating) and (math.isnan(obj) or math.isinf(obj)):
+            return None
+        return obj.item()  # Convert NumPy scalar to native Python type
+    elif HAS_NUMPY and isinstance(obj, np.ndarray):
+        # Convert NumPy arrays to lists
+        return sanitize_json(obj.tolist())
     elif isinstance(obj, float):
         # Check for NaN or infinity
         if math.isnan(obj) or math.isinf(obj):
diff --git a/metrics_utility/anonymized_rollups/jobhostsummary_anonymized_rollup.py b/metrics_utility/anonymized_rollups/jobhostsummary_anonymized_rollup.py
@@ -1,6 +1,7 @@
 import pandas as pd
 
 from metrics_utility.anonymized_rollups.base_anonymized_rollup import BaseAnonymizedRollup
+from metrics_utility.anonymized_rollups.helpers import sanitize_json
 
 
 class JobHostSummaryAnonymizedRollup(BaseAnonymizedRollup):
@@ -267,17 +268,22 @@ def _aggregate_by_ansible_version(self, aggregated_by_job, common_aggregations):
         return aggregations_by_ansible_version
 
     def prepare(self, dataframe):
+        # Convert ID columns to strings at the beginning
+        dataframe = self._convert_id_columns_to_strings(dataframe)
+
         # Count all records before processing
         job_host_pairs_total = len(dataframe)
 
         # Handle empty dataframe
         if dataframe.empty:
-            return {
-                'by_job_type': [],
-                'by_launch_type': [],
-                'by_ansible_version': [],
-                'job_host_pairs_total': job_host_pairs_total,
-            }
+            return sanitize_json(
+                {
+                    'by_job_type': [],
+                    'by_launch_type': [],
+                    'by_ansible_version': [],
+                    'job_host_pairs_total': job_host_pairs_total,
+                }
+            )
 
         # Normalize dataframe columns
         self._normalize_dataframe(dataframe)
@@ -298,13 +304,16 @@ def prepare(self, dataframe):
         by_launch_type = aggregations_by_launch_type.to_dict(orient='records')
         by_ansible_version = aggregations_by_ansible_version.to_dict(orient='records')
 
-        return {
+        result = {
             'by_job_type': by_job_type,
             'by_launch_type': by_launch_type,
             'by_ansible_version': by_ansible_version,
             'job_host_pairs_total': job_host_pairs_total,
         }
 
+        # Sanitize to convert NumPy types to native Python types for JSON serialization
+        return sanitize_json(result)
+
     def base(self, data):
         """
         Returns the already-aggregated JSON data from prepare() and merge().
diff --git a/metrics_utility/anonymized_rollups/jobs_anonymized_rollup.py b/metrics_utility/anonymized_rollups/jobs_anonymized_rollup.py
@@ -5,6 +5,7 @@
 import pandas as pd
 
 from metrics_utility.anonymized_rollups.base_anonymized_rollup import BaseAnonymizedRollup
+from metrics_utility.anonymized_rollups.helpers import sanitize_json
 
 
 class JobsAnonymizedRollup(BaseAnonymizedRollup):
@@ -146,21 +147,26 @@ def _extract_metadata(self, dataframe):
         return organizations, job_ids, forks_total, scm_types
 
     def prepare(self, dataframe):
+        # Convert ID columns to strings at the beginning
+        dataframe = self._convert_id_columns_to_strings(dataframe)
+
         # Filter out jobs that are not finished
         dataframe = dataframe[dataframe['finished'].notna()]
 
         # Handle empty dataframe
         if dataframe.empty:
-            return {
-                'by_job_type': [],
-                'by_launch_type': [],
-                'by_ansible_version': [],
-                'organizations': [],
-                'forks_total': 0,
-                'job_ids': [],
-                'scm_types': [],
-                'installed_collections': [],
-            }
+            return sanitize_json(
+                {
+                    'by_job_type': [],
+                    'by_launch_type': [],
+                    'by_ansible_version': [],
+                    'organizations': [],
+                    'forks_total': 0,
+                    'job_ids': [],
+                    'scm_types': [],
+                    'installed_collections': [],
+                }
+            )
 
         # Preprocess dataframe
         dataframe = self._preprocess_dataframe(dataframe)
@@ -185,7 +191,7 @@ def prepare(self, dataframe):
         # Process collections statistics
         collections_stats = self._process_collections_from_jobs(dataframe)
 
-        return {
+        result = {
             'by_job_type': by_job_type,
             'by_launch_type': by_launch_type,
             'by_ansible_version': by_ansible_version,
@@ -196,6 +202,9 @@ def prepare(self, dataframe):
             'installed_collections': collections_stats,
         }
 
+        # Sanitize to convert NumPy types to native Python types for JSON serialization
+        return sanitize_json(result)
+
     def _merge_stats_json(self, stats_all, stats_new, groupby_col):
         """Merge two stats JSON lists by summing numeric columns and unioning lists."""
         if not stats_all:
diff --git a/metrics_utility/anonymized_rollups/table_metadata_anonymized_rollup.py b/metrics_utility/anonymized_rollups/table_metadata_anonymized_rollup.py
diff --git a/metrics_utility/test/test_anonymized_rollups/test_json.py b/metrics_utility/test/test_anonymized_rollups/test_json.py