Each collector type has an associated anonymized rollup class that processes the collected data. Collectors are located in `metrics_utility/library/collectors/controller/`, and their corresponding rollup classes are in `metrics_utility/anonymized_rollups/`.
### Collector Types
Collectors fall into two categories:
- **Since-until collectors (time-series)**: Require `since` and `until` parameters and collect data for a specific time range. They run hourly by default to collect incremental data, but the collection interval is configurable.
- **Snapshot collectors**: Take no time parameters and collect a point-in-time snapshot of the current state. They run once per day by default; the schedule is likewise configurable.
- **Description**: Collects unified job data including job status, duration, execution environment, inventory, organization, Ansible version, installed collections, and job template information. Filters jobs by their `finished` timestamp within the time range.
- **Description**: Collects job host summary data including per-job, per-host task execution statistics (ok, failed, skipped, unreachable, etc.). Uses partition-optimized queries for better performance.
- **Description**: Collects job event data including module usage, collection usage, role usage, and event statistics. This is the largest collector and also uses partition-optimized queries.
- **Description**: Collects database table metadata, including row counts and table sizes for various system tables. This is used to estimate how many rows a customer has and how much disk space those tables occupy.
- **Description**: Collects controller version information showing which versions of the controller are running.
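
To make the two shapes concrete, here is a minimal sketch of what each category of collector might look like. The class names, table names, and query columns below are illustrative assumptions, not the actual `metrics_utility` API:

```python
from datetime import datetime

import pandas as pd


class UnifiedJobsCollector:
    """Since-until (time-series) collector: requires an explicit time range."""

    def __init__(self, db):
        self.db = db  # DB connection, e.g. a SQLAlchemy engine (assumed)

    def collect(self, since: datetime, until: datetime) -> pd.DataFrame:
        # Filter jobs by their `finished` timestamp within the range.
        query = (
            "SELECT id, status, elapsed, organization_id FROM main_unifiedjob "
            "WHERE finished >= %(since)s AND finished < %(until)s"
        )
        return pd.read_sql(query, self.db, params={"since": since, "until": until})


class ControllerVersionCollector:
    """Snapshot collector: no time parameters, captures the current state."""

    def __init__(self, db):
        self.db = db

    def collect(self) -> pd.DataFrame:
        return pd.read_sql("SELECT version FROM main_instance", self.db)
```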
## 2. Rollup Flow
The anonymized rollup process follows a multi-stage flow:
### Hourly Collection
1. **Collection**: Each time-series collector runs hourly, collecting data for a specific hour (e.g., 10:00-11:00). This is important: without hourly increments, computing a whole day's data from the raw rows in one pass would be prohibitively expensive.

The data is then processed in batches (see Prepare and Merge below). Each batch computes what is essentially an hourly aggregate, which is much smaller than the raw data: a JSON-like structure of summaries such as total counts and total durations.

These summaries are updated with each batch: the results of two hourly aggregates are aggregated together. This is what we call a rollup; rollups are essentially hierarchical aggregates. The result is then aggregated with the next hour, and so on, until the whole day is covered.

The daily rollup is sent to the analytics team, which further aggregates our daily rollups into monthly and yearly rollups; that step is not part of the metrics utility.
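
For intuition, the hierarchy looks like this with plain dictionaries (illustrative keys and numbers only):

```python
# Two hourly aggregates -- already far smaller than the raw rows they summarize.
hour_10 = {"total_jobs": 120, "total_duration": 5400.0}
hour_11 = {"total_jobs": 95, "total_duration": 4100.0}

# Rolling them up just aggregates the aggregates...
partial_day = {
    "total_jobs": hour_10["total_jobs"] + hour_11["total_jobs"],
    "total_duration": hour_10["total_duration"] + hour_11["total_duration"],
}

# ...and each further hour is merged into the same accumulating structure
# until the whole day is covered.
print(partial_day)  # {'total_jobs': 215, 'total_duration': 9500.0}
```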
2. **Prepare**: The raw dataframe from the collector is passed to the rollup's `prepare()` method, which:
- Filters and preprocesses the data (e.g., filtering out unfinished jobs)
- Performs initial aggregations
- Returns a serializable dictionary or list (not a dataframe)
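
For illustration, a `prepare()` might look roughly like this; the dataframe columns and output keys are assumptions, not the real rollup schema:

```python
import pandas as pd


def prepare(df: pd.DataFrame) -> dict:
    # Keep only jobs that actually finished.
    finished = df[df["status"].isin(["successful", "failed", "canceled"])]

    # Initial aggregation: plain Python types only, so the result is
    # JSON-serializable.
    return {
        "total_jobs": int(len(finished)),
        "total_duration": float(finished["elapsed"].sum()),
        "jobs_by_status": {
            status: int(count)
            for status, count in finished["status"].value_counts().items()
        },
    }
```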
3. **Merge**: The result from `prepare()` is merged with the partial daily rollup using the `merge()` method:
- The partial daily rollup is initially empty (`None`) for the first hour
- Each subsequent hour's prepared data is merged into the accumulating daily rollup
- Both the partial rollup and the prepared data are serializable (JSON-compatible) structures
- The merge operation combines these structures appropriately (e.g., concatenating lists, summing counts)
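
A matching `merge()` could then combine two such structures (same assumed keys as in the `prepare()` sketch above):

```python
def merge(partial: dict | None, prepared: dict) -> dict:
    # First hour of the day: nothing accumulated yet.
    if partial is None:
        return prepared

    merged = {
        "total_jobs": partial["total_jobs"] + prepared["total_jobs"],
        "total_duration": partial["total_duration"] + prepared["total_duration"],
        "jobs_by_status": dict(partial["jobs_by_status"]),
    }
    # Sum counts key by key instead of overwriting them.
    for status, count in prepared["jobs_by_status"].items():
        merged["jobs_by_status"][status] = (
            merged["jobs_by_status"].get(status, 0) + count
        )
    return merged
```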
### Daily Base Processing
4. **Base**: After all hours for the day have been processed, the complete daily rollup is passed to the `base()` method, which:
- Performs final aggregations and statistics computation if needed
- Is usually quite short
- Returns a dictionary with a `json` key containing the final rollup data
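
In this picture, `base()` often amounts to little more than the following (a sketch, with the same assumed keys as above):

```python
def base(daily_rollup: dict) -> dict:
    # Final derived statistic computed once per day.
    total = daily_rollup["total_jobs"]
    daily_rollup["average_duration"] = (
        daily_rollup["total_duration"] / total if total else None
    )
    return {"json": daily_rollup}
```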
### Final Merging
5. **Combination**: All rollup results from `base()` are combined in `anonymized_rollups.py`:
- Each rollup's `json` output is collected
- All rollups are merged together using the `anonymize_rollups()` function
- The combined data is flattened into a single structure
- Sensitive data is anonymized (see section 3)
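
The general shape of that combination step, as a sketch (this stands in for, and greatly simplifies, the real `anonymize_rollups()` flow):

```python
def combine_rollups(rollup_results: dict) -> dict:
    # rollup_results maps a rollup name to the output of its base() method,
    # e.g. {"jobs": {"json": {...}}, "events": {"json": {...}}}.
    combined = {}
    for name, result in rollup_results.items():
        combined[name] = result["json"]  # collect each rollup's `json` output
    # The combined structure is then flattened and anonymized (section 3).
    return combined
```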
## 3. Anonymization
After all rollups are merged, the data goes through anonymization:
1. **String Filtering**: Any string value that is not a built-in Python type or part of a public collection (defined in `collections.json`) is either:
- Set to `"Unknown"` (for module names, collection names, and role names with `collection_source == 'Unknown'`)
- Filtered out entirely during collection (e.g., filtered by the `manage` DB column or other filters)

2. **Sanitization**: NaN and infinity values are replaced with `None` to ensure valid JSON output.

The anonymization ensures that no sensitive customer data (like custom module names, collection names, or job template names) is exposed in the final output.
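
A simplified sketch of both steps; the `PUBLIC_COLLECTIONS` set stands in for the contents of `collections.json`, and the function names are illustrative:

```python
import math

# Stand-in for the public collection list loaded from collections.json.
PUBLIC_COLLECTIONS = {"ansible.builtin", "community.general"}


def anonymize_name(name: str, collection_source: str) -> str:
    # Names whose collection is unknown or not public are replaced wholesale.
    if collection_source == "Unknown" or collection_source not in PUBLIC_COLLECTIONS:
        return "Unknown"
    return name


def sanitize(value):
    # NaN and +/-infinity are not valid JSON; replace them with None.
    if isinstance(value, float) and (math.isnan(value) or math.isinf(value)):
        return None
    return value
```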
## 4. Message Splitting
The final anonymized rollup JSON is split into multiple messages for transmission to Segment.com:
1. **Top-level Key Splitting**: Each top-level key in the JSON dictionary becomes a separate message chunk. For example:
- `statistics` → one chunk
- `module_stats` → one or more chunks (if it's a list)
- `jobs_by_job_type` → one or more chunks (if it's a list)
2. **Array Splitting**: If a top-level key contains an array (list), that array is split into multiple chunks when it exceeds the maximum message size:
- Maximum size: 24 KB (with space reserved for additional metadata)
- Each chunk contains as many array items as fit within the size limit
- Items are never split across chunks
3. **Size Calculation**: The size of each chunk is calculated as the JSON-encoded byte size of the data.
4. **Dictionary Handling**: If a top-level key contains a dictionary (not a list), it is sent as a single chunk and cannot be split. Dictionaries must therefore be smaller than the maximum message size.

The splitting logic is implemented in `metrics_utility/library/storage/segment.py` in the `_split_into_chunks()` method.
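
A simplified sketch of the array-splitting part of that logic (not the real `_split_into_chunks()` implementation; the constant and function name here are illustrative):

```python
import json

MAX_CHUNK_BYTES = 24 * 1024  # 24 KB, with headroom reserved for metadata


def split_array(key: str, items: list) -> list[dict]:
    """Split one top-level array into chunks that each fit the size limit.

    Items are never split: an item either fits into the current chunk
    or starts a new one.
    """
    chunks: list[dict] = []
    current: list = []
    for item in items:
        candidate = {key: current + [item]}
        # Size is measured as the JSON-encoded byte length of the chunk.
        if current and len(json.dumps(candidate).encode("utf-8")) > MAX_CHUNK_BYTES:
            chunks.append({key: current})
            current = []
        current.append(item)
    if current:
        chunks.append({key: current})
    return chunks
```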
## 5. Testing
To test the anonymized rollup system, use the `run_no_events.py` script: