diff --git a/.github/workflows/virtual-integration.yml b/.github/workflows/virtual-integration.yml index 155be369d..ae7bdee3e 100644 --- a/.github/workflows/virtual-integration.yml +++ b/.github/workflows/virtual-integration.yml @@ -449,414 +449,233 @@ jobs: run: | mage publish:releaseManifest - deploy-kind: + # ============================================================================ + # TEMPORARY VALIDATION JOB - Remove after validation is complete + # This job embeds diagnostics convergence validation for testing purposes + # ============================================================================ + diagnostics-convergence-validation: + name: Diagnostics Convergence Validation (TEMPORARY) permissions: + actions: read contents: read - name: Deploy Kind Orchestrator and run tests - needs: - - lint-go - - lint-markdown - - lint-shell - - lint-terraform - - lint-version - - lint-helm - - lint-yaml - - check-changed-files - if: | - always() && - needs.lint-version.result == 'success' && - needs.check-changed-files.outputs.only_design_proposals != 'true' && ( - needs.check-changed-files.outputs.orch == 'true' || - needs.check-changed-files.outputs.ci == 'true' || - github.ref == 'refs/heads/main' || - github.ref == 'refs/heads/main-pass-validation' - ) - runs-on: ubuntu-24.04-16core-64GB - timeout-minutes: 90 - env: - ORCH_DEFAULT_PASSWORD: ${{ secrets.ORCH_DEFAULT_PASSWORD }} + # Always run, but do early exit if all target jobs succeeded + if: always() + runs-on: ubuntu-latest + timeout-minutes: 20 steps: - - name: Checkout code - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - persist-credentials: false - - name: Install DNSmasq - run: | - cd ci/ven - ./dnsmasq-setup.sh "kind.internal" setup - - name: Install Libvirt - env: - LIBVIRT_DEFAULT_URI: 'qemu:///system' - run: | - cd ci/ven - ./libvirt-setup.sh - - name: Deploy Kind Orchestrator - id: deploy-kind-orchestrator - uses: ./.github/actions/deploy_kind - timeout-minutes: 
45 - with: - orch_version: ${{ github.event.pull_request.head.sha || github.sha }} - orch_password: ${{ secrets.ORCH_DEFAULT_PASSWORD }} - docker_username: ${{ secrets.SYS_DOCKERHUB_USERNAME }} - docker_password: ${{ secrets.SYS_DOCKERHUB_RO }} - token: ${{ secrets.SYS_EMF_GH_TOKEN }} - deployment_type: all - - name: Config DNSmasq - run: | - cd ci/ven - ./dnsmasq-setup.sh "kind.internal" config - - name: Run policy compliance tests - run: mage test:policyCompliance - - name: Run image pull policy compliance tests - run: mage test:imagePullPolicyCompliance - - - name: Setup Sample Org and Project with default users - id: default-mt-setup - run: mage tenantUtils:createDefaultMtSetup - - name: Deploy Victoria Metrics instance - env: - ORCH_DEFAULT_PASSWORD: ${{ secrets.ORCH_DEFAULT_PASSWORD }} - run: mage deploy:victoriaMetrics apply - - # NOTE: Tenancy tests (e2etenancy) are SKIPPED - covered by Golden Suite CF-1 and CF-11 - - - name: Run e2e tenancy API gateway tests - run: mage test:e2etenancyapigw - - - name: Create default user and run e2e tests - run: mage devUtils:createDefaultUser test:e2e - - - name: "Test Observability SRE Exporter w/o VEN" - env: - ORCH_DEFAULT_PASSWORD: ${{ secrets.ORCH_DEFAULT_PASSWORD }} - run: | - mage test:e2eSreObservabilityNoEnic - - - name: "Test Observability Public Endpoints" - env: - ORCH_DEFAULT_PASSWORD: ${{ secrets.ORCH_DEFAULT_PASSWORD }} - run: | - mage test:e2eObservability - - - name: "Test Observability Orchestrator Stack" - env: - ORCH_DEFAULT_PASSWORD: ${{ secrets.ORCH_DEFAULT_PASSWORD }} - run: | - mage test:e2eOrchObservability + - name: Check if convergence analysis is needed + id: check-need-analysis + run: | + # Check if at least one job is non-success (failure, cancelled, etc.) 
+ need_analysis=true - - name: Read test-automation dependency Version - id: read-test-automation-version - shell: bash - run: | - cat /proc/cpuinfo - version=$(yq '.test-automation.version' ${{ github.workspace }}/.test-dependencies.yaml | tr -d '\n' | xargs) - echo $version - echo "version=$version" >> $GITHUB_OUTPUT - - name: Checkout edge-manage-test-automation repository with submodules - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - repository: open-edge-platform/edge-manage-test-automation - ref: ${{ steps.read-test-automation-version.outputs.version }} - path: edge-manage-test-automation - submodules: 'recursive' - token: ${{ secrets.SYS_EMF_GH_TOKEN }} - persist-credentials: false - - name: Setup virtual environment - working-directory: edge-manage-test-automation - run: | - git submodule update --init --recursive - make asdf-install - make venv_edge-manage-test-automation - # install required versions for Pico - pushd repos/ven/pico - asdf install - sudo apt-get install xsltproc - popd - - # Golden Suite runs the following tests: - # CF-1: Tenancy Management (Create Org/Project) - # CF-2: Tenancy Isolation - # CF-3: Provision EN (VEN onboarding) - # CF-4: Single Node Cluster Creation - # CF-5: Deploy Container App (WordPress) - # CF-6: SKIPPED - VM deployment issues with VEN - # CF-7: Assert Metrics Collected - # CF-8: SKIPPED - Cleanup for apps, CF-6 VM app (not deployed) - # CF-9: Cleanup Cluster - # CF-10: Cleanup Host - # CF-11: Cleanup Tenancy - - name: Run Golden Suite Robot Framework Tests - id: robot-tests - working-directory: edge-manage-test-automation - timeout-minutes: 45 - env: - REQUESTS_CA_BUNDLE: /usr/local/share/ca-certificates/orch-ca.crt - LIBVIRT_DEFAULT_URI: 'qemu:///system' - run: | - kubectl -n orch-platform get secrets platform-keycloak -o yaml || true - KC_ADMIN_PWD=$(kubectl -n orch-platform get secrets platform-keycloak -o jsonpath='{.data.admin-password}' | base64 -d) - # Add the password to 
the orchestrator config - yq eval ".orchestrator.admin_password = \"${KC_ADMIN_PWD}\"" -i orchestrator-configs/kind.yaml - yq eval '.infra.host.edgenode.hw_info.libvirt_pool_name = "default"' -i tests/core_foundation/data/cf_data_2_ven_VEN-libvirt_ubuntu-24.04-lts.yaml - yq eval '.infra.host.edgenode.hw_info.libvirt_network_name = "default"' -i tests/core_foundation/data/cf_data_2_ven_VEN-libvirt_ubuntu-24.04-lts.yaml - cat tests/core_foundation/data/cf_data_2_ven_VEN-libvirt_ubuntu-24.04-lts.yaml || true - source venv_edge-manage-test-automation/bin/activate - robot -L DEBUG --pythonpath . \ - --name "Golden Suite: Core Foundation" \ - -d robot_output/core_foundation \ - -V orchestrator-configs/kind.yaml \ - -V tests/core_foundation/data/cf_data_2_ven_VEN-libvirt_ubuntu-24.04-lts.yaml \ - --exitonfailure \ - --exclude cf6 \ - --exclude cf8 \ - tests/core_foundation/core_foundation.robot - - - name: Upload test artifacts - if: always() - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 - with: - name: kind-${{ github.event_name }}-${{ github.event.number }}-robot-report - path: | - edge-manage-test-automation/robot_output/**/* - - - name: Collect diagnostics - if: always() - uses: ./.github/actions/collect_diagnostics - timeout-minutes: 15 - with: - k8s_diagnostics_args: "--errors-only --include-logs --output-html --output-json" + echo "need_analysis=$need_analysis" >> $GITHUB_OUTPUT + + if [[ "$need_analysis" == "false" ]]; then + echo "āœ… All target jobs succeeded. Convergence analysis skipped." + echo "## Convergence Analysis Skipped" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "All target deploy jobs succeeded, so convergence analysis was not needed." >> $GITHUB_STEP_SUMMARY + exit 0 + fi + + echo "šŸ” At least one job had issues. Proceeding with convergence analysis." 
- deploy-on-prem: - permissions: - contents: read - name: Deploy On-Prem Orchestrator and Run Golden Suite Core Foundation Tests - needs: - - lint-go - - lint-markdown - - lint-shell - - lint-terraform - - lint-version - - lint-helm - - lint-yaml - - build-publish - - check-changed-files - if: | - always() && - needs.build-publish.result == 'success' && - needs.check-changed-files.outputs.only_design_proposals != 'true' && ( - needs.check-changed-files.outputs.orch == 'true' || - needs.check-changed-files.outputs.on-prem == 'true' || - needs.check-changed-files.outputs.ci == 'true' || - needs.check-changed-files.outputs.test-automation == 'true' || - github.ref == 'refs/heads/main' || - github.ref == 'refs/heads/main-pass-validation' - ) - runs-on: ubuntu-24.04-16core-64GB - timeout-minutes: 90 - env: - KUBECONFIG: ${{ github.workspace }}/terraform/orchestrator/files/kubeconfig - steps: - - name: Checkout Orchestrator repo + - name: Checkout repository + if: steps.check-need-analysis.outputs.need_analysis == 'true' uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: persist-credentials: false - - name: Read test-automation dependency Version - id: read-test-automation-version - shell: bash + - name: Set up Python + if: steps.check-need-analysis.outputs.need_analysis == 'true' + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: '3.11' + + - name: List recent workflow runs + if: steps.check-need-analysis.outputs.need_analysis == 'true' + id: list-runs + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 + with: + script: | + // Number of recent runs to analyze for convergence + const MAX_RUNS = 20; + + const runs = await github.rest.actions.listWorkflowRuns({ + owner: context.repo.owner, + repo: context.repo.repo, + workflow_id: 'virtual-integration.yml', + per_page: MAX_RUNS, + status: 'completed' + }); + + core.info(`Found ${runs.data.workflow_runs.length} 
completed workflow runs`); + + // Extract run IDs and metadata + const runData = runs.data.workflow_runs.map(run => ({ + id: run.id, + number: run.run_number, + event: run.event, + status: run.status, + conclusion: run.conclusion, + created_at: run.created_at + })); + + // Save to output file for next step + const fs = require('fs'); + fs.writeFileSync('workflow_runs.json', JSON.stringify(runData, null, 2)); + + return runData; + + - name: Download diagnostics artifacts + if: steps.check-need-analysis.outputs.need_analysis == 'true' + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 + with: + script: | + const fs = require('fs'); + const path = require('path'); + + // Read workflow runs from previous step + const runData = JSON.parse(fs.readFileSync('workflow_runs.json', 'utf8')); + + // Create artifacts directory + const artifactsDir = 'downloaded-artifacts'; + fs.mkdirSync(artifactsDir, { recursive: true }); + + const targetJobs = ['deploy-kind', 'deploy-on-prem', 'deploy-oxm-profile']; + let downloadedCount = 0; + let totalAttempted = 0; + + // Number of recent runs to analyze (should match MAX_RUNS above) + const MAX_RUNS = 20; + + // Download artifacts from recent runs + for (const run of runData.slice(0, MAX_RUNS)) { + core.info(`\nChecking run #${run.number} (ID: ${run.id})`); + + try { + const artifacts = await github.rest.actions.listWorkflowRunArtifacts({ + owner: context.repo.owner, + repo: context.repo.repo, + run_id: run.id + }); + + for (const artifact of artifacts.data.artifacts) { + // Check if artifact matches our target pattern + for (const job of targetJobs) { + if (artifact.name.startsWith(`diagnostics-${job}-`)) { + totalAttempted++; + core.info(` Found artifact: ${artifact.name}`); + + try { + // Download artifact + const download = await github.rest.actions.downloadArtifact({ + owner: context.repo.owner, + repo: context.repo.repo, + artifact_id: artifact.id, + archive_format: 'zip' + }); + + // Save to file + 
const artifactPath = path.join(artifactsDir, `${artifact.name}.zip`); + fs.writeFileSync(artifactPath, Buffer.from(download.data)); + downloadedCount++; + + core.info(` āœ“ Downloaded ${artifact.name}`); + } catch (downloadError) { + core.warning(` āœ— Failed to download ${artifact.name}: ${downloadError.message}`); + } + + break; + } + } + } + } catch (error) { + core.warning(`Failed to list artifacts for run ${run.id}: ${error.message}`); + } + } + + core.info(`\nšŸ“Š Download summary: ${downloadedCount}/${totalAttempted} artifacts downloaded`); + + // Save summary for next step + fs.writeFileSync('download_summary.json', JSON.stringify({ + downloaded: downloadedCount, + attempted: totalAttempted + }, null, 2)); + + - name: Extract and organize artifacts + if: steps.check-need-analysis.outputs.need_analysis == 'true' run: | - cat /proc/cpuinfo - version=$(yq '.test-automation.version' ${{ github.workspace }}/.test-dependencies.yaml | tr -d '\n' | xargs) - echo $version - echo "version=$version" >> $GITHUB_OUTPUT - - - name: Deploy On-Prem Orchestrator - id: deploy-on-prem - uses: ./.github/actions/deploy_on_prem - timeout-minutes: 60 - with: - orch_version: ${{ github.event.pull_request.head.sha }} - docker_username: ${{ secrets.SYS_DOCKERHUB_USERNAME }} - docker_password: ${{ secrets.SYS_DOCKERHUB_RO }} - - - name: Run E2E tests - env: - E2E_SVC_DOMAIN: cluster.onprem - EDGE_CLUSTER_NAME: test-cluster - run: mage -v test:e2eOnPrem - - - name: Checkout edge-manage-test-automation repository with submodules - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - repository: open-edge-platform/edge-manage-test-automation - path: edge-manage-test-automation - ref: ${{ steps.read-test-automation-version.outputs.version }} - submodules: 'recursive' - token: ${{ secrets.SYS_EMF_GH_TOKEN }} - persist-credentials: false + mkdir -p extracted-artifacts + cd downloaded-artifacts + + # Extract all downloaded artifacts + for zipfile in *.zip; do + 
if [[ -f "$zipfile" ]]; then + echo "Extracting $zipfile..." + artifact_name="${zipfile%.zip}" + mkdir -p "../extracted-artifacts/$artifact_name" + unzip -q "$zipfile" -d "../extracted-artifacts/$artifact_name" + fi + done + + cd .. + echo "Extracted artifacts:" + find extracted-artifacts -name "diagnostics_full_*.json" -type f - - name: Install vEN Deps - working-directory: edge-manage-test-automation + - name: Run convergence analysis + if: steps.check-need-analysis.outputs.need_analysis == 'true' run: | - git submodule update --init --recursive + echo "Running diagnostics convergence analysis..." + + # Check if we have any artifacts to analyze + json_count=$(find extracted-artifacts -name "diagnostics_full_*.json" -type f | wc -l) + echo "Found $json_count diagnostics JSON files" + + if [[ $json_count -eq 0 ]]; then + echo "āš ļø No diagnostics data found. Creating empty convergence report." + mkdir -p convergence-output + echo '{"status": "no_data", "message": "No diagnostics artifacts found", "total_runs": 0}' > convergence-output/convergence.json + echo "# No Diagnostics Data Available" > convergence-output/convergence.md + echo "No diagnostics artifacts were found for analysis." >> convergence-output/convergence.md + echo '{"total_artifacts": 0, "artifacts": []}' > convergence-output/manifest.json + else + # Run the convergence analyzer + python3 ci/diagnostics_convergence.py extracted-artifacts --output-dir convergence-output + fi + + echo "Convergence analysis complete!" 
+ ls -lh convergence-output/ - - name: Setup virtual environment - working-directory: edge-manage-test-automation - run: | - make asdf-install - make venv_edge-manage-test-automation - - # install required versions for Pico - pushd repos/ven/pico - asdf install - popd - - - name: Run Golden Suite Robot Framework Tests - id: robot-tests - timeout-minutes: 30 - working-directory: edge-manage-test-automation - env: - KUBECONFIG: ${{ github.workspace }}/terraform/orchestrator/files/kubeconfig - REQUESTS_CA_BUNDLE: /usr/local/share/ca-certificates/orch-ca.crt - LIBVIRT_DEFAULT_URI: 'qemu:///system' - run: | - KC_ADMIN_PWD=$(kubectl -n orch-platform get secrets platform-keycloak -o jsonpath='{.data.admin-password}' | base64 -d) - # Add the password to the orchestrator config - yq eval ".orchestrator.admin_password = \"${KC_ADMIN_PWD}\"" -i orchestrator-configs/on-prem.yaml - - source venv_edge-manage-test-automation/bin/activate - robot -L DEBUG --pythonpath . \ - --name "Golden Suite: Core Foundation" \ - -d robot_output/core_foundation \ - -V orchestrator-configs/on-prem.yaml \ - --exitonfailure \ - --exclude cf6 \ - --exclude cf8 \ - tests/core_foundation/core_foundation.robot - - - name: Upload test artifacts - if: always() + - name: Upload convergence outputs + if: steps.check-need-analysis.outputs.need_analysis == 'true' uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: - name: on-prem-${{ github.event_name }}-${{ github.event.number }}-robot-report + name: convergence-analysis-${{ github.run_number }}-${{ github.run_attempt }} path: | - edge-manage-test-automation/robot_output/**/* - - - name: Collect diagnostics - if: always() - uses: ./.github/actions/collect_diagnostics - timeout-minutes: 15 - with: - k8s_diagnostics_args: "--errors-only --include-logs --output-html --output-json" - - deploy-oxm-profile: - permissions: - contents: read - name: Deploy OXM Orchestrator Profile - needs: - - lint-go - - lint-markdown - - 
lint-shell - - lint-terraform - - lint-version - - lint-helm - - lint-yaml - - build-publish - - check-changed-files - if: | - always() && - needs.build-publish.result == 'success' && ( - needs.check-changed-files.outputs.on-prem-oxm == 'true' || - needs.check-changed-files.outputs.ci == 'true' ) - runs-on: ubuntu-22.04-16core-64GB - timeout-minutes: 90 - env: - KUBECONFIG: ${{ github.workspace }}/terraform/orchestrator/files/kubeconfig - steps: - - name: Checkout Orchestrator repo - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - persist-credentials: false - - - name: Deploy OXM Orchestrator Profile - id: deploy-oxm-profile - uses: ./.github/actions/deploy_on_prem - timeout-minutes: 60 - env: - TF_VAR_no_proxy: "localhost,127.0.0.0/8,10.0.0.0/8,192.168.0.0/16,.svc,.cluster.local,.default,.internal,.orch-platform,.orch-app,.orch-cluster,.orch-infra,.orch-database,.cattle-system,.orch-secret,.onprem" - TF_VAR_en_http_proxy: "http://192.168.99.30:8080" - TF_VAR_en_https_proxy: "http://192.168.99.30:8080" - TF_VAR_oxm_pxe_server_int: "orchnet" - TF_VAR_oxm_pxe_server_ip: "192.168.99.20" - TF_VAR_oxm_pxe_server_subnet: "192.168.99.0" - TF_VAR_enable_explicit_proxy: "true" - with: - orch_version: ${{ github.event.pull_request.head.sha }} - orch_profile: onprem-oxm - docker_username: ${{ secrets.SYS_DOCKERHUB_USERNAME }} - docker_password: ${{ secrets.SYS_DOCKERHUB_RO }} - - - name: Create MT Sample Org and Project with default users - run: mage tenantUtils:createDefaultMtSetup - - - name: Test Edge Node onboarding - env: - E2E_SVC_DOMAIN: cluster.onprem - EDGE_CLUSTER_NAME: test-cluster - EN_PROFILE: "microvisor-standalone" - timeout-minutes: 20 - if: | - needs.check-changed-files.outputs.onboarding == 'true' || - needs.check-changed-files.outputs.on-prem == 'true' || - needs.check-changed-files.outputs.orch == 'true' || - needs.check-changed-files.outputs.ci == 'true' + convergence-output/convergence.json + 
convergence-output/convergence.md + convergence-output/manifest.json + retention-days: 15 + compression-level: 6 + + - name: Write summary to GITHUB_STEP_SUMMARY + if: steps.check-need-analysis.outputs.need_analysis == 'true' run: | - set +e - mage test:onboarding - TEST_EXIT_CODE=$? - sudo cat /var/log/libvirt/qemu/edge-node-EN123456789-console.log || true - exit $TEST_EXIT_CODE - - - name: Collect diagnostics - if: always() - uses: ./.github/actions/collect_diagnostics - timeout-minutes: 15 - with: - k8s_diagnostics_args: "--errors-only --include-logs --output-html --output-json" - - tag-repo: - permissions: - contents: read - name: Tag repo - needs: - - lint-version - - build-publish - - deploy-kind - - deploy-on-prem - - deploy-oxm-profile - if: github.event_name == 'push' && ( github.ref == 'refs/heads/main' || github.ref == 'refs/heads/main-pass-validation' ) - runs-on: ubuntu-latest - timeout-minutes: 10 - steps: - - name: Checkout code - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - # Fetch all history for all tags and branches - fetch-depth: 0 - persist-credentials: false - - - name: Prepare ci tools - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - repository: open-edge-platform/orch-ci - token: ${{ secrets.SYS_EMF_GH_TOKEN }} - path: orch-ci - persist-credentials: false - - - name: Tag repo - env: - GITHUB_TOKEN: ${{ secrets.SYS_EMF_GH_TOKEN }} - run: orch-ci/scripts/version-tag.sh + echo "## šŸ“Š Diagnostics Convergence Analysis" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + + # Include the markdown report + if [[ -f convergence-output/convergence.md ]]; then + cat convergence-output/convergence.md >> $GITHUB_STEP_SUMMARY + else + echo "āš ļø Convergence report not generated" >> $GITHUB_STEP_SUMMARY + fi + + echo "" >> $GITHUB_STEP_SUMMARY + echo "---" >> $GITHUB_STEP_SUMMARY + echo "*This is a temporary validation job - will be removed after validation*" >> 
$GITHUB_STEP_SUMMARY post-merge: permissions: diff --git a/ci/diagnostics_convergence.py b/ci/diagnostics_convergence.py new file mode 100755 index 000000000..a2d868e6f --- /dev/null +++ b/ci/diagnostics_convergence.py @@ -0,0 +1,606 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: (C) 2026 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +""" +Diagnostics Convergence Analyzer +================================= + +Analyzes diagnostics artifacts across multiple workflow runs to identify: +- Error patterns that persist across runs +- Transient vs recurring issues +- Convergence trends (are errors decreasing/increasing?) +- Most frequent issue types +- Likely blockers / dependency chains (from ArgoCD status messages) + +Usage: + python ci/diagnostics_convergence.py + +Where artifacts_dir contains extracted diagnostics artifacts with diagnostics_full_*.json files. + +Outputs: +- convergence.json: Full convergence analysis with error trends + issue-type and blocker aggregates +- convergence.md: Human-readable markdown report +- manifest.json: List of analyzed artifacts with metadata +""" + +import argparse +import hashlib +import json +import os +import re +import sys +from collections import Counter, defaultdict +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple + +# Convergence analysis constants +PERSISTENT_ERROR_THRESHOLD = 0.5 # Errors occurring in >50% of runs are considered persistent +IMPROVING_THRESHOLD = 0.8 # Trend is improving if errors decrease by >20% +DEGRADING_THRESHOLD = 1.2 # Trend is degrading if errors increase by >20% + +# Regexes for ArgoCD causal blockers +ARGOCD_WAIT_DEPLOY_RE = re.compile(r"waiting for healthy state of apps/Deployment/([a-zA-Z0-9-_.]+)") +ARGOCD_WAIT_APP_RE = re.compile(r"waiting for healthy state of argoproj\.io/Application/([a-zA-Z0-9-_.]+)") + +# Heuristic to turn pod name into workload: strip "--" +# Example: dm-manager-84987656cf-kvn52 -> dm-manager 
+POD_WORKLOAD_RE = re.compile(r"^(?P<base>.+)-[a-f0-9]{8,10}-[a-z0-9]{4,6}$") + + +def _short_hash(text: str, length: int = 8) -> str: + if not text: + return "nohash" + return hashlib.sha1(text.encode("utf-8", errors="ignore")).hexdigest()[:length] + + +def _normalize_text(text: str) -> str: + """Normalize noisy text by removing GUID-like tokens and collapsing whitespace.""" + if not text: + return "" + t = text + t = re.sub(r"\b[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b", "", t, flags=re.IGNORECASE) + t = re.sub(r"\s+", " ", t).strip() + return t + + +def normalize_pod_to_workload(pod_name: str) -> str: + m = POD_WORKLOAD_RE.match(pod_name or "") + if m: + return m.group("base") + # Fallback: strip last 2 segments if they look like rs-hash + suffix + parts = (pod_name or "").split("-") + if len(parts) >= 3 and re.fullmatch(r"[a-f0-9]{8,10}", parts[-2]) and re.fullmatch(r"[a-z0-9]{4,6}", parts[-1]): + return "-".join(parts[:-2]) + return pod_name or "unknown" + + +def parse_job_from_artifact_dir(artifact_dirname: str) -> Dict[str, str]: + """ + artifact_dirname example: diagnostics-deploy-on-prem-pull_request-1458-1 + Return: + - job: deploy-on-prem + - event: pull_request + - pr_or_run: 1458 (best effort) + - attempt: 1 (best effort) + """ + out = {"job": "unknown", "event": "unknown", "pr_or_run": "", "attempt": ""} + name = artifact_dirname or "" + if not name.startswith("diagnostics-"): + return out + + # Find an event marker that separates job from the rest + for event in ["pull_request", "push", "merge_group", "workflow_dispatch", "schedule"]: + marker = f"-{event}-" + if marker in name: + before, after = name.split(marker, 1) + out["job"] = before[len("diagnostics-") :] + out["event"] = event + # remaining often looks like "<num>-<attempt>" + m = re.match(r"(?P<num>\d+)(?:-(?P<attempt>\d+))?", after) + if m: + out["pr_or_run"] = m.group("num") or "" + out["attempt"] = m.group("attempt") or "" + return out + + # Fallback: old format + out["job"] = name[len("diagnostics-") 
:] + return out + + +def deployment_issue_type(dep: Dict[str, Any]) -> str: + """ + Prefer a non-ready condition reason (Available=False or Progressing=False) over dep['reason']. + """ + conditions = dep.get("conditions") or [] + # Prefer Available=False reason + for c in conditions: + if c.get("type") == "Available" and str(c.get("status")) == "False": + return c.get("reason") or "AvailableFalse" + # Then Progressing=False + for c in conditions: + if c.get("type") == "Progressing" and str(c.get("status")) == "False": + return c.get("reason") or "ProgressingFalse" + # Fallback to top-level reason + return dep.get("reason") or "DeploymentNotReady" + + +def argocd_issue_type(app: Dict[str, Any]) -> str: + health = app.get("health") or "UnknownHealth" + sync = app.get("sync") or "UnknownSync" + phase = app.get("operation_phase") or "UnknownPhase" + return f"{health}::{sync}::{phase}" + + +def extract_blockers_from_argocd_message(message: str) -> List[str]: + blockers: List[str] = [] + if not message: + return blockers + for m in ARGOCD_WAIT_DEPLOY_RE.finditer(message): + blockers.append(f"Deployment/{m.group(1)}") + for m in ARGOCD_WAIT_APP_RE.finditer(message): + blockers.append(f"Application/{m.group(1)}") + return blockers + + +def extract_issue_records(diag: Dict[str, Any]) -> List[Dict[str, Any]]: + """ + Convert diagnostics summary into a list of normalized issue records: + { + domain: "pod" | "deploy" | "argocd" | "pvc", + component: "...", + issue_type: "...", + signature: "...", + cause_hints: [...], + blockers: [...], # only for argocd right now + raw: {...} + } + """ + summary = (diag or {}).get("summary", {}) or {} + issues: List[Dict[str, Any]] = [] + + # Pods with errors + for e in summary.get("pods_w_errors", []) or []: + ns = e.get("namespace") or "unknown" + pod = e.get("pod") or e.get("name") or "unknown" + workload = normalize_pod_to_workload(pod) + itype = e.get("reason") or e.get("status") or "PodError" + msg = _normalize_text(e.get("message") or 
"") + last_event = _normalize_text(e.get("last_event") or "") + hint_bits = [] + if itype: + hint_bits.append(itype) + if e.get("restart_count") is not None: + hint_bits.append(f"restarts={e.get('restart_count')}") + if msg: + hint_bits.append(f"msg={msg[:160]}") + if last_event: + hint_bits.append(f"event={last_event[:160]}") + + detail_hash = _short_hash(msg or last_event) + component = f"pod::{ns}::{workload}" + signature = f"{component}::{itype}::{detail_hash}" + + issues.append( + { + "domain": "pod", + "component": component, + "issue_type": itype, + "signature": signature, + "cause_hints": hint_bits, + "blockers": [], + "raw": e, + } + ) + + # Deployments not ready + for d in summary.get("deployments_not_ready", []) or []: + ns = d.get("namespace") or "unknown" + name = d.get("name") or d.get("deployment") or "unknown" + itype = deployment_issue_type(d) + component = f"deploy::{ns}::{name}" + signature = f"{component}::{itype}" + + hint_bits = [itype] + # Include condition message (normalized) if present for debugging (hashed into signature would be too noisy) + conditions = d.get("conditions") or [] + for c in conditions: + if c.get("type") == "Available" and str(c.get("status")) == "False": + cm = _normalize_text(c.get("message") or "") + if cm: + hint_bits.append(f"available_msg={cm[:160]}") + break + + issues.append( + { + "domain": "deploy", + "component": component, + "issue_type": itype, + "signature": signature, + "cause_hints": hint_bits, + "blockers": [], + "raw": d, + } + ) + + # PVC not bound + for p in summary.get("pvc_not_bound", []) or []: + ns = p.get("namespace") or "unknown" + name = p.get("name") or p.get("pvc") or "unknown" + itype = p.get("reason") or p.get("status") or "PVCNotBound" + component = f"pvc::{ns}::{name}" + signature = f"{component}::{itype}" + hint_bits = [itype] + msg = _normalize_text(p.get("message") or "") + if msg: + hint_bits.append(f"msg={msg[:160]}") + issues.append( + { + "domain": "pvc", + "component": component, 
+ "issue_type": itype, + "signature": signature, + "cause_hints": hint_bits, + "blockers": [], + "raw": p, + } + ) + + # ArgoCD unhealthy + for a in summary.get("argocd_apps_unhealthy", []) or []: + ns = a.get("namespace") or "unknown" + name = a.get("name") or "unknown" + itype = argocd_issue_type(a) + msg = _normalize_text(a.get("message") or "") + detail_hash = _short_hash(msg) + component = f"argocd::{ns}::{name}" + signature = f"{component}::{itype}::{detail_hash}" + + blockers = extract_blockers_from_argocd_message(a.get("message") or "") + hint_bits = [itype] + if a.get("out_of_sync_count") is not None: + hint_bits.append(f"out_of_sync_count={a.get('out_of_sync_count')}") + oos = a.get("out_of_sync_resources") or [] + if oos: + hint_bits.append(f"out_of_sync_examples={'; '.join(list(oos)[:3])}") + if blockers: + hint_bits.append(f"blockers={', '.join(blockers[:3])}") + if msg: + hint_bits.append(f"msg={msg[:160]}") + + issues.append( + { + "domain": "argocd", + "component": component, + "issue_type": itype, + "signature": signature, + "cause_hints": hint_bits, + "blockers": blockers, + "raw": a, + } + ) + + return issues + + +def analyze_convergence(artifacts_data: List[Dict[str, Any]]) -> Dict[str, Any]: + if not artifacts_data: + return { + "status": "no_data", + "message": "No diagnostics data available for analysis", + "total_runs": 0, + } + + artifacts_data.sort(key=lambda x: x.get("timestamp", "")) + + error_occurrences: Dict[str, List[Dict[str, Any]]] = defaultdict(list) + issue_type_counts = Counter() + component_counts = Counter() + blocker_counts = Counter() + issue_type_by_component = defaultdict(Counter) + + run_summaries: List[Dict[str, Any]] = [] + + for idx, artifact in enumerate(artifacts_data): + run_num = idx + 1 + diag = artifact.get("diagnostics", {}) + issues = extract_issue_records(diag) + + # Summary counts per domain + domain_counts = Counter([i["domain"] for i in issues]) + total_errors = len(issues) + + run_summaries.append( + { + 
"run": run_num, + "timestamp": artifact.get("timestamp"), + "workflow_run_id": artifact.get("workflow_run_id"), + "job_name": artifact.get("job_name"), + "job": artifact.get("job"), + "event": artifact.get("event"), + "total_errors": total_errors, + "pod_errors": domain_counts.get("pod", 0), + "deployment_errors": domain_counts.get("deploy", 0), + "argocd_errors": domain_counts.get("argocd", 0), + "pvc_errors": domain_counts.get("pvc", 0), + } + ) + + # Track issues + for issue in issues: + sig = issue["signature"] + error_occurrences[sig].append( + { + "run": run_num, + "timestamp": artifact.get("timestamp"), + "component": issue["component"], + "issue_type": issue["issue_type"], + "domain": issue["domain"], + "cause_hints": issue.get("cause_hints", []), + "blockers": issue.get("blockers", []), + } + ) + + issue_type_counts[issue["issue_type"]] += 1 + component_counts[issue["component"]] += 1 + issue_type_by_component[issue["component"]][issue["issue_type"]] += 1 + for b in issue.get("blockers", []) or []: + blocker_counts[b] += 1 + + persistent_errors = [] + transient_errors = [] + + total_runs = len(artifacts_data) + for sig, occurrences in error_occurrences.items(): + occurrence_rate = len(occurrences) / total_runs + + # compact examples for output + example = occurrences[0] + error_info = { + "signature": sig, + "domain": example.get("domain"), + "component": example.get("component"), + "issue_type": example.get("issue_type"), + "occurrences": len(occurrences), + "occurrence_rate": occurrence_rate, + "first_seen": occurrences[0]["timestamp"], + "last_seen": occurrences[-1]["timestamp"], + "runs_affected": [occ["run"] for occ in occurrences], + "example_cause_hints": example.get("cause_hints", [])[:5], + "example_blockers": example.get("blockers", [])[:5], + } + + if occurrence_rate > PERSISTENT_ERROR_THRESHOLD: + persistent_errors.append(error_info) + else: + transient_errors.append(error_info) + + # Trend calculation + if len(run_summaries) >= 2: + first_half 
def generate_markdown_report(convergence: Dict[str, Any]) -> str:
    """Render the convergence analysis as a Markdown report.

    Args:
        convergence: Result dict produced by ``analyze_convergence`` (or the
            ``no_data`` placeholder built in ``main``). Only ``status`` is
            required; the remaining keys are read when ``status == "success"``.

    Returns:
        The complete Markdown document as a single string. Every fragment
        appended to ``md`` already carries its own trailing newline(s), so the
        fragments are concatenated with ``"".join`` on every return path
        (previously the early-exit path used ``"\\n".join``, double-spacing
        that variant of the report).
    """
    md = ["# Diagnostics Convergence Analysis\n"]

    # Early exit for no_data / failed analyses: emit a minimal status report.
    if convergence.get("status") != "success":
        md.append(f"**Status:** {convergence.get('status')}\n")
        md.append(f"{convergence.get('message', 'Analysis incomplete')}\n")
        return "".join(md)

    md.append(f"**Analysis Date:** {datetime.now().isoformat()}\n")
    md.append(f"**Total Runs Analyzed:** {convergence['total_runs']}\n\n")

    # Overall convergence trend (improving / degrading / stable).
    trend = convergence["convergence_trend"]
    md.append("## Convergence Trend\n")
    md.append(f"**Overall Trend:** {trend['trend'].upper()}\n")
    md.append(f"- Average issues (first half): {trend['avg_errors_first_half']:.1f}\n")
    md.append(f"- Average issues (second half): {trend['avg_errors_second_half']:.1f}\n\n")

    # Top issue types across all runs.
    md.append("## Top Issue Types (Most Observed)\n")
    top_issue_types = convergence.get("top_issue_types", [])
    if top_issue_types:
        md.append("| Issue Type | Count |\n")
        md.append("|-----------|-------|\n")
        for row in top_issue_types[:15]:
            md.append(f"| {row['issue_type']} | {row['count']} |\n")
        md.append("\n")
    else:
        md.append("*No issue types detected* āœ…\n\n")

    # Top blockers (ArgoCD dependency hints).
    md.append("## Top Blockers / Dependencies (from ArgoCD messages)\n")
    top_blockers = convergence.get("top_blockers", [])
    if top_blockers:
        md.append("| Blocker | Count |\n")
        md.append("|--------|-------|\n")
        for row in top_blockers[:15]:
            md.append(f"| {row['blocker']} | {row['count']} |\n")
        md.append("\n")
    else:
        md.append("*No blockers detected* āœ…\n\n")

    # Persistent vs. transient split is driven by PERSISTENT_ERROR_THRESHOLD;
    # derive the percentage once and use it in BOTH section descriptions so
    # they cannot drift apart (the transient text previously hard-coded 50%).
    persistent = convergence["persistent_errors"]
    threshold_pct = int(PERSISTENT_ERROR_THRESHOLD * 100)
    md.append(f"## Persistent Patterns ({len(persistent)})\n")
    md.append(f"These signatures occur in >{threshold_pct}% of runs:\n\n")

    if persistent:
        for err in persistent[:10]:
            md.append(f"### `{err['signature']}`\n")
            md.append(f"- **Domain:** {err.get('domain')}\n")
            md.append(f"- **Component:** {err.get('component')}\n")
            md.append(f"- **Issue Type:** {err.get('issue_type')}\n")
            md.append(f"- **Occurrence Rate:** {err['occurrence_rate']*100:.1f}% ({err['occurrences']}/{convergence['total_runs']} runs)\n")
            md.append(f"- **First Seen:** {err['first_seen']}\n")
            md.append(f"- **Runs Affected:** {', '.join(map(str, err['runs_affected']))}\n")
            hints = err.get("example_cause_hints") or []
            if hints:
                md.append(f"- **Cause hints (example):** {', '.join(hints[:4])}\n")
            blockers = err.get("example_blockers") or []
            if blockers:
                md.append(f"- **Blockers (example):** {', '.join(blockers[:4])}\n")
            md.append("\n")
    else:
        md.append("*No persistent patterns detected* āœ…\n\n")

    # Transient errors: the complement of the persistent set.
    transient = convergence["transient_errors"]
    md.append(f"## Transient Patterns ({len(transient)})\n")
    md.append(f"These signatures occur sporadically (<={threshold_pct}% of runs):\n\n")

    if transient:
        for err in transient[:10]:
            md.append(f"- `{err['signature']}` ({err['occurrences']}x)\n")
    else:
        md.append("*No transient patterns detected* āœ…\n")
    md.append("\n")

    # Per-run summary table.
    md.append("## Run Summary\n\n")
    md.append("| Run | Timestamp | Artifact Dir | Job | Event | Total | Pod | Deploy | ArgoCD | PVC |\n")
    md.append("|-----|-----------|--------------|-----|-------|-------|-----|--------|--------|-----|\n")

    for run in convergence["run_summaries"]:
        # Truncate ISO timestamps to seconds precision for table compactness.
        ts = run["timestamp"][:19] if run.get("timestamp") else "N/A"
        md.append(
            f"| {run['run']} | {ts} | {run.get('job_name','')} | {run.get('job','')} | {run.get('event','')} | "
            f"{run['total_errors']} | {run['pod_errors']} | {run['deployment_errors']} | {run['argocd_errors']} | {run['pvc_errors']} |\n"
        )

    return "".join(md)
def process_artifacts(artifacts_dir: str) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Load every ``diagnostics_full_*.json`` found under ``artifacts_dir``.

    Args:
        artifacts_dir: Root directory containing one sub-directory per
            downloaded workflow artifact.

    Returns:
        ``(artifacts_data, manifest)`` — parallel lists with one entry per
        successfully parsed JSON file. ``artifacts_data`` carries the full
        diagnostics payload; ``manifest`` is a compact index of the same
        files. Unreadable files are skipped with a warning on stderr.
    """
    artifacts_path = Path(artifacts_dir)
    artifacts_data: List[Dict[str, Any]] = []
    manifest: List[Dict[str, Any]] = []

    # Sort for a deterministic, filesystem-independent ordering: downstream
    # analysis numbers runs and splits them into first/second half by list
    # order, so glob()'s arbitrary order would make the trend unstable.
    json_files = sorted(artifacts_path.glob("**/diagnostics_full_*.json"))
    print(f"Found {len(json_files)} diagnostics JSON files", file=sys.stderr)

    for json_file in json_files:
        try:
            # Diagnostics JSON is produced as UTF-8; do not depend on the
            # locale default encoding.
            with open(json_file, "r", encoding="utf-8") as f:
                diag_data = json.load(f)

            relative_path = json_file.relative_to(artifacts_path)
            parts = relative_path.parts

            # The artifact's top-level directory identifies the producing job.
            artifact_dir = parts[0] if len(parts) > 1 else "unknown"
            parsed = parse_job_from_artifact_dir(artifact_dir)

            artifact_info = {
                "job_name": artifact_dir,  # keep original artifact dir for traceability
                "job": parsed["job"],
                "event": parsed["event"],
                "timestamp": diag_data.get("ts", ""),
                "workflow_run_id": "unknown",  # TODO: wire from manifest builder if available
                "artifact_path": str(relative_path),
                "diagnostics": diag_data,
            }

            artifacts_data.append(artifact_info)

            manifest.append(
                {
                    "job_name": artifact_dir,
                    "job": parsed["job"],
                    "event": parsed["event"],
                    "timestamp": diag_data.get("ts", ""),
                    "artifact_path": str(relative_path),
                    "has_errors": diag_data.get("has_errors", False),
                }
            )

        except Exception as e:
            # Deliberately best-effort: a single corrupt artifact must not
            # abort the whole analysis run.
            print(f"Warning: Failed to process {json_file}: {e}", file=sys.stderr)

    return artifacts_data, manifest
def main() -> None:
    """CLI entry point.

    Parses ``artifacts_dir`` (required) and ``--output-dir`` (default: CWD),
    runs the convergence analysis and writes three outputs to the output
    directory: ``convergence.json``, ``convergence.md`` and ``manifest.json``.
    Exits with status 1 if the artifacts directory does not exist.
    """
    parser = argparse.ArgumentParser(description="Analyze diagnostics convergence across workflow runs")
    parser.add_argument("artifacts_dir", help="Directory containing extracted diagnostics artifacts")
    parser.add_argument("--output-dir", default=".", help="Directory to write outputs (default: current directory)")
    args = parser.parse_args()

    if not os.path.isdir(args.artifacts_dir):
        print(f"Error: Artifacts directory not found: {args.artifacts_dir}", file=sys.stderr)
        sys.exit(1)

    print(f"Processing artifacts from: {args.artifacts_dir}", file=sys.stderr)

    artifacts_data, manifest = process_artifacts(args.artifacts_dir)

    if not artifacts_data:
        # Keep the report pipeline alive even with zero inputs so callers
        # always get the three output files.
        convergence = {
            "status": "no_data",
            "message": "No diagnostics JSON files found in artifacts directory",
            "total_runs": 0,
        }
    else:
        convergence = analyze_convergence(artifacts_data)

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # encoding="utf-8" on every write: the reports contain non-ASCII
    # characters (āœ“, āœ…), and the locale default encoding (e.g. cp1252 on
    # Windows) would raise UnicodeEncodeError.
    with open(output_dir / "convergence.json", "w", encoding="utf-8") as f:
        json.dump(convergence, f, indent=2)
    print("āœ“ Written convergence.json", file=sys.stderr)

    markdown_report = generate_markdown_report(convergence)
    with open(output_dir / "convergence.md", "w", encoding="utf-8") as f:
        f.write(markdown_report)
    print("āœ“ Written convergence.md", file=sys.stderr)

    with open(output_dir / "manifest.json", "w", encoding="utf-8") as f:
        json.dump({"total_artifacts": len(manifest), "artifacts": manifest}, f, indent=2)
    print("āœ“ Written manifest.json", file=sys.stderr)

    print("\nāœ… Convergence analysis complete!", file=sys.stderr)
    print(f" Analyzed {len(artifacts_data)} diagnostics artifacts", file=sys.stderr)
    # .get() chain tolerates the no_data placeholder, which lacks the trend key.
    print(f" Trend: {convergence.get('convergence_trend', {}).get('trend', 'N/A')}", file=sys.stderr)


if __name__ == "__main__":
    main()