diff --git a/playbooks/mirror-images.yml b/playbooks/mirror-images.yml new file mode 100644 index 00000000..75ca7c39 --- /dev/null +++ b/playbooks/mirror-images.yml @@ -0,0 +1,49 @@ +--- +# Mirror container images to internal registry +# +# This playbook uses the container_image_mirror role to mirror images from +# source registries to a target internal registry. +# +# Prerequisites: +# - skopeo CLI tool installed on bastion +# - Authentication to source registries (via pull_secret or existing auth) +# - Target registry reachable +# +# Usage: +# ansible-playbook playbooks/mirror-images.yml \ +# -i inventories/ocp-deployment/build-inventory.py \ +# --extra-vars "images='[{\"source\":\"quay.io/org/image:tag\",\"dest\":\"namespace/image:tag\"}]'" +# +# # With pull secret +# ansible-playbook playbooks/mirror-images.yml \ +# -i inventories/ocp-deployment/build-inventory.py \ +# --extra-vars "images='[...]' pull_secret_string=$(cat ~/.docker/config.json | base64 -w0)" +# +# Required variables: +# images: List of image mappings with 'source' and 'dest' keys +# Example: [{"source": "quay.io/org/image:tag", "dest": "namespace/image:tag"}] +# +# Optional variables: +# registry_host: Target registry hostname (default: bastion FQDN) +# registry_port: Target registry port (default: 5000) +# registry_namespace: Namespace prefix for all dest images (default: none) +# dest_tls_verify: Verify TLS for dest registry (default: false) +# use_pull_secret: Enable pull secret authentication (default: false) +# pull_secret_string: Base64-encoded pull secret (only used if use_pull_secret=true) +# pull_secret_path: Path to write pull secret file (only used if use_pull_secret=true) +# +- name: Mirror container images to internal registry + hosts: bastion + gather_facts: true + roles: + - role: container_image_mirror + vars: + container_image_mirror_operation: mirror + container_image_mirror_images: "{{ images }}" + container_image_mirror_registry_host: "{{ registry_host | 
default(ansible_fqdn) }}" + container_image_mirror_registry_port: "{{ registry_port | default(5000) }}" + container_image_mirror_registry_namespace: "{{ registry_namespace | default('') }}" + container_image_mirror_dest_tls_verify: "{{ dest_tls_verify | default(false) }}" + container_image_mirror_use_pull_secret: "{{ use_pull_secret | default(false) }}" + container_image_mirror_pull_secret_string: "{{ pull_secret_string | default('') }}" + container_image_mirror_pull_secret_path: "{{ pull_secret_path | default('/tmp/.pull-secret-mirror.json') }}" diff --git a/playbooks/remove-images.yml b/playbooks/remove-images.yml new file mode 100644 index 00000000..f7de25d1 --- /dev/null +++ b/playbooks/remove-images.yml @@ -0,0 +1,38 @@ +--- +# Remove container images from internal registry +# +# This playbook uses the container_image_mirror role to remove images from +# an internal registry's storage. +# +# Prerequisites: +# - sudo access to registry storage path on bastion +# - Target registry reachable +# +# Usage: +# ansible-playbook playbooks/remove-images.yml \ +# -i inventories/ocp-deployment/build-inventory.py \ +# --extra-vars "images='[{\"dest\":\"namespace/image:tag\"}]'" +# +# # Remove multiple images +# ansible-playbook playbooks/remove-images.yml \ +# -i inventories/ocp-deployment/build-inventory.py \ +# --extra-vars 'images=[{"dest":"ran-test/old-image:v1"},{"dest":"ran-test/deprecated:latest"}]' +# +# Required variables: +# images: List of image mappings with 'dest' key +# Example: [{"dest": "namespace/image:tag"}] +# +# Optional variables: +# registry_namespace: Namespace prefix for all dest images (default: none) +# registry_data_path: Registry storage path (default: /home/kni/registry/data/docker/registry/v2/repositories) +# +- name: Remove container images from internal registry + hosts: bastion + gather_facts: true + roles: + - role: container_image_mirror + vars: + container_image_mirror_operation: remove + container_image_mirror_images: "{{ images }}" + 
container_image_mirror_registry_namespace: "{{ registry_namespace | default('') }}" + container_image_mirror_registry_data_path: "{{ registry_data_path | default('/home/kni/registry/data/docker/registry/v2/repositories') }}" diff --git a/playbooks/roles/container_image_mirror/README.md b/playbooks/roles/container_image_mirror/README.md new file mode 100644 index 00000000..40f16bb0 --- /dev/null +++ b/playbooks/roles/container_image_mirror/README.md @@ -0,0 +1,309 @@ +# Container Image Mirror Ansible Role + +## Disclaimer +This role is provided as-is, without guarantees of support or maintenance. +Use at your own discretion. + +## Overview +The `container_image_mirror` role provides generic container image mirroring and removal capabilities for internal registries. It uses `skopeo` to: +- **Mirror** container images from source registries to a target internal registry +- **Remove** container images from an internal registry + +This role is designed to work with any container registry and supports both connected and disconnected (air-gapped) environments. 
+ +## Features +- Generic image mirroring using `skopeo copy` +- Image removal from registry storage +- Pull secret support for private registries +- Configurable TLS verification +- Detailed success/failure reporting with summary +- Idempotent operations with existence checks +- Continues mirroring all images even if some fail +- Per-image success/failure tracking + +## Requirements +- Ansible 2.9+ +- Tools available on the target host: + - `skopeo` (for mirroring operations) +- For removal operations: + - `sudo` access to registry storage path +- Target registry must be reachable +- Pull secrets for source registries (if private) + +## Role Variables + +### Required Variables + +- **`container_image_mirror_images`** (list): List of image mappings + - For mirror operation: `[{"source": "quay.io/org/image:tag", "dest": "namespace/image:tag"}]` + - For remove operation: `[{"dest": "namespace/image:tag"}]` + - No default (must be provided) + +### Optional Variables + +- **`container_image_mirror_operation`** (string): Operation mode + - Values: `mirror` or `remove` + - Default: `mirror` + +- **`container_image_mirror_registry_host`** (string): Target registry hostname + - Default: `{{ ansible_fqdn }}` + +- **`container_image_mirror_registry_port`** (int): Target registry port + - Default: `5000` + +- **`container_image_mirror_registry_namespace`** (string): Namespace prefix for destination images + - Default: `""` (empty, no prefix) + - Example: `ran-test/` would prefix all destination images + +- **`container_image_mirror_dest_tls_verify`** (bool): Verify TLS for destination registry + - Default: `false` + +- **`container_image_mirror_use_pull_secret`** (bool): Enable pull secret file authentication + - Default: `false` + - When `false`: skopeo uses system authentication from `/etc/containers/auth.json` + - When `true`: skopeo uses pull secret file via `--authfile` parameter + +- **`container_image_mirror_pull_secret_string`** (string): Base64-encoded pull secret 
JSON + - Default: `""` (empty, uses existing auth) + - Format: Base64-encoded Docker config JSON + - Only used when `container_image_mirror_use_pull_secret=true` + +- **`container_image_mirror_pull_secret_path`** (string): Path to store pull secret + - Default: `/tmp/.pull-secret-mirror.json` + - Only used when `container_image_mirror_use_pull_secret=true` + +- **`container_image_mirror_registry_data_path`** (string): Registry storage path (for removal) + - Default: `/home/kni/registry/data/docker/registry/v2/repositories` + +## Dependencies +None. + +## Authentication Methods + +The role supports two authentication methods: + +### System Authentication (Recommended) + +When `use_pull_secret=false` (default), skopeo uses system authentication from `/etc/containers/auth.json`. This is the recommended approach when: +- Source registries are public +- Authentication has been configured beforehand (e.g., via `podman login` or `authWithQuay()` in Jenkins) +- Both source and destination credentials are in the system auth file + +### Pull Secret File Authentication + +When `use_pull_secret=true`, skopeo uses a dedicated pull secret file via the `--authfile` parameter. Use this when: +- You need to use specific credentials different from system auth +- Working with private registries requiring explicit authentication +- Credentials should not persist in system auth + +**Important**: When using pull secrets, the content is never logged and the file is automatically cleaned up after use. 
+ +## Example Playbooks + +### Mirror Images with System Authentication (Recommended) + +```yaml +--- +- name: Mirror RAN test images to internal registry + hosts: bastion + gather_facts: true + roles: + - role: container_image_mirror + vars: + container_image_mirror_operation: mirror + container_image_mirror_registry_host: disconnected.registry.local + container_image_mirror_registry_port: 5000 + container_image_mirror_registry_namespace: ran-test/ + container_image_mirror_images: + - source: quay.io/telcov10n-ci/oslat:latest + dest: oslat:latest + - source: quay.io/telcov10n-ci/cyclictest:latest + dest: cyclictest:latest + - source: quay.io/telcov10n-ci/cnf-tests:4.8 + dest: cnf-tests:4.8 +``` + +### Mirror Images with Pull Secret + +```yaml +--- +- name: Mirror private images to internal registry + hosts: bastion + gather_facts: true + roles: + - role: container_image_mirror + vars: + container_image_mirror_operation: mirror + container_image_mirror_registry_host: registry.example.com + container_image_mirror_use_pull_secret: true + container_image_mirror_pull_secret_string: "{{ pull_secret }}" + container_image_mirror_images: + - source: quay.io/private-org/image:tag + dest: namespace/image:tag +``` + +### Remove Images Example + +```yaml +--- +- name: Remove old RAN test images from internal registry + hosts: bastion + gather_facts: true + roles: + - role: container_image_mirror + vars: + container_image_mirror_operation: remove + container_image_mirror_registry_namespace: ran-test/ + container_image_mirror_images: + - dest: oslat:old-version + - dest: cyclictest:deprecated +``` + +## Usage + +### Via Playbook + +Create a playbook that includes the role: + +```yaml +--- +- name: Mirror container images + hosts: bastion + gather_facts: true + vars: + images_to_mirror: + - source: quay.io/org/app:v1.0 + dest: apps/app:v1.0 + roles: + - role: container_image_mirror + vars: + container_image_mirror_operation: mirror + container_image_mirror_images: "{{ 
images_to_mirror }}" +``` + +### Command Line + +```bash +ansible-playbook mirror-images-playbook.yml \ + -i inventories/ocp-deployment/build-inventory.py \ + --extra-vars "container_image_mirror_pull_secret_string=$(cat ~/.docker/config.json | base64 -w0)" +``` + +## Output Examples + +### Successful Mirror + +``` +TASK [container_image_mirror : Display detailed mirror results] +ok: [bastion] => { + "msg": [ + "==========================================", + "MIRROR SUMMARY", + "==========================================", + "Total images: 3", + "Successfully mirrored: 3", + "Failed: 0", + "", + "SUCCESSFUL:", + " ✓ oslat:latest", + " ✓ cyclictest:latest", + " ✓ cnf-tests:4.8", + "", + "FAILED:", + " (none)", + "==========================================" + ] +} +``` + +### Partial Failure + +``` +TASK [container_image_mirror : Display detailed mirror results] +ok: [bastion] => { + "msg": [ + "==========================================", + "MIRROR SUMMARY", + "==========================================", + "Total images: 3", + "Successfully mirrored: 2", + "Failed: 1", + "", + "SUCCESSFUL:", + " ✓ oslat:latest", + " ✓ cyclictest:latest", + "", + "FAILED:", + " ✗ cnf-tests:4.8", + "==========================================" + ] +} + +TASK [container_image_mirror : Fail if any images failed to mirror] +fatal: [bastion]: FAILED! => { + "msg": "1 image(s) failed to mirror. See summary above for details." 
+} +``` + +## Use Cases + +### Telco RAN Test Image Mirroring +Mirror test images from quay.io to bastion registries for spoke cluster testing: +```yaml +container_image_mirror_images: + - {source: "quay.io/telcov10n-ci/oslat:latest", dest: "ran-test/oslat:latest"} + - {source: "quay.io/telcov10n-ci/cyclictest:latest", dest: "ran-test/cyclictest:latest"} + - {source: "quay.io/telcov10n-ci/stress-ng:latest", dest: "ran-test/stress-ng:latest"} +``` + +### Disconnected Environment Preparation +Mirror images to internal registry before deploying in air-gapped environment: +```yaml +container_image_mirror_operation: mirror +container_image_mirror_registry_host: registry.internal.corp +container_image_mirror_images: + - {source: "quay.io/app/image:v1", dest: "production/app:v1"} +``` + +### Registry Cleanup +Remove old or deprecated images from internal registry: +```yaml +container_image_mirror_operation: remove +container_image_mirror_images: + - {dest: "deprecated/old-app:v1"} + - {dest: "test/temp-image:latest"} +``` + +## Notes + +- **Continues on error**: The role continues mirroring all images even if some fail, then reports detailed results at the end +- **Pull secrets**: The role cleans up pull secrets after use for security +- **TLS verification**: Disabled by default for internal registries with self-signed certificates +- **Idempotency**: Checks for existing images before mirroring (though still attempts to mirror for freshness) +- **Namespace handling**: The `container_image_mirror_registry_namespace` is prepended to all destination images + +## Troubleshooting + +### Skopeo authentication errors +Ensure `container_image_mirror_pull_secret_string` contains valid credentials: +```bash +cat ~/.docker/config.json | base64 -w0 +``` + +### TLS verification errors +If using self-signed certificates, ensure `container_image_mirror_dest_tls_verify: false` + +### Permission errors during removal +Ensure the ansible user has sudo access to the registry storage path + 
+### Image not found errors +Verify source image exists and is accessible: +```bash +skopeo inspect docker://quay.io/org/image:tag +``` + +## License +Apache-2.0 + +## Author Information +Telco Verification Team - Red Hat diff --git a/playbooks/roles/container_image_mirror/defaults/main.yaml b/playbooks/roles/container_image_mirror/defaults/main.yaml new file mode 100644 index 00000000..74ae107c --- /dev/null +++ b/playbooks/roles/container_image_mirror/defaults/main.yaml @@ -0,0 +1,26 @@ +--- +# Default variables for container_image_mirror role + +# Operation mode: 'mirror' or 'remove' +container_image_mirror_operation: mirror + +# Target registry configuration +container_image_mirror_registry_host: "{{ ansible_fqdn }}" +container_image_mirror_registry_port: 5000 +container_image_mirror_registry_namespace: "" + +# TLS verification for destination registry +container_image_mirror_dest_tls_verify: false + +# Pull secret configuration +container_image_mirror_use_pull_secret: false +container_image_mirror_pull_secret_path: /tmp/.pull-secret-mirror.json +container_image_mirror_pull_secret_string: "" + +# Registry storage path (for removal operations) +container_image_mirror_registry_data_path: /home/kni/registry/data/docker/registry/v2/repositories + +# List of images to mirror or remove +# Format: [{"source": "quay.io/org/image:tag", "dest": "namespace/image:tag"}] +# For removal, only "dest" is required +container_image_mirror_images: [] diff --git a/playbooks/roles/container_image_mirror/meta/main.yaml b/playbooks/roles/container_image_mirror/meta/main.yaml new file mode 100644 index 00000000..4202f0bb --- /dev/null +++ b/playbooks/roles/container_image_mirror/meta/main.yaml @@ -0,0 +1,19 @@ +--- +galaxy_info: + role_name: container_image_mirror + author: Telco Verification Team + description: Mirror or remove container images from/to internal registries using skopeo + license: Apache-2.0 + min_ansible_version: "2.9" + platforms: + - name: EL + versions: + - "8" + - 
"9" + galaxy_tags: + - containers + - skopeo + - registry + - mirror + +dependencies: [] diff --git a/playbooks/roles/container_image_mirror/tasks/main.yaml b/playbooks/roles/container_image_mirror/tasks/main.yaml new file mode 100644 index 00000000..da2d5255 --- /dev/null +++ b/playbooks/roles/container_image_mirror/tasks/main.yaml @@ -0,0 +1,25 @@ +--- +- name: Validate required variables + ansible.builtin.assert: + that: + - container_image_mirror_images is defined + - container_image_mirror_images | length > 0 + - container_image_mirror_operation in ['mirror', 'remove'] + fail_msg: >- + container_image_mirror_images must be defined as a non-empty list, and + container_image_mirror_operation must be 'mirror' or 'remove' + +- name: Display operation configuration + ansible.builtin.debug: + msg: + - "Operation: {{ container_image_mirror_operation | upper }}" + - "Target Registry: {{ container_image_mirror_registry_host }}:{{ container_image_mirror_registry_port }}" + - "Images: {{ container_image_mirror_images | length }}" + +- name: Execute mirror operation + ansible.builtin.include_tasks: mirror.yaml + when: container_image_mirror_operation == 'mirror' + +- name: Execute remove operation + ansible.builtin.include_tasks: remove.yaml + when: container_image_mirror_operation == 'remove' diff --git a/playbooks/roles/container_image_mirror/tasks/mirror.yaml b/playbooks/roles/container_image_mirror/tasks/mirror.yaml new file mode 100644 index 00000000..d0f6ef97 --- /dev/null +++ b/playbooks/roles/container_image_mirror/tasks/mirror.yaml @@ -0,0 +1,80 @@ +--- +- name: Setup pull secret for mirroring + when: container_image_mirror_use_pull_secret | bool + block: + - name: Ensure pull secret directory exists + ansible.builtin.file: + path: "{{ container_image_mirror_pull_secret_path | dirname }}" + state: directory + mode: "0700" + when: (container_image_mirror_pull_secret_path | dirname) not in ['/tmp', '/'] + + - name: Populate pull secret + ansible.builtin.copy: + 
content: "{{ container_image_mirror_pull_secret_string | b64decode | from_json | to_nice_json }}" + dest: "{{ container_image_mirror_pull_secret_path }}" + mode: "0600" + no_log: true + +- name: Check if images already exist in registry + ansible.builtin.uri: + url: "https://{{ container_image_mirror_registry_host }}:{{ container_image_mirror_registry_port }}/v2/{{ container_image_mirror_registry_namespace }}{{ image_item.dest | regex_replace(':.*', '') }}/tags/list" + method: GET + validate_certs: false + status_code: [200, 404, 401] # 401 = registry requires auth for tag listing + register: _image_check + loop: "{{ container_image_mirror_images }}" + loop_control: + loop_var: image_item + label: "{{ image_item.dest }}" + failed_when: false + +- name: Mirror images using skopeo + ansible.builtin.command: + cmd: >- + skopeo copy --all + {{ '--authfile ' + container_image_mirror_pull_secret_path if container_image_mirror_use_pull_secret | bool else '' }} + {{ '--dest-tls-verify=false' if not container_image_mirror_dest_tls_verify else '' }} + docker://{{ image_item.source }} + docker://{{ container_image_mirror_registry_host }}:{{ container_image_mirror_registry_port }}/{{ container_image_mirror_registry_namespace }}{{ image_item.dest }} + loop: "{{ container_image_mirror_images }}" + loop_control: + loop_var: image_item + label: "{{ image_item.source }} -> {{ container_image_mirror_registry_namespace }}{{ image_item.dest }}" + changed_when: true + register: _mirror_result + failed_when: false + ignore_errors: true + +- name: Build mirror summary + ansible.builtin.set_fact: + _mirror_success: "{{ _mirror_result.results | selectattr('rc', 'eq', 0) | list }}" + _mirror_failed: "{{ _mirror_result.results | selectattr('rc', 'ne', 0) | list }}" + +- name: Display detailed mirror results + ansible.builtin.debug: + msg: + - "==========================================" + - "MIRROR SUMMARY" + - "==========================================" + - "Total images: {{ 
container_image_mirror_images | length }}" + - "Successfully mirrored: {{ _mirror_success | length }}" + - "Failed: {{ _mirror_failed | length }}" + - "" + - "SUCCESSFUL:" + - "{{ _mirror_success | map(attribute='image_item') | map(attribute='dest') | list | join('\n ✓ ') | indent(2, first=True) }}" + - "" + - "FAILED:" + - "{{ _mirror_failed | map(attribute='image_item') | map(attribute='dest') | list | join('\n ✗ ') | indent(2, first=True) if _mirror_failed | length > 0 else ' (none)' }}" + - "==========================================" + +- name: Cleanup pull secret + ansible.builtin.file: + path: "{{ container_image_mirror_pull_secret_path }}" + state: absent + when: container_image_mirror_use_pull_secret | bool + +- name: Fail if any images failed to mirror + ansible.builtin.fail: + msg: "{{ _mirror_failed | length }} image(s) failed to mirror. See summary above for details." + when: _mirror_failed | length > 0 diff --git a/playbooks/roles/container_image_mirror/tasks/remove.yaml b/playbooks/roles/container_image_mirror/tasks/remove.yaml new file mode 100644 index 00000000..5ed8a93b --- /dev/null +++ b/playbooks/roles/container_image_mirror/tasks/remove.yaml @@ -0,0 +1,40 @@ +--- +- name: Remove image repositories from registry storage + ansible.builtin.file: + path: "{{ container_image_mirror_registry_data_path }}/{{ container_image_mirror_registry_namespace }}{{ image_item.dest | regex_replace(':.*', '') }}" + state: absent + become: true + loop: "{{ container_image_mirror_images }}" + loop_control: + loop_var: image_item + label: "{{ container_image_mirror_registry_namespace }}{{ image_item.dest }}" + register: _remove_result + failed_when: false + ignore_errors: true + +- name: Build removal summary + ansible.builtin.set_fact: + _remove_success: "{{ _remove_result.results | selectattr('changed', 'eq', true) | list }}" + _remove_failed: "{{ _remove_result.results | selectattr('failed', 'eq', true) | default([]) | list }}" + +- name: Display detailed removal 
results + ansible.builtin.debug: + msg: + - "==========================================" + - "REMOVAL SUMMARY" + - "==========================================" + - "Total images: {{ container_image_mirror_images | length }}" + - "Successfully removed: {{ _remove_success | length }}" + - "Failed: {{ _remove_failed | length }}" + - "" + - "REMOVED:" + - "{{ _remove_success | map(attribute='image_item') | map(attribute='dest') | list | join('\n ✓ ') | indent(2, first=True) }}" + - "" + - "FAILED:" + - "{{ _remove_failed | map(attribute='image_item') | map(attribute='dest') | list | join('\n ✗ ') | indent(2, first=True) if _remove_failed | length > 0 else ' (none)' }}" + - "==========================================" + +- name: Fail if any images failed to remove + ansible.builtin.fail: + msg: "{{ _remove_failed | length }} image(s) failed to remove. See summary above for details." + when: _remove_failed | length > 0 diff --git a/playbooks/telco-kpis/README.md b/playbooks/telco-kpis/README.md new file mode 100644 index 00000000..067cb684 --- /dev/null +++ b/playbooks/telco-kpis/README.md @@ -0,0 +1,490 @@ +# Telco-KPIs Testing Framework + +Comprehensive Ansible automation framework for validating Telco Key Performance Indicators (KPIs) on OpenShift Edge deployments. This framework provides end-to-end testing capabilities including performance validation, deployment time tracking, hardware metadata collection, and automated report generation. + +## Overview + +The Telco-KPIs framework tests critical performance and deployment metrics for Red Hat OpenShift Edge clusters. Tests are executed on bastion hosts and generate JUnit XML reports for CI/CD integration plus comprehensive Markdown reports published to Gitea. 
+ +## Available Tests + +### collect-node-info.yml +**Purpose**: Collects hardware metadata from spoke cluster nodes + +**What it collects**: +- CPU model, cores, and architecture +- BIOS version and vendor +- Firmware versions +- NIC (Network Interface Card) details and drivers +- Microcode versions +- System manufacturer and product information + +**Output**: `node-info-{spoke}.json` in shared artifacts directory + +**Importance**: Must run before other tests - acts as baseline timestamp for test freshness filtering in report generation. + +**Usage**: +```bash +ansible-playbook collect-node-info.yml \ + -e spoke_cluster=spree-02 \ + -e spoke_kubeconfig=/path/to/kubeconfig +``` + +### run-test.yml +**Purpose**: Executes performance tests on spoke cluster + +**Available Tests**: +- **oslat**: OS latency validation +- **ptp**: Precision Time Protocol synchronization +- **cyclictest**: Real-time kernel latency testing +- **reboot**: Node reboot resilience testing +- **cpu_util**: CPU utilization validation + +**Parameters**: +- `spoke_cluster`: Spoke cluster name (required) +- `test_name`: Test type to run (required) +- `test_duration`: Test duration in seconds (optional) +- `spoke_kubeconfig`: Path to spoke cluster kubeconfig (required) + +**Output**: Timestamped directory `{test_name}-{spoke}-{YYYYMMDD-HHMMSS}` with JUnit XML reports + +**Usage**: +```bash +ansible-playbook run-test.yml \ + -e spoke_cluster=spree-02 \ + -e test_name=oslat \ + -e test_duration=600 \ + -e spoke_kubeconfig=/path/to/kubeconfig +``` + +### run-bios-validation.yml +**Purpose**: Validates BIOS settings across cluster nodes + +Ensures BIOS configurations meet required specifications for Telco workloads. + +### run-rds-compare.yml +**Purpose**: Compares RDS (Reference Deployment Specification) metrics + +Validates deployment against reference specifications. 
+ +### ztp-ai-deployment-time.yml +**Purpose**: Validates ZTP AI deployment time against threshold + +**What it validates**: +- ZTP (Zero Touch Provisioning) deployment completes within acceptable time +- Default threshold: 2h0m (120 minutes) for Assisted Installer deployments +- Test PASSES if deployment duration ≤ threshold, FAILS otherwise + +**How it works**: +1. Queries ACM (Advanced Cluster Management) resources on hub cluster using `kubernetes.core.k8s_info` +2. Uses `ztp_deployment_timeline` Ansible role to track deployment milestones +3. Measures from ClusterInstance creation to TALM ClusterGroupUpgrade completion +4. Generates comprehensive timeline analysis with milestone breakdown + +**Parameters**: +- `spoke_cluster`: Spoke cluster name (required) +- `hub_cluster`: Hub cluster name (required) +- `hub_kubeconfig`: Path to hub cluster kubeconfig on bastion (required) +- `threshold_duration`: Maximum acceptable deployment time (default: "2h0m") + +**Deployment Methods Supported**: +- **AI (Assisted Installer)**: Default - uses AgentClusterInstall resources +- **IBI (Image-based Install)**: Automatically detected via ImageBasedInstall resource + +**Key Milestones Tracked**: +- ArgoCD Application Created (if present) +- ClusterInstance Created (SiteConfig v2 operator) +- GitOps Sync (ManagedCluster Created) +- AgentClusterInstall Created +- Discovery ISO Ready +- Agent Registered +- Agent Bound to Cluster +- Installation Started +- Installation Completed +- Import to ACM Started +- TALM CGU Completed (Ready for Workloads) + +**Artifacts Generated**: +- `deployment-timeline-summary.txt`: Human-readable summary with milestone breakdown, timestamps, durations, and deltas +- `deployment-timeline.json`: Raw timeline events in JSON format (complete event history) +- `junit_ztp-ai-deployment-time.xml`: JUnit XML test result for CI/CD integration + +**Example Summary Output**: +``` +====================================================================== +ZTP 
Deployment Timeline Summary +====================================================================== +Hub Cluster: kni-qe-71 +Bastion Host: bastion-hostname +Spoke Cluster: spree-02 + +Deployment Features: + - ArgoCD Application Starting Point: Present + - ClusterInstance Tracking: Present + - TALM CGU Completion: Present + - ztp-done Label: Present + +Total Events Captured: 47 + +====================================================================== +KEY MILESTONES +====================================================================== +1. ClusterInstance Created | 2026-05-01T14:30:00Z | 0h0m0s | START +2. GitOps Sync (ManagedCluster Created) | 2026-05-01T14:32:15Z | 0h2m15s | +0h2m15s +3. AgentClusterInstall Created | 2026-05-01T14:33:00Z | 0h3m0s | +0h0m45s +... +11. TALM CGU Completed (Ready for Workloads) | 2026-05-01T16:15:30Z | 1h45m30s | +0h5m15s + +====================================================================== +WORKLOAD READINESS STATUS +====================================================================== +✅ Cluster ready for workloads since: 2026-05-01T16:15:30Z (since 0h15m30s) + +====================================================================== +DEPLOYMENT SUMMARY +====================================================================== +🚀 The deployment took 1h45m30s from ClusterInstance CR creation to TALM CGU Completed (Ready for Workloads) +``` + +**Usage**: +```bash +ansible-playbook ztp-ai-deployment-time.yml \ + -e spoke_cluster=spree-02 \ + -e hub_cluster=kni-qe-71 \ + -e hub_kubeconfig=/home/telcov10n/project/generated/kni-qe-71/auth/kubeconfig \ + -e threshold_duration=2h0m +``` + +**Test Result**: +- **PASS**: Deployment duration ≤ threshold +- **FAIL**: Deployment duration > threshold (JUnit XML contains detailed failure information) + +**Integration with Report Generator**: +- Automatically included in comprehensive Markdown reports +- Timeline section appears before "Report Metadata" with expandable details +- Test Summary 
table includes ZTP_AI_DEPLOYMENT_TIME entry with pass/fail status +- JSON timeline file linked for detailed analysis + +### generate-report.yml +**Purpose**: Generates comprehensive Markdown reports from all test artifacts + +**What it does**: +1. Validates shared artifact directory exists +2. Checks for node-info JSON (hardware metadata baseline) +3. Filters test directories based on node-info timestamp (excludes stale tests from old environment configs) +4. Implements freshness check - skips generation if no new tests since last report +5. Runs `analyze-podman-test-results.py` in `telco-kpis-test-runner` container +6. Integrates ZTP deployment timeline (if available) +7. Compresses all artifacts into tarball +8. Fetches report and tarball to local artifacts directory +9. Publishes to Gitea repository (when `DEVELOPMENT_MODE=true`) + +**Parameters**: +- `spoke_cluster`: Spoke cluster to generate report for (required) +- `test_filter`: Filter to specific tests (optional, comma-separated: oslat,ptp,cyclictest,reboot,cpu_util,ztp-ai-deployment-time) +- `output_filename`: Custom report filename (optional, auto-generated if empty: `telco-kpis-report-{spoke}-{timestamp}.md`) +- `timestamp`: UTC timestamp for report generation (optional, auto-generated if empty) + +**Test Freshness Logic**: +- Reads `collected_at` timestamp from `node-info-{spoke}.json` +- Compares each test directory timestamp against node-info timestamp +- **Includes** tests with timestamp ≥ node-info timestamp (tests ran after environment update) +- **Excludes** tests with timestamp < node-info timestamp (tests ran before environment update) +- **Deletes** excluded test data from bastion to save disk space +- Displays filtering summary: tests included vs. 
tests excluded + +**Report Action**: +- **NEW REPORT**: Created when tests are excluded (environment configuration changed) +- **UPDATE REPORT**: Created when no tests excluded (environment stable, adding new test results) + +**Freshness Check**: +- Clones Gitea repository to check last report commit timestamp +- Compares test artifact timestamps against last report timestamp +- **Skips generation** if no test artifacts are newer than last report (avoids duplicate reports) +- **Generates report** if new tests detected or no previous report exists + +**ZTP Deployment Timeline Integration**: +- Finds latest `ztp-ai-deployment-time-{spoke}-{YYYYMMDD-HHMMSS}` directory +- Reads `deployment-timeline-summary.txt` and `deployment-timeline.json` +- Inserts timeline section BEFORE "Report Metadata" with expandable details +- Adds ZTP_AI_DEPLOYMENT_TIME entry to Test Summary table +- Copies JSON file to report artifacts with relative link +- Removes unwanted metadata lines (Script, Data Source) + +**Output Files**: +- `{output_filename}`: Markdown report (default: `telco-kpis-report-{spoke}-{timestamp}.md`) +- `{spoke}-artifacts-{timestamp}.tar.gz`: Compressed tarball of all source artifacts +- Saved to: `{{ lookup('env', 'ARTIFACT_DIR') | default('/artifacts', true) }}/reports/` + +**Shared Artifact Directory Structure**: +``` +/home/telcov10n/telco-kpis-artifacts/{spoke}/ +├── node-info-{spoke}.json # Hardware metadata (baseline) +├── oslat-{spoke}-{YYYYMMDD-HHMMSS}/ # Performance test results +├── ptp-{spoke}-{YYYYMMDD-HHMMSS}/ +├── cyclictest-{spoke}-{YYYYMMDD-HHMMSS}/ +├── ztp-ai-deployment-time-{spoke}-{YYYYMMDD-HHMMSS}/ # Deployment timeline +│ ├── deployment-timeline-summary.txt +│ ├── deployment-timeline.json +│ └── junit_ztp-ai-deployment-time.xml +└── ... 
+``` + +**Usage**: +```bash +ansible-playbook generate-report.yml \ + -e spoke_cluster=spree-02 \ + -e test_filter=oslat,ptp,cyclictest,ztp-ai-deployment-time \ + -e output_filename=telco-kpis-report-spree-02-20260504-120000.md +``` + +**Task File**: Shared task file `tasks/generate-report.yml` imported by main playbook + +## Roles + +### ztp_deployment_timeline +**Location**: `playbooks/roles/ztp_deployment_timeline/` + +**Purpose**: Tracks ZTP deployment timeline from ClusterInstance creation to TALM CGU completion + +**How it works**: +1. Queries ACM resources on hub cluster using `kubernetes.core.k8s_info` module +2. Validates spoke cluster exists (ManagedCluster resource) +3. Validates ClusterInstance exists (SiteConfig v2 deployment) +4. Validates TALM ClusterGroupUpgrade exists and has completed +5. Determines deployment method (AI vs IBI) +6. Extracts timeline events from resource status conditions and timestamps +7. Calculates deployment duration +8. Generates detailed human-readable summary (when `generate_detailed_summary: true`) + +**Resources Queried**: +- ClusterInstance (siteconfig.open-cluster-management.io/v1alpha1) +- ManagedCluster (cluster.open-cluster-management.io/v1) +- AgentClusterInstall (extensions.hive.openshift.io/v1beta1) +- ImageBasedInstall (extensions.hive.openshift.io/v1alpha1) +- TALM ClusterGroupUpgrade (ran.openshift.io/v1alpha1) + +**Facts Exported**: +- `ztp_deployment_timeline_success`: Boolean - collection succeeded +- `ztp_deployment_timeline_events`: List of timeline events with timestamps and milestones +- `ztp_deployment_timeline_start_time`: ClusterInstance creation timestamp +- `ztp_deployment_timeline_end_time`: TALM CGU completion timestamp +- `ztp_deployment_timeline_duration_seconds`: Deployment duration in seconds +- `ztp_deployment_timeline_duration_formatted`: Formatted duration (e.g., "1h45m30s") +- `ztp_deployment_timeline_deployment_method`: "AI" or "IBI" +- `ztp_deployment_timeline_detailed_summary`: 
Human-readable summary (when `generate_detailed_summary: true`) + +**Usage**: +```yaml +- name: Collect ZTP deployment timeline + ansible.builtin.include_role: + name: ztp_deployment_timeline + vars: + spoke_cluster: "spree-02" + hub_kubeconfig: "/home/telcov10n/project/generated/kni-qe-71/auth/kubeconfig" + hub_cluster: "kni-qe-71" + generate_detailed_summary: true +``` + +### gitea +**Location**: `playbooks/telco-kpis/roles/gitea/` + +**Purpose**: Manages Gitea repository deployment and report publishing + +See `roles/gitea/README.md` for detailed documentation. + +## Artifact Directory Pattern + +All tests follow a consistent artifact directory pattern for compatibility with the report generator: + +**Location**: `/home/telcov10n/telco-kpis-artifacts/{spoke}/` on bastion host + +**Directory Naming**: `{test-name}-{spoke}-{YYYYMMDD-HHMMSS}` (UTC timestamp) + +**Why UTC timestamps?** +- Ensures correct chronological ordering across different bastion timezones +- Enables accurate freshness comparison in report generator +- Prevents EDT/EST timestamp collisions during DST transitions + +**Shared Artifacts Directory Structure**: +``` +/home/telcov10n/telco-kpis-artifacts/ +├── spree-02/ +│ ├── node-info-spree-02.json # Hardware metadata baseline +│ ├── oslat-spree-02-20260501-143000/ # Test results +│ │ ├── junit_oslat.xml +│ │ └── oslat-results.json +│ ├── ptp-spree-02-20260501-144500/ +│ ├── cyclictest-spree-02-20260501-150000/ +│ ├── ztp-ai-deployment-time-spree-02-20260501-161530/ # Deployment timeline +│ │ ├── deployment-timeline-summary.txt +│ │ ├── deployment-timeline.json +│ │ └── junit_ztp-ai-deployment-time.xml +│ └── ... +└── spree-03/ + └── ... +``` + +## Typical Testing Workflow + +1. **Deploy ZTP cluster** (prerequisite - cluster must exist and be managed by ACM) + +2. 
**Run ZTP deployment time validation** (requires hub cluster kubeconfig): + ```bash + ansible-playbook ztp-ai-deployment-time.yml \ + -e spoke_cluster=spree-02 \ + -e hub_cluster=kni-qe-71 \ + -e hub_kubeconfig=/home/telcov10n/project/generated/kni-qe-71/auth/kubeconfig \ + -e threshold_duration=2h0m + ``` + +3. **Collect hardware metadata** (establishes freshness baseline): + ```bash + ansible-playbook collect-node-info.yml \ + -e spoke_cluster=spree-02 \ + -e spoke_kubeconfig=/tmp/spree-02-kubeconfig + ``` + +4. **Run performance tests**: + ```bash + # OSLAT test + ansible-playbook run-test.yml \ + -e spoke_cluster=spree-02 \ + -e test_name=oslat \ + -e test_duration=600 \ + -e spoke_kubeconfig=/tmp/spree-02-kubeconfig + + # PTP test + ansible-playbook run-test.yml \ + -e spoke_cluster=spree-02 \ + -e test_name=ptp \ + -e spoke_kubeconfig=/tmp/spree-02-kubeconfig + + # Cyclictest + ansible-playbook run-test.yml \ + -e spoke_cluster=spree-02 \ + -e test_name=cyclictest \ + -e test_duration=600 \ + -e spoke_kubeconfig=/tmp/spree-02-kubeconfig + ``` + +5. **Generate comprehensive report**: + ```bash + ansible-playbook generate-report.yml \ + -e spoke_cluster=spree-02 + ``` + +## Report Generation Details + +The report generator (`generate-report.yml`) implements several intelligent features: + +### Test Freshness Filtering +- Reads `collected_at` timestamp from `node-info-{spoke}.json` (hardware metadata baseline) +- Converts to UTC format: `YYYYMMDD-HHMMSS` +- Compares each test directory timestamp against node-info timestamp +- **Includes** tests with timestamp ≥ node-info timestamp (current environment config) +- **Excludes** and **deletes** tests with timestamp < node-info timestamp (old environment config) +- Displays summary: tests included vs. tests excluded + +**Why this matters:** +- Environment configuration changes (NIC swap, BIOS update, etc.) 
invalidate old test results +- Prevents mixing results from different hardware configurations in same report +- Automatically cleans up stale test data to save disk space + +### Freshness Check (Skip Duplicate Reports) +- Clones Gitea repository to check last report commit timestamp +- Compares test artifact timestamps against last report timestamp +- **Skips generation** if no new tests since last report (avoids duplicate reports) +- **Generates report** if: + - No previous report exists + - New test artifacts detected (timestamp > last report timestamp) + +**Example Output**: +``` +Test Freshness Check +=========================================== +Gitea repository: http://bastion:3000/telcov10n/telco-kpis-reports.git +Last report generated: 2026-05-04 10:30:15 -0400 +Last report UTC timestamp: 20260504-143015 +Freshness check result: FOUND_NEW_TESTS: 3 +=========================================== +``` + +### ZTP Deployment Timeline Integration +When `ztp-ai-deployment-time` test artifacts are available: +1. Finds latest `ztp-ai-deployment-time-{spoke}-{YYYYMMDD-HHMMSS}` directory +2. Checks for `deployment-timeline-summary.txt` +3. Inserts timeline section **before** "Report Metadata" section +4. Creates expandable details section with full timeline summary +5. Links to `deployment-timeline.json` for raw event data +6. Extracts test result (PASS/FAIL) and duration +7. Adds entry to Test Summary table +8. Removes unwanted metadata lines (Script, Data Source) + +**Report Structure**: +```markdown +# Telco-KPIs Report: spree-02 + +## Test Summary +| Test | Status | Result | Duration | Description | +|------|--------|--------|----------|-------------| +| **OSLAT** | ✅ Ran | ✅ PASS | 10m15s | OS latency validation | +| **ZTP_AI_DEPLOYMENT_TIME** | ✅ Ran | ✅ PASS | 1h45m30s | Start→TALM CGU Complete | + +--- + +## ZTP Deployment Timeline + +This section shows the complete ZTP/ACM deployment timeline for the spoke cluster... 
+
+**📊 [View Raw Timeline JSON](ztp-ai-deployment-time/deployment-timeline.json)**
+
+<details>
+<summary>Click to expand deployment timeline details</summary>
+
+```
+======================================================================
+ZTP Deployment Timeline Summary
+======================================================================
+...
+```
+
+</details>
+ +--- + +## Report Metadata +- **Generated**: 2026-05-04 14:30:00 UTC +- **Spoke Cluster**: spree-02 +... +``` + +## Jenkins Integration + +All telco-kpis tests have corresponding Jenkins jobs in the **Telco-KPIs** view: +- `telco-kpis-collect-node-info` +- `telco-kpis-run-test` +- `telco-kpis-run-bios-validation` +- `telco-kpis-run-rds-compare` +- `telco-kpis-ztp-ai-deployment-time` +- `telco-kpis-generate-report` + +See `repos/telco-auto-ci-cd/CLAUDE.md` for detailed Jenkins job documentation. + +## Troubleshooting + +### Common Issues + +See `docs/troubleshooting/` for detailed troubleshooting guides: +- `prometheus-pod-stuck-reboot-test-blocker.md` - Reboot test execution issues +- `k8s-exec-ipv6-fallback-issue.md` - IPv6 fallback issues with kubernetes.core.k8s_exec + +### Gitea Publishing Issues + +See `roles/gitea/README.md` for Gitea-specific troubleshooting. + +## References + +- **Parent Repository Documentation**: See `repos/eco-ci-cd/CLAUDE.md` for architecture overview +- **Jenkins Jobs Documentation**: See `repos/telco-auto-ci-cd/CLAUDE.md` for Jenkins job details +- **Gitea Role**: See `roles/gitea/README.md` for report publishing details diff --git a/playbooks/telco-kpis/docs/troubleshooting/k8s-exec-ipv6-fallback-issue.md b/playbooks/telco-kpis/docs/troubleshooting/k8s-exec-ipv6-fallback-issue.md new file mode 100644 index 00000000..63ee866d --- /dev/null +++ b/playbooks/telco-kpis/docs/troubleshooting/k8s-exec-ipv6-fallback-issue.md @@ -0,0 +1,302 @@ +# kubernetes.core.k8s_exec IPv6 Fallback Issue + +**Status:** Root cause identified +**Severity:** High (breaks pod exec operations) +**Affected:** Environments with dual-stack DNS but IPv6 routing disabled +**Workaround:** Use `oc exec` via `ansible.builtin.shell` instead of `kubernetes.core.k8s_exec` +**Date Identified:** 2026-04-30 + +--- + +## Symptoms + +When using `kubernetes.core.k8s_exec` to execute commands in pods: + +```yaml +- name: Get BIOS version via dmidecode + 
kubernetes.core.k8s_exec: + namespace: openshift-machine-config-operator + pod: "{{ mcd_pod }}" + container: machine-config-daemon + command: chroot /rootfs dmidecode -t 0 + kubeconfig: "{{ spoke_kubeconfig }}" + register: dmidecode_result + failed_when: false +``` + +**Error:** +``` +"failed": true, +"msg": "Failed to execute on pod machine-config-daemon-l7lc9 due to : (0)\nReason: [Errno 113] No route to host\n" +``` + +**Meanwhile, `oc exec` works perfectly:** +```bash +oc --kubeconfig /tmp/spree-02-kubeconfig \ + -n openshift-machine-config-operator \ + exec machine-config-daemon-l7lc9 \ + -c machine-config-daemon \ + -- chroot /rootfs dmidecode -t 0 +# Returns BIOS version successfully +``` + +--- + +## Root Cause + +**TL;DR:** The Python `websocket-client` library (used by `kubernetes.core.k8s_exec`) does not fall back to IPv4 when IPv6 connection fails. + +### Detailed Explanation + +1. **DNS returns both IPv6 and IPv4 addresses:** + ``` + api.spree-02.kpi.telcoqe.eng.rdu2.dc.redhat.com + → 2620:52:9:1698::14 (IPv6) + → 10.6.152.14 (IPv4) + ``` + +2. **IPv6 routing is not configured in this network:** + ```bash + $ ping6 2620:52:9:1698::14 + From 2620:52:0:2ebe:172:16:180:1 icmp_seq=1 Destination unreachable: Address unreachable + + $ ping 10.6.152.14 + 64 bytes from 10.6.152.14: icmp_seq=1 ttl=60 time=0.478 ms # ✅ Works + ``` + +3. **Different fallback behaviors:** + + | Tool | Tries IPv6 | IPv6 Fails | Falls Back to IPv4 | Result | + |------|------------|------------|-------------------|---------| + | `curl` | ✅ | ✅ | ✅ | **SUCCESS** | + | `oc exec` | ✅ | ✅ | ✅ | **SUCCESS** | + | Python `websocket-client` | ✅ | ✅ | ❌ | **FAILS** | + +4. 
**Code path in Python kubernetes client:** + ``` + kubernetes.stream.stream() + → kubernetes.stream.ws_client.websocket_call() + → websocket.create_connection() + → socket.connect(('2620:52:9:1698::14', 6443)) # IPv6 only + → OSError: [Errno 113] No route to host + ``` + + The Python `websocket-client` library (`/home/telcov10n/.local/lib/python3.9/site-packages/websocket/_http.py`) does not implement happy eyeballs (RFC 8305) or IPv4 fallback. + +--- + +## Investigation Timeline + +### Initial Hypothesis (WRONG) +Initially suspected WebSocket protocol was blocked while SPDY was allowed: +- ❌ Firewall blocking WebSocket traffic +- ❌ Network policy differences +- ❌ Proxy configuration issues +- ❌ Certificate validation problems + +### Breakthrough Tests + +**Test 1: curl with WebSocket upgrade** +```bash +curl -k -v \ + --header "Connection: Upgrade" \ + --header "Upgrade: websocket" \ + "https://api.spree-02.kpi.telcoqe.eng.rdu2.dc.redhat.com:6443/..." + +# Result: HTTP 403 (authentication issue, NOT network block) +# ✅ Proved WebSocket protocol reaches the server! +``` + +**Test 2: Python websocket library with debug** +```python +# Monkey-patched socket.connect() to see connection attempts +WEBSOCKET TRYING TO CONNECT TO: ('2620:52:9:1698::14', 6443, 0, 0) +# ↑ IPv6 address! + +# Result: [Errno 113] No route to host +``` + +**Test 3: IPv6 vs IPv4 connectivity** +```bash +ping6 2620:52:9:1698::14 # ❌ Destination unreachable +ping 10.6.152.14 # ✅ Success +``` + +**Conclusion:** IPv6 fallback issue, NOT protocol blocking. 
+ +--- + +## Workaround (Implemented) + +Replace `kubernetes.core.k8s_exec` with `ansible.builtin.shell` + `oc exec`: + +**Before (BROKEN):** +```yaml +- name: Get BIOS version via dmidecode + kubernetes.core.k8s_exec: + namespace: openshift-machine-config-operator + pod: "{{ mcd_pod }}" + container: machine-config-daemon + command: chroot /rootfs dmidecode -t 0 + kubeconfig: "{{ spoke_kubeconfig }}" + register: dmidecode_result + failed_when: false +``` + +**After (WORKING):** +```yaml +- name: Get BIOS version via dmidecode + ansible.builtin.shell: | + oc --kubeconfig {{ spoke_kubeconfig }} \ + -n openshift-machine-config-operator \ + exec {{ mcd_pod }} \ + -c machine-config-daemon \ + -- chroot /rootfs dmidecode -t 0 + register: dmidecode_result + failed_when: false + changed_when: false +``` + +**Why this works:** The `oc` binary correctly implements IPv4 fallback when IPv6 fails. + +--- + +## Permanent Solutions (Not Implemented) + +### Option 1: Force IPv4 Resolution in kubeconfig +Edit kubeconfig to use IP address instead of hostname: + +```yaml +# Before: +server: https://api.spree-02.kpi.telcoqe.eng.rdu2.dc.redhat.com:6443 + +# After: +server: https://10.6.152.14:6443 +``` + +**Pros:** Python client will use IPv4 directly +**Cons:** +- Breaks certificate validation (hostname mismatch) +- Requires manual kubeconfig modification +- Not portable across environments + +### Option 2: Configure IPv6 Routing +Enable IPv6 routing on the network infrastructure. + +**Pros:** Fixes root cause +**Cons:** +- Requires infrastructure changes +- May not be feasible in all environments + +### Option 3: Patch Python websocket-client Library +Contribute IPv4 fallback logic to `websocket-client` library. + +**Pros:** Benefits all users +**Cons:** +- Requires upstream contribution +- Long-term solution +- Would still need workaround until adopted + +### Option 4: Disable IPv6 in Python +Set environment variable or Python socket configuration to prefer IPv4. 
+ +**Pros:** System-wide fix +**Cons:** +- Affects all Python applications +- May break IPv6-dependent services +- Fragile workaround + +--- + +## Files Modified + +**Commit:** `bb5e97f` - "Fix BIOS/microcode collection: Replace k8s_exec with oc exec" + +- `playbooks/telco-kpis/tasks/collect-node-info.yml` + - Line 59-67: dmidecode (BIOS version) + - Line 79-87: /proc/cpuinfo (microcode version) + - Line 99-107: lscpu (CPU model) + +All three k8s_exec calls replaced with shell + oc exec. + +--- + +## Verification + +**Before fix (build 5, 6):** +```json +{ + "bios_version": "unknown", + "microcode_version": "unknown", + "cpu_type": "unknown" +} +``` + +**After fix (build 7):** +```json +{ + "bios_version": "2.8.2", + "microcode_version": "0x2b000661", + "cpu_type": "Intel(R) Xeon(R) Gold 6433N" +} +``` + +--- + +## Related Issues + +- **OCPBUGS-XXXXX:** (if applicable) +- **Upstream websocket-client:** https://github.com/websocket-client/websocket-client/issues +- **RFC 8305:** "Happy Eyeballs Version 2: Better Connectivity Using Concurrency" + +--- + +## Lessons Learned + +1. **"No route to host" doesn't always mean network blocking** + - Can also indicate IPv6 routing issues + - Always check both IPv4 and IPv6 connectivity + +2. **Not all HTTP clients handle dual-stack DNS equally** + - curl: Smart (tries both, falls back) + - oc: Smart (handles fallback) + - Python websocket-client: Not smart (no fallback) + +3. **kubernetes.core modules have hidden dependencies** + - k8s_exec depends on websocket-client library behavior + - Library bugs can break Ansible modules + - Shell + oc exec is more reliable in edge cases + +4. 
**Always test assumptions** + - Initial hypothesis (WebSocket protocol blocked) was wrong + - Debug tools revealed the real issue (IPv6) + - Instrumentation (socket connect debug) was key + +--- + +## Test Script + +Reproduce the issue with this script: + +```bash +# scripts/quick-test-k8s-exec.sh +ssh telco-kpis-prow-kni-qe-71-bastion "bash -s" < scripts/quick-test-k8s-exec.sh +``` + +Expected output: +- ✅ oc exec: SUCCESS +- ❌ Python kubernetes client: ERROR (No route to host) +- Shows IPv6 connection attempt details + +--- + +## References + +- Investigation thread: (link to Git commits bb5e97f, 1e03b5f) +- Python kubernetes client: https://github.com/kubernetes-client/python +- websocket-client library: https://github.com/websocket-client/websocket-client +- RFC 8305 (Happy Eyeballs): https://tools.ietf.org/html/rfc8305 + +--- + +**Conclusion:** The Python `kubernetes` library's dependency on `websocket-client` lacks IPv4 fallback when IPv6 fails. In dual-stack environments without IPv6 routing, use `oc exec` via shell instead of `kubernetes.core.k8s_exec`. diff --git a/playbooks/telco-kpis/docs/troubleshooting/prometheus-pod-stuck-reboot-test-blocker.md b/playbooks/telco-kpis/docs/troubleshooting/prometheus-pod-stuck-reboot-test-blocker.md new file mode 100644 index 00000000..817360e9 --- /dev/null +++ b/playbooks/telco-kpis/docs/troubleshooting/prometheus-pod-stuck-reboot-test-blocker.md @@ -0,0 +1,303 @@ +# Prometheus Pod Stuck - Reboot Test Blocker + +**Last Updated:** 2026-04-29 +**Affected Clusters:** spree-02 (confirmed), potentially all SNO deployments with ACM observability +**Related Bugs:** OCPBUGS-65953, OCPBUGS-70352 +**Impact:** Reboot tests always skipped due to failed BeforeEach health checks + +--- + +## Problem Description + +The `prometheus-k8s-0` pod in the `openshift-monitoring` namespace gets stuck in `Init:0/1` state indefinitely after cluster deployment, preventing the reboot test health checks from passing. + +## Symptoms + +### 1. 
Reboot Test Behavior +```bash +# All reboot tests skip execution +Ran 0 of 3 Specs +3 Skipped + +# Health check fails with: +Some pods are unhealthy before reboot +``` + +### 2. Pod Status +```bash +$ oc get pods -n openshift-monitoring | grep prometheus-k8s +prometheus-k8s-0 0/6 Init:0/1 0 12d +``` + +### 3. Pod Error Logs +```bash +$ oc logs prometheus-k8s-0 -n openshift-monitoring -c init-config-reloader +Error: secret "observability-alertmanager-accessor" not found +``` + +### 4. Node Uptime +```bash +$ ssh core@ uptime +12 days, 20:02 # Confirms spoke never rebooted +``` + +## Root Cause + +**Two-Part Issue:** + +### Part 1: Mangled ConfigMap YAML (OCPBUGS-65953) +The `alertmanager-main-generated` ConfigMap contains malformed YAML with improper indentation of the `receivers` block: + +```yaml +# WRONG (causes parsing failures) +route: + receiver: Default +receivers: +- name: Default +``` + +Should be: + +```yaml +# CORRECT +route: + receiver: Default + receivers: + - name: Default +``` + +### Part 2: Missing ACM Observability Secret (OCPBUGS-70352) +The Prometheus CR references `observability-alertmanager-accessor` secret in `spec.additionalAlertManagerConfigs`, but the secret doesn't exist (ACM observability not configured). 
+ +**Why This Blocks Reboot Tests:** +- CNF-gotests framework has BeforeEach health check: verifies all pods are Running +- Prometheus pod stuck → health check fails → all tests SKIP +- Spoke never reboots because tests never actually execute + +## Workaround / Fix + +### Option 1: Two-Step Fix (Recommended) + +**Step 1: Fix ConfigMap YAML** + +Create temporary YAML file on bastion: + +```bash +cat <<'EOF' > /tmp/alertmanager-config-fix.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: alertmanager-main-generated + namespace: openshift-monitoring +data: + alertmanager.yaml.gz: H4sIAAAAAAAA/0yOQQrCMBBF95wiSxeVQhcu3HoCwY1gF0k6bQOdSTOTUsRzdmgXrt7j///x5gkf4C8LDmYSaIBzWSnpoGjYo3xVdWAXQmWXCNnSPBg0SBxhzm0p2xrPo1qQVvd+hN9uRs4x8S+VQW9kF+LrO3U51OBaT02BVTwBUZs+8lAyxV3k88SYOr72yC1bX1G6t7neMu08nVmz/QMAAP//jkUArgAAAA== +EOF + +oc --kubeconfig /tmp/-kubeconfig apply -f /tmp/alertmanager-config-fix.yaml +``` + +**Step 2: Delete Stuck Pod** + +```bash +oc --kubeconfig /tmp/-kubeconfig delete pod prometheus-k8s-0 -n openshift-monitoring +``` + +**Verification:** + +```bash +# Wait 1-2 minutes, then check: +oc --kubeconfig /tmp/-kubeconfig get pods -n openshift-monitoring | grep prometheus-k8s + +# Expected output: +# prometheus-k8s-0 6/6 Running 0 2m +``` + +### Option 2: Patch Prometheus CR (If additionalAlertManagerConfigs exists) + +```bash +# Check if the field exists +oc --kubeconfig /tmp/-kubeconfig get prometheus k8s -n openshift-monitoring -o jsonpath='{.spec.additionalAlertManagerConfigs}' + +# If output is not empty, remove it: +oc --kubeconfig /tmp/-kubeconfig patch prometheus k8s -n openshift-monitoring \ + --type=json -p='[{"op": "remove", "path": "/spec/additionalAlertManagerConfigs"}]' + +# Then delete pod +oc --kubeconfig /tmp/-kubeconfig delete pod prometheus-k8s-0 -n openshift-monitoring +``` + +**Note:** In spree-02 testing (2026-04-29), the field didn't exist, so only Step 1 + Step 2 were needed. + +## Verification Steps + +### 1. 
Prometheus Pod Health +```bash +# All 6 containers should be Running +oc --kubeconfig /tmp/-kubeconfig get pod prometheus-k8s-0 -n openshift-monitoring + +# Check logs for errors (should see normal startup) +oc --kubeconfig /tmp/-kubeconfig logs prometheus-k8s-0 -n openshift-monitoring -c prometheus --tail=20 +``` + +### 2. Monitor Stability +```bash +# Wait 5 minutes and verify pod stays healthy +watch -n 10 'oc --kubeconfig /tmp/-kubeconfig get pods -n openshift-monitoring | grep prometheus-k8s' +``` + +### 3. Reboot Test Execution +```bash +# Trigger Jenkins job: telco-kpis-run-reboot-test +# Expected output: +# ✅ Health check passes: "All pods are healthy before reboot" +# ✅ Test executes: "Rebooting spoke cluster via oc..." +# ✅ Artifacts show: "Ran 3 of 3 Specs" (not skipped) + +# Verify spoke actually rebooted: +ssh core@ uptime +# Uptime should be < test duration +``` + +## Prevention for Future Deployments + +### Automated Fix in Deployment Playbook + +Add post-deployment task to `deploy-ocp-hybrid-multinode.yml` or create dedicated playbook: + +```yaml +--- +# playbooks/telco-kpis/tasks/fix-prometheus-pod.yml + +- name: Fix prometheus pod if stuck (OCPBUGS-65953, OCPBUGS-70352) + hosts: bastion + gather_facts: false + tasks: + - name: Check if prometheus pod is stuck in Init state + ansible.builtin.shell: | + oc --kubeconfig {{ spoke_kubeconfig }} get pod prometheus-k8s-0 -n openshift-monitoring \ + -o jsonpath='{.status.containerStatuses[0].ready}' 2>/dev/null || echo "false" + register: prometheus_ready + failed_when: false + changed_when: false + + - name: Apply prometheus fix if pod is not ready + when: prometheus_ready.stdout == "false" + block: + - name: Create temporary ConfigMap fix file + ansible.builtin.copy: + content: | + apiVersion: v1 + kind: ConfigMap + metadata: + name: alertmanager-main-generated + namespace: openshift-monitoring + data: + alertmanager.yaml.gz: 
H4sIAAAAAAAA/0yOQQrCMBBF95wiSxeVQhcu3HoCwY1gF0k6bQOdSTOTUsRzdmgXrt7j///x5gkf4C8LDmYSaIBzWSnpoGjYo3xVdWAXQmWXCNnSPBg0SBxhzm0p2xrPo1qQVvd+hN9uRs4x8S+VQW9kF+LrO3U51OBaT02BVTwBUZs+8lAyxV3k88SYOr72yC1bX1G6t7neMu08nVmz/QMAAP//jkUArgAAAA== + dest: /tmp/alertmanager-config-fix.yaml + mode: '0644' + + - name: Apply ConfigMap fix + ansible.builtin.command: + cmd: oc --kubeconfig {{ spoke_kubeconfig }} apply -f /tmp/alertmanager-config-fix.yaml + changed_when: true + + - name: Delete stuck prometheus pod + ansible.builtin.command: + cmd: oc --kubeconfig {{ spoke_kubeconfig }} delete pod prometheus-k8s-0 -n openshift-monitoring + changed_when: true + + - name: Wait for prometheus pod to become ready (max 5 minutes) + ansible.builtin.shell: | + oc --kubeconfig {{ spoke_kubeconfig }} wait --for=condition=Ready \ + pod/prometheus-k8s-0 -n openshift-monitoring --timeout=300s + register: wait_result + failed_when: false + + - name: Display prometheus pod status + ansible.builtin.command: + cmd: oc --kubeconfig {{ spoke_kubeconfig }} get pod prometheus-k8s-0 -n openshift-monitoring + changed_when: false + + - name: Cleanup temporary file + ansible.builtin.file: + path: /tmp/alertmanager-config-fix.yaml + state: absent +``` + +### Integration Points + +**Option A: Add to deployment playbook** +```yaml +# In deploy-ocp-hybrid-multinode.yml, after cluster deployment completes: +- name: Post-deployment fixes + ansible.builtin.include_tasks: telco-kpis/tasks/fix-prometheus-pod.yml +``` + +**Option B: Add to cluster environment setup** +```yaml +# In setup-cluster-env.yml, before running tests: +- name: Ensure prometheus pod is healthy + ansible.builtin.include_tasks: telco-kpis/tasks/fix-prometheus-pod.yml +``` + +**Option C: Standalone playbook** +```yaml +# playbooks/telco-kpis/fix-prometheus-pod.yml +--- +- name: Fix stuck prometheus pod (OCPBUGS-65953, OCPBUGS-70352) + hosts: bastion + gather_facts: false + tasks: + - name: Execute prometheus fix tasks + 
ansible.builtin.include_tasks: tasks/fix-prometheus-pod.yml +``` + +## Related Bugs and References + +### OpenShift Bugs +- **OCPBUGS-65953**: Prometheus pod stuck due to mangled ConfigMap YAML + - Component: Cluster Monitoring Operator + - Status: Confirmed + - Workaround: Fix ConfigMap indentation and restart pod + +- **OCPBUGS-70352**: Missing ACM observability secret causes init failure + - Component: Advanced Cluster Management + - Status: Confirmed + - Workaround: Remove additionalAlertManagerConfigs reference or install ACM observability + +### Documentation +- Prometheus Operator: https://prometheus-operator.dev/ +- OpenShift Monitoring: https://docs.openshift.com/container-platform/latest/monitoring/monitoring-overview.html +- CNF-gotests health checks: Uses BeforeEach to verify cluster state before reboot + +### Git Commits (Timezone Fix) +- `48e7606` (2026-04-28): Critical timezone fix for test filtering (related issue) +- `b4ea8d0`: Initial timestamp filtering implementation + +## Timeline of Issue (spree-02 Example) + +**2026-04-17**: Cluster deployed, prometheus pod stuck immediately +**2026-04-28**: Reboot tests run, all skip (health check fails for 12 days) +**2026-04-29**: Issue identified and fixed (prometheus pod now healthy) + +**Duration of Outage:** 12+ days (from deployment until manual fix applied) + +## Lessons Learned + +1. **Health checks are critical** - CNF-gotests framework correctly prevented destructive operations (reboot) when cluster monitoring was unhealthy +2. **Post-deployment validation needed** - Should verify all monitoring components are healthy before declaring deployment complete +3. **ACM observability optional** - If not using ACM, Prometheus CR should not reference ACM secrets +4. 
**ConfigMap validation** - Cluster Monitoring Operator should validate ConfigMap YAML structure before applying + +## Action Items + +- [ ] Add automated prometheus health check to deployment playbooks +- [ ] Create post-deployment validation playbook for all monitoring components +- [ ] Document ACM observability requirements for production clusters +- [ ] Update cluster deployment checklist with monitoring verification steps +- [ ] Consider adding prometheus fix to `setup-cluster-env.yml` for all CI/CD pipelines + +--- + +**Maintainer:** Telco Verification Team +**Last Verified:** 2026-04-29 (spree-02 cluster, OCP 4.18) diff --git a/playbooks/telco-kpis/parse-lockdown.yml b/playbooks/telco-kpis/parse-lockdown.yml new file mode 100644 index 00000000..6825a2ae --- /dev/null +++ b/playbooks/telco-kpis/parse-lockdown.yml @@ -0,0 +1,146 @@ +--- +# playbooks/telco-kpis/parse-lockdown.yml +# +# Purpose: Parse lockdown JSON files (hub or spoke) and extract deployment parameters +# +# This playbook auto-detects the lockdown type based on JSON structure and outputs +# parameters in both env and JSON formats for consumption by downstream jobs. 
+# +# Usage: +# ansible-playbook playbooks/telco-kpis/parse-lockdown.yml \ +# -e lockdown_uri= +# +# Outputs (in ARTIFACT_DIR or /tmp): +# - lockdown.json: Original downloaded lockdown JSON +# - lockdown-params.env: Shell environment variables +# - lockdown-params.json: Structured JSON parameters + +- name: Parse Lockdown JSON + hosts: localhost + gather_facts: true + vars: + artifact_dir: "{{ lookup('env', 'ARTIFACT_DIR') | default('/artifacts', true) }}" + lockdown_filename: "{{ lockdown_uri | regex_replace('.*/', '') | regex_replace('.json$', '') }}" + + tasks: + - name: Validate lockdown_uri is provided + ansible.builtin.assert: + that: + - lockdown_uri is defined + - lockdown_uri | length > 0 + fail_msg: "lockdown_uri parameter is required" + success_msg: "Lockdown URI: {{ lockdown_uri }}" + + - name: Extract lockdown filename + ansible.builtin.debug: + msg: "Lockdown filename: {{ lockdown_filename }}" + + - name: Download lockdown JSON + ansible.builtin.get_url: + url: "{{ lockdown_uri }}" + dest: "{{ artifact_dir }}/{{ lockdown_filename }}.json" + mode: '0644' + force: true + validate_certs: false + register: lockdown_download + retries: 3 + delay: 5 + + - name: Read lockdown JSON + ansible.builtin.slurp: + src: "{{ artifact_dir }}/{{ lockdown_filename }}.json" + register: lockdown_content + + - name: Parse lockdown JSON + ansible.builtin.set_fact: + lockdown_data: "{{ lockdown_content['content'] | b64decode | from_json }}" + + - name: Detect lockdown type (hub vs spoke) + ansible.builtin.set_fact: + lockdown_type: "{{ 'hub' if ('hub' in lockdown_data) else 'spoke' }}" + + - name: Display lockdown type + ansible.builtin.debug: + msg: + - "==========================================" + - "Lockdown Type Detected: {{ lockdown_type | upper }}" + - "==========================================" + + # Hub lockdown parsing + - name: Parse hub lockdown + when: lockdown_type == 'hub' + block: + - name: Extract hub parameters + ansible.builtin.set_fact: + 
hub_ocp_pull_spec: "{{ lockdown_data.hub.ocp.pull_spec }}" + hub_ocp_version: "{{ lockdown_data.hub.ocp.major_version }}.{{ lockdown_data.hub.ocp.minor_version }}" + hub_acm_channel: "release-{{ lockdown_data.hub.acm.version_override }}" + hub_mce_channel: "{{ lockdown_data.hub.acm.mce_override | regex_replace('^v', 'stable-') | regex_replace('\\.\\d+$', '') }}" + hub_talm_catalog: "{{ lockdown_data.hub.talm.pull_spec | default('') }}" + hub_gitops_catalog: "{{ lockdown_data.hub.gitops.pull_spec | default('') }}" + + - name: Build hub parameters dictionary + ansible.builtin.set_fact: + lockdown_params: + LOCKDOWN_TYPE: "hub" + OCP_RELEASE_IMAGE: "{{ hub_ocp_pull_spec }}" + OCP_VERSION: "{{ hub_ocp_version }}" + ACM_CHANNEL: "{{ hub_acm_channel }}" + MCE_CHANNEL: "{{ hub_mce_channel }}" + TALM_CATALOG: "{{ hub_talm_catalog }}" + GITOPS_CATALOG: "{{ hub_gitops_catalog }}" + + # Spoke lockdown parsing + - name: Parse spoke lockdown + when: lockdown_type == 'spoke' + block: + - name: Extract spoke parameters + ansible.builtin.set_fact: + spoke_ocp_pull_spec: "{{ lockdown_data.deployment.ocp_pull_spec }}" + spoke_ocp_version: "{{ lockdown_data.deployment.ocp_version }}" + spoke_ztp_pull_spec: "{{ lockdown_data.deployment.ztp_pull_spec }}" + spoke_ztp_version: "{{ lockdown_data.deployment.ztp_version }}" + spoke_operator_source: "{{ lockdown_data.deployment.operator_source }}" + + - name: Build spoke parameters dictionary + ansible.builtin.set_fact: + lockdown_params: + LOCKDOWN_TYPE: "spoke" + OCP_PULL_SPEC: "{{ spoke_ocp_pull_spec }}" + OCP_VERSION: "{{ spoke_ocp_version }}" + ZTP_PULL_SPEC: "{{ spoke_ztp_pull_spec }}" + ZTP_VERSION: "{{ spoke_ztp_version }}" + OPERATOR_SOURCE: "{{ spoke_operator_source }}" + + # Output parameters + - name: Write parameters to env file + ansible.builtin.copy: + content: | + # Lockdown parameters parsed from: {{ lockdown_uri }} + # Lockdown type: {{ lockdown_type }} + # Generated: {{ ansible_date_time.iso8601 }} + {% for key, value in 
lockdown_params.items() %} + {{ key }}={{ value }} + {% endfor %} + dest: "{{ artifact_dir }}/{{ lockdown_filename }}-params.env" + mode: '0644' + + - name: Write parameters to JSON file + ansible.builtin.copy: + content: "{{ lockdown_params | to_nice_json }}" + dest: "{{ artifact_dir }}/{{ lockdown_filename }}-params.json" + mode: '0644' + + - name: Display parsed parameters + ansible.builtin.debug: + msg: + - "==========================================" + - "Parsed Parameters ({{ lockdown_type | upper }})" + - "==========================================" + - "{{ lockdown_params | to_nice_yaml }}" + - "==========================================" + - "Output files:" + - " - {{ artifact_dir }}/{{ lockdown_filename }}.json (original)" + - " - {{ artifact_dir }}/{{ lockdown_filename }}-params.env" + - " - {{ artifact_dir }}/{{ lockdown_filename }}-params.json" + - "==========================================" diff --git a/playbooks/telco-kpis/roles/gitea/README.md b/playbooks/telco-kpis/roles/gitea/README.md new file mode 100644 index 00000000..43881742 --- /dev/null +++ b/playbooks/telco-kpis/roles/gitea/README.md @@ -0,0 +1,437 @@ +# Gitea Role for Telco KPIs Report Publishing + +Ansible role for deploying Gitea on bastion hosts and publishing Telco KPIs test reports to a Git repository. 
+ +## Features + +- **Automated Gitea Deployment**: Deploy Gitea as a rootless podman container on bastion +- **Vault-Based Authentication**: Uses bastion credentials from HashiCorp Vault +- **Repository Management**: Auto-creates `telco-kpis-reports` repository +- **Report Publishing**: Publishes test reports with artifacts to Git +- **Markdown Index**: Maintains top-level README.md with links to all reports sorted by date +- **Environment-Aware Reporting**: Timestamp-based filtering excludes old tests when environment changes +- **Retention Policy**: Configurable report retention (default: keep last 7 days) + +## Architecture + +``` +Bastion Host (telco-kpis-prow-kni-qe-71-bastion) +├── Gitea Container (rootless podman) +│ ├── HTTP: http://bastion.kni-qe-71.telco-kpis.rdu3.redhat.com:3000/ +│ ├── SSH: ssh://git@bastion.kni-qe-71.telco-kpis.rdu3.redhat.com:2222/ +│ └── Database: SQLite3 (/data/gitea/gitea.db) +│ +├── Data Directory: ~/gitea/data/ +│ ├── gitea/conf/app.ini (configuration) +│ ├── git/repositories/ (Git repos) +│ └── gitea/log/ (logs) +│ +└── Repository: telco-kpis-reports + ├── README.md (index of all reports) + └── reports/ + └── YYYY-MM-DD/ + └── / + ├── .md + ├── oslat/ + ├── ptp/ + ├── cyclictest/ + ├── cpu-util/ + ├── reboot/ + ├── rfc2544/ + ├── rds-compare/ + └── node-info-.json +``` + +## Vault Integration + +### Admin Credentials + +The role uses **bastion credentials from HashiCorp Vault** instead of hardcoded passwords: + +```yaml +# Vault variables (automatically provided by getVaults() in Jenkins) +gitea_admin_user: "{{ ansible_user }}" # From bastion vault +gitea_admin_password: "{{ ansible_password }}" # From bastion vault +gitea_admin_email: "{{ gitea_admin_user }}@localhost" +``` + +**Vault Sources** (priority order): +1. Bastion `host_vars` vault (e.g., `telco-kpis-prow-kni-qe-71-bastion`) +2. `ansible_group_bastions` vault +3. 
`ansible_group_all` vault + +**Expected Vault Variables**: +- `ansible_user`: Bastion username (typically `telcov10n`) +- `ansible_password`: Bastion user password + +### Fallback Behavior + +If vault credentials are not available, the role will: +1. Check `BASTION_PASSWORD` environment variable +2. Fail with detailed error message indicating vault configuration issue + +## Usage + +### Basic Usage (Jenkins Job) + +```groovy +// In Jenkinsfile +stage('Generate Report') { + steps { + script { + def extraVarsList = [ + "spoke_cluster=spree-02", + "output_filename=report.md", + "timestamp=${TIMESTAMP}", + "development_mode=true" // Enable Gitea publishing + ] + + runAnsiblePlaybook( + playbookName: "generate-report.yml", + playbookPath: "playbooks/telco-kpis", + inventoryPath: "inventories/ocp-deployment/build-inventory.py", + volumeName: PODMAN_VOLUME_NAME, + artifactFolder: true, + extraVars: extraVarsList + ) + } + } +} +``` + +### Direct Ansible Usage + +```bash +# With vault credentials +ansible-playbook playbooks/telco-kpis/generate-report.yml \ + -i inventories/ocp-deployment/build-inventory.py \ + -e spoke_cluster=spree-02 \ + -e output_filename=report.md \ + -e timestamp=$(date +%Y%m%d-%H%M%S) \ + -e development_mode=true + +# Override credentials (for testing) +ansible-playbook playbooks/telco-kpis/generate-report.yml \ + -i inventories/ocp-deployment/build-inventory.py \ + -e spoke_cluster=spree-02 \ + -e development_mode=true \ + -e gitea_admin_user=admin \ + -e gitea_admin_password=mypassword +``` + +## Report Action Logic + +The role supports two modes of report publishing based on whether the environment configuration has changed: + +### UPDATE Mode (Environment Stable) + +**When**: No tests were excluded during report generation (`report_action: "update"`) + +**Behavior**: +- Removes existing reports for today's date before publishing +- Assumes all test results belong to the same environment configuration +- Example: Re-running tests to gather more 
data without changing BIOS/PerfProfile + +**Workflow**: +1. Find existing reports: `reports/2026-04-28/spree-02/telco-kpis-report-*.md` +2. Delete found reports +3. Publish new report with same date +4. README shows only latest report for today + +### NEW Mode (Environment Changed) + +**When**: Tests were excluded during report generation (`report_action: "new"`) + +**Behavior**: +- Keeps all existing reports (preserves historical data for different configurations) +- Adds new report alongside previous ones +- Each report represents a distinct environment configuration + +**Workflow**: +1. Skip deletion of existing reports +2. Publish new report with unique timestamp +3. README lists all reports sorted by date (newest first) + +### Timestamp-Based Filtering (Upstream) + +**Note**: The `report_action` variable is determined by the `generate-report.yml` playbook, not by this role. + +**How It Works**: +1. **collect-node-info** captures environment state (BIOS, PerfProfile, etc.) with UTC timestamp +2. **Tests run** after collect-node-info, artifacts saved with UTC timestamps +3. **generate-report** compares test timestamps vs node-info timestamp: + - Tests older than node-info → EXCLUDED (belong to previous environment) + - Tests newer than node-info → INCLUDED (belong to current environment) +4. If tests excluded → `report_action: "new"` (environment changed) +5. 
If no tests excluded → `report_action: "update"` (environment stable) + +**Critical**: Timestamps must be in same timezone (UTC) for accurate comparison: +- Test directories: `{test}-{spoke}-YYYYMMDD-HHMMSS` (UTC from eco-ci-cd container) +- Node-info: ISO8601 UTC format (e.g., `2026-04-28T15:48:45Z`) +- Conversion: `date -u -d "{iso8601}" +%Y%m%d-%H%M%S` (maintains UTC, not local time) + +**Example Workflow**: +```bash +# Day 1: Initial baseline +collect-node-info (10:00 UTC) → oslat (10:30 UTC), ptp (11:00 UTC) → generate-report +# Result: report-1 with oslat + ptp + +# Day 1: Change BIOS settings +collect-node-info (14:00 UTC) → cyclictest (14:30 UTC) → generate-report +# Filtering: oslat (10:30 < 14:00) EXCLUDED, ptp (11:00 < 14:00) EXCLUDED, cyclictest (14:30 > 14:00) INCLUDED +# Result: report-2 with cyclictest only, report-1 preserved +``` + +## Retention Policy + +The role automatically removes old reports to prevent unlimited growth: + +**Default**: Keep last 7 days (configurable via `gitea_report_retention_days`) + +**How It Works**: +1. Find all date directories in `reports/` +2. Sort by date descending +3. Keep first N directories (N = `gitea_report_retention_days`) +4. 
Delete remaining directories + +**Override**: +```yaml +# Keep last 14 days +gitea_report_retention_days: 14 + +# Disable retention (keep all reports) +gitea_report_retention_days: 0 +``` + +## Role Variables + +### Required Variables (from playbook) + +| Variable | Description | Example | +|----------|-------------|---------| +| `spoke_cluster` | Spoke cluster name | `spree-02` | +| `report_file` | Markdown report filename | `telco-kpis-report-spree-02-20260427.md` | +| `report_tarball` | Artifacts tarball filename | `spree-02-artifacts-20260427-101530.tar.gz` | +| `report_action` | Report publishing mode | `new` or `update` | +| `development_mode` | Enable Gitea publishing | `true` | + +### Default Variables (can be overridden) + +| Variable | Default | Description | +|----------|---------|-------------| +| `gitea_container_name` | `gitea` | Podman container name | +| `gitea_http_port` | `3000` | HTTP port for Gitea web UI | +| `gitea_ssh_port` | `2222` | SSH port for Git operations | +| `gitea_domain` | `bastion.kni-qe-71.telco-kpis.rdu3.redhat.com` | Gitea domain | +| `gitea_admin_user` | `{{ ansible_user }}` | Admin username (from vault) | +| `gitea_admin_password` | `{{ ansible_password }}` | Admin password (from vault) | +| `gitea_repo_name` | `telco-kpis-reports` | Repository name | +| `gitea_image` | `docker.io/gitea/gitea:latest` | Gitea container image | +| `gitea_report_retention_days` | `7` | Keep reports for last N days (0 = keep all) | + +## Tasks + +### deploy.yml +- Checks if Gitea container exists +- Configures firewall rules (opens ports 3000/tcp and 2222/tcp) +- Creates data directory +- Deploys Gitea container with rootless podman +- Waits for Gitea to become accessible + +### initialize.yml +- Checks installation status (INSTALL_LOCK) +- Completes installation via web API +- Creates admin user +- Generates API token for automation + +### create-repository.yml +- Checks if `telco-kpis-reports` repository exists +- Creates repository with 
initial README +- Returns clone URLs (HTTP and SSH) + +### publish-report.yml +- Clones repository to temporary directory +- Handles report action based on `report_action` variable: + - **UPDATE**: Environment stable - Remove existing reports for today before publishing + - **NEW**: Environment changed - Keep all historical reports, add new report alongside +- Creates dated directory structure: `reports/YYYY-MM-DD//` +- Copies Markdown report +- Extracts artifacts tarball +- Renames artifact directories to remove timestamps (e.g., `oslat-spree-02-20260428-110330/` → `oslat/`) +- Keeps only latest run per test type +- Fixes Markdown links to point to extracted artifacts +- Applies retention policy (deletes reports older than `gitea_report_retention_days`) +- Updates top-level README.md index +- Commits and pushes changes + +### validate-credentials.yml +- Validates `ansible_user` is set +- Validates `ansible_password` is set +- Provides helpful error messages if vault credentials missing + +## Examples + +### Jenkins Job Parameter + +```groovy +booleanParam { + name('DEVELOPMENT_MODE') + defaultValue(false) + description('Enable Gitea publishing: Deploy Gitea (if needed) and publish report') +} +``` + +### Report Directory Structure + +After publishing, the repository contains: + +``` +telco-kpis-reports/ +├── README.md # Auto-generated index +└── reports/ + ├── 2026-04-27/ + │ ├── spree-01/ + │ │ ├── telco-kpis-report-spree-01-20260427-090530.md + │ │ ├── oslat/ + │ │ │ ├── oslat0_logs + │ │ │ ├── oslat_report.xml + │ │ │ └── podman-run.log + │ │ ├── ptp/ + │ │ ├── cyclictest/ + │ │ └── ... + │ └── spree-02/ + │ └── ... + └── 2026-04-26/ + └── ... 
+``` + +### Accessing Published Reports + +**Web UI**: http://bastion.kni-qe-71.telco-kpis.rdu3.redhat.com:3000/telcov10n/telco-kpis-reports + +**Git Clone**: +```bash +# HTTPS (requires credentials) +git clone http://bastion.kni-qe-71.telco-kpis.rdu3.redhat.com:3000/telcov10n/telco-kpis-reports.git + +# SSH (requires SSH key) +git clone ssh://git@bastion.kni-qe-71.telco-kpis.rdu3.redhat.com:2222/telcov10n/telco-kpis-reports.git +``` + +## Troubleshooting + +### Issue: Cannot access Gitea web UI (connection refused) + +**Symptom**: `curl: (7) Failed to connect to bastion...:3000: Connection refused` + +**Solution**: Ensure firewall ports are open: +```bash +# Check firewall rules +sudo firewall-cmd --list-ports + +# Manually add rules if needed +sudo firewall-cmd --add-port=3000/tcp --permanent +sudo firewall-cmd --add-port=2222/tcp --permanent +sudo firewall-cmd --reload +``` + +**Note**: The role automatically configures firewall rules during deployment if firewalld is active. + +### Issue: Gitea admin password not found + +**Error**: +``` +FAILED! => Gitea admin password not found in vault. +Expected vault variable: 'ansible_password' from bastion vault. +``` + +**Solution**: +1. Verify bastion vault contains `ansible_password` +2. Check vault path: `telcov10n-ci/teams/telco-kpis/bastions/telco-kpis-prow-kni-qe-71-bastion` +3. Ensure `getVaults()` retrieved bastion vault in Jenkins stage +4. Temporarily override with `-e gitea_admin_password=...` for testing + +### Issue: Gitea container already exists + +**Symptom**: Role fails because container already running + +**Solution**: Role is idempotent - it skips deployment if container exists. 
To redeploy: +```bash +podman stop gitea && podman rm gitea +sudo rm -rf ~/gitea/data +``` + +### Issue: Git push fails with authentication error + +**Symptom**: `fatal: Authentication failed` + +**Solution**: Check credentials in clone URL: +```yaml +# Correct format (credentials embedded) +gitea_repo_http_url: "http://{{ gitea_admin_user }}:{{ gitea_admin_password }}@{{ gitea_domain }}:{{ gitea_http_port }}/{{ gitea_admin_user }}/{{ gitea_repo_name }}.git" +``` + +### Issue: Markdown links broken in published report + +**Symptom**: Links to artifacts return 404 + +**Cause**: Report references `.md.artifacts/` but role extracts to root + +**Solution**: Role automatically fixes links with regex replace: +```yaml +regexp: '{{ report_file }}\.artifacts/' +replace: '' +``` + +### Issue: Report action always "update" even after environment change + +**Symptom**: Multiple reports not showing up, only latest report visible + +**Cause**: Timezone mismatch in timestamp comparison upstream in `generate-report.yml` + +**Root Cause**: +- Test directory timestamps are in UTC: `oslat-spree-02-20260428-144048` +- Node-info timestamp is ISO8601 UTC: `2026-04-28T15:48:45Z` +- Conversion using `date -d` (without `-u`) converts to local time +- String comparison fails: `"144048" (UTC) > "114845" (EDT)` → incorrect result + +**Solution**: Ensure `generate-report.yml` uses `date -u -d` to maintain UTC: +```yaml +- name: Convert node-info UTC timestamp to UTC YYYYMMDD-HHMMSS format + ansible.builtin.shell: | + date -u -d "{{ node_info_collected_at_utc.stdout }}" +%Y%m%d-%H%M%S + register: node_info_utc_ts +``` + +**Verification**: +```bash +# Check if timestamps match timezone +ssh bastion "ls -l /home/telcov10n/telco-kpis-artifacts/spree-02/" +# Compare with node-info timestamp +ssh bastion "jq -r '.collected_at' /home/telcov10n/telco-kpis-artifacts/spree-02/node-info-spree-02.json" +``` + +**Git Commit**: `48e7606` (2026-04-28) - Critical timezone fix + +## Security Considerations + 
+1. **Vault-Based Credentials**: Never hardcode passwords - always use vault +2. **HTTP (not HTTPS)**: Current setup uses HTTP - Gitea is internal-only on bastion +3. **Embedded Credentials in Clone URL**: Credentials are embedded in Git remote URL during clone/push (temporary, in memory only) +4. **Podman Rootless**: Gitea runs as rootless container for isolation +5. **Auto-Cleanup**: Temporary git work directory is removed after publishing + +## Future Enhancements + +- [ ] Support HTTPS with self-signed certificates +- [ ] Use Git SSH authentication instead of HTTPS with embedded credentials +- [ ] Add webhook support for external notifications +- [ ] Support multiple bastions with separate Gitea instances +- [ ] Backup and restore functionality for Gitea data +- [ ] Integration with Red Hat SSO for authentication + +## References + +- **Gitea Documentation**: https://docs.gitea.io/ +- **Gitea API**: https://docs.gitea.io/en-us/api-usage/ +- **Your Prow Gitea Steps**: `~/repos/ztp-left-shifting/openshift-ci-dev/openshift/ccardeno-fork-release/ci-operator/step-registry/telcov10n/metal-single-node-spoke/gitea/` diff --git a/playbooks/telco-kpis/roles/gitea/defaults/main.yml b/playbooks/telco-kpis/roles/gitea/defaults/main.yml new file mode 100644 index 00000000..db3ca97a --- /dev/null +++ b/playbooks/telco-kpis/roles/gitea/defaults/main.yml @@ -0,0 +1,28 @@ +--- +# Gitea Configuration Defaults +gitea_container_name: gitea +gitea_http_port: 3000 +gitea_ssh_port: 2222 +gitea_domain: "bastion.kni-qe-71.telco-kpis.rdu3.redhat.com" +gitea_root_url: "http://{{ gitea_domain }}:{{ gitea_http_port }}/" + +# Admin credentials from vault (defaults to bastion user credentials) +# These are typically provided by ansible_group_bastions vault or bastion host_vars +gitea_admin_user: "{{ ansible_user | default('telcov10n') }}" +gitea_admin_password: "{{ ansible_password | default(lookup('env', 'BASTION_PASSWORD') | default('', true)) }}" +gitea_admin_email: "{{ gitea_admin_user 
}}@localhost" + +gitea_data_dir: "{{ ansible_env.HOME }}/gitea/data" +gitea_repo_name: telco-kpis-reports +gitea_repo_description: "Telco KPIs Test Reports Archive" +gitea_image: "docker.io/gitea/gitea:latest" + +# API endpoints +gitea_api_base: "{{ gitea_root_url }}api/v1" +gitea_install_url: "{{ gitea_root_url }}" + +# Development mode flag (passed from playbook) +development_mode: false + +# Repository retention policy +gitea_report_retention_days: 7 # Keep reports for last 7 days diff --git a/playbooks/telco-kpis/roles/gitea/tasks/create-repository.yml b/playbooks/telco-kpis/roles/gitea/tasks/create-repository.yml new file mode 100644 index 00000000..40e97cf8 --- /dev/null +++ b/playbooks/telco-kpis/roles/gitea/tasks/create-repository.yml @@ -0,0 +1,57 @@ +--- +# Create telco-kpis-reports repository if it doesn't exist + +- name: Check if repository exists + ansible.builtin.uri: + url: "{{ gitea_api_base }}/repos/{{ gitea_admin_user }}/{{ gitea_repo_name }}" + method: GET + user: "{{ gitea_admin_user }}" + password: "{{ gitea_admin_password }}" + force_basic_auth: true + status_code: [200, 404] + register: repo_check + +- name: Display repository status + ansible.builtin.debug: + msg: "Repository exists: {{ repo_check.status == 200 }}" + +- name: Create repository + when: repo_check.status == 404 + block: + - name: Create telco-kpis-reports repository via API + ansible.builtin.uri: + url: "{{ gitea_api_base }}/user/repos" + method: POST + user: "{{ gitea_admin_user }}" + password: "{{ gitea_admin_password }}" + force_basic_auth: true + body_format: json + body: + name: "{{ gitea_repo_name }}" + description: "{{ gitea_repo_description }}" + private: false + auto_init: true + default_branch: main + readme: "Default" + status_code: [201, 422] # 422 if already exists + register: repo_create + + - name: Display repository creation result + ansible.builtin.debug: + msg: "Repository created: {{ repo_create.status == 201 }}" + + - name: Wait for repository 
initialization + ansible.builtin.pause: + seconds: 5 + when: repo_create.status == 201 + +- name: Get repository clone URL + ansible.builtin.set_fact: + gitea_repo_http_url: "http://{{ gitea_admin_user }}:{{ gitea_admin_password }}@{{ gitea_domain }}:{{ gitea_http_port }}/{{ gitea_admin_user }}/{{ gitea_repo_name }}.git" + gitea_repo_web_url: "{{ gitea_root_url }}{{ gitea_admin_user }}/{{ gitea_repo_name }}" + +- name: Display repository URLs + ansible.builtin.debug: + msg: + - "Repository Web URL: {{ gitea_repo_web_url }}" + - "Repository Clone URL: http://{{ gitea_domain }}:{{ gitea_http_port }}/{{ gitea_admin_user }}/{{ gitea_repo_name }}.git" diff --git a/playbooks/telco-kpis/roles/gitea/tasks/deploy.yml b/playbooks/telco-kpis/roles/gitea/tasks/deploy.yml new file mode 100644 index 00000000..b50ac087 --- /dev/null +++ b/playbooks/telco-kpis/roles/gitea/tasks/deploy.yml @@ -0,0 +1,142 @@ +--- +# Deploy Gitea container on bastion if not already running + +- name: Check if Gitea is accessible via localhost + ansible.builtin.uri: + url: "http://localhost:{{ gitea_http_port }}/" + method: GET + status_code: 200 + validate_certs: false + register: gitea_accessible + failed_when: false + changed_when: false + +- name: Check if firewalld is running + ansible.builtin.systemd: + name: firewalld + register: firewalld_check + failed_when: false + become: true + +- name: Check if Gitea HTTP port is open in firewall + ansible.builtin.shell: firewall-cmd --list-ports | grep -q "{{ gitea_http_port }}/tcp" + register: http_port_open + failed_when: false + changed_when: false + become: true + when: + - firewalld_check.status is defined + - firewalld_check.status.ActiveState == "active" + +- name: Check if Gitea SSH port is open in firewall + ansible.builtin.shell: firewall-cmd --list-ports | grep -q "{{ gitea_ssh_port }}/tcp" + register: ssh_port_open + failed_when: false + changed_when: false + become: true + when: + - firewalld_check.status is defined + - 
firewalld_check.status.ActiveState == "active" + +- name: Set Gitea deployment needed flag + ansible.builtin.set_fact: + gitea_deployment_needed: "{{ (gitea_accessible.status is not defined or gitea_accessible.status != 200) or (firewalld_check.status is defined and firewalld_check.status.ActiveState == 'active' and (http_port_open.rc != 0 or ssh_port_open.rc != 0)) }}" + +- name: Display Gitea status + ansible.builtin.debug: + msg: "{{ 'Gitea is running and firewall configured - skipping deployment' if not gitea_deployment_needed else 'Gitea deployment or firewall configuration needed' }}" + +- name: Deploy Gitea + when: gitea_deployment_needed + block: + - name: Configure firewall rules + when: + - firewalld_check.status is defined + - firewalld_check.status.ActiveState == "active" + - http_port_open.rc != 0 or ssh_port_open.rc != 0 + block: + - name: Open Gitea HTTP port in firewall + ansible.posix.firewalld: + port: "{{ gitea_http_port }}/tcp" + permanent: true + state: enabled + immediate: true + become: true + + - name: Open Gitea SSH port in firewall + ansible.posix.firewalld: + port: "{{ gitea_ssh_port }}/tcp" + permanent: true + state: enabled + immediate: true + become: true + + - name: Display firewall configuration status + ansible.builtin.debug: + msg: "Firewall configured: ports 3000/tcp and 2222/tcp opened" + + - name: Deploy or redeploy Gitea container + when: gitea_accessible.status is not defined or gitea_accessible.status != 200 + block: + - name: Check if Gitea container exists + ansible.builtin.command: podman ps -a --format json + register: podman_containers + changed_when: false + + - name: Parse container list + ansible.builtin.set_fact: + gitea_container_exists: "{{ (podman_containers.stdout | from_json) | selectattr('Names', 'contains', gitea_container_name) | list | length > 0 }}" + + - name: Remove existing broken Gitea container + ansible.builtin.command: podman rm -f {{ gitea_container_name }} + when: gitea_container_exists + register: 
remove_result + changed_when: remove_result.rc == 0 + + - name: Create Gitea data directory + ansible.builtin.file: + path: "{{ gitea_data_dir }}" + state: directory + mode: '0755' + + - name: Pull Gitea container image + ansible.builtin.command: podman pull {{ gitea_image }} + register: pull_result + changed_when: "'Downloaded newer image' in pull_result.stdout or 'Copying blob' in pull_result.stderr" + + - name: Deploy Gitea container + ansible.builtin.command: > + podman run -d + --name {{ gitea_container_name }} + -p {{ gitea_http_port }}:3000 + -p {{ gitea_ssh_port }}:22 + -v {{ gitea_data_dir }}:/data:Z + -e USER_UID={{ ansible_user_uid }} + -e USER_GID={{ ansible_user_gid }} + -e GITEA__database__DB_TYPE=sqlite3 + -e GITEA__database__PATH=/data/gitea/gitea.db + -e GITEA__server__DOMAIN={{ gitea_domain }} + -e GITEA__server__ROOT_URL={{ gitea_root_url }} + -e GITEA__server__SSH_DOMAIN={{ gitea_domain }} + -e GITEA__server__SSH_PORT={{ gitea_ssh_port }} + -e GITEA__service__DISABLE_REGISTRATION=true + -e GITEA__repository__DEFAULT_BRANCH=main + --restart=always + {{ gitea_image }} + register: deploy_result + changed_when: deploy_result.rc == 0 + + - name: Wait for Gitea to start + ansible.builtin.uri: + url: "http://localhost:{{ gitea_http_port }}/" + method: GET + status_code: 200 + validate_certs: false + register: gitea_health + until: gitea_health.status == 200 + retries: 30 + delay: 2 + +- name: Display Gitea deployment status + ansible.builtin.debug: + msg: "Gitea is accessible at {{ gitea_root_url }}" diff --git a/playbooks/telco-kpis/roles/gitea/tasks/initialize.yml b/playbooks/telco-kpis/roles/gitea/tasks/initialize.yml new file mode 100644 index 00000000..f69955c2 --- /dev/null +++ b/playbooks/telco-kpis/roles/gitea/tasks/initialize.yml @@ -0,0 +1,240 @@ +--- +# Initialize Gitea if not already configured + +- name: Check if Gitea is installed (INSTALL_LOCK status) + ansible.builtin.command: podman exec -u git {{ gitea_container_name}} cat 
/data/gitea/conf/app.ini + register: gitea_config + failed_when: false + changed_when: false + +- name: Parse installation status + ansible.builtin.set_fact: + gitea_installed: "{{ 'INSTALL_LOCK = true' in gitea_config.stdout }}" + when: gitea_config.rc == 0 + +- name: Set gitea_installed to false if config doesn't exist + ansible.builtin.set_fact: + gitea_installed: false + when: gitea_config.rc != 0 + +- name: Display installation status + ansible.builtin.debug: + msg: "Gitea installed: {{ gitea_installed }}" + +- name: Complete Gitea installation via API + when: not gitea_installed + block: + - name: Get installation page CSRF token + ansible.builtin.uri: + url: "{{ gitea_install_url }}" + method: GET + return_content: true + register: install_page + + - name: Extract CSRF token + ansible.builtin.set_fact: + csrf_token: "{{ install_page.content | regex_search('name=\"_csrf\" value=\"([^\"]+)\"', '\\1') | default([None], true) | first }}" + + - name: Check if installation form is available + ansible.builtin.set_fact: + installation_needed: "{{ csrf_token is not none }}" + + - name: Display CSRF token status + ansible.builtin.debug: + msg: "{{ 'CSRF Token found - installation form available' if csrf_token is not none else 'No CSRF token - Gitea already configured from environment variables' }}" + + - name: Submit installation form + ansible.builtin.uri: + url: "{{ gitea_install_url }}" + method: POST + body_format: form-urlencoded + body: + _csrf: "{{ csrf_token }}" + db_type: SQLite3 + db_path: /data/gitea/gitea.db + app_name: "Telco KPIs Test Reports" + repo_root_path: /data/git/repositories + lfs_root_path: /data/git/lfs + run_user: git + domain: "{{ gitea_domain }}" + ssh_port: "{{ gitea_ssh_port }}" + http_port: "3000" + app_url: "{{ gitea_root_url }}" + log_root_path: /data/gitea/log + admin_name: "{{ gitea_admin_user }}" + admin_passwd: "{{ gitea_admin_password }}" + admin_confirm_passwd: "{{ gitea_admin_password }}" + admin_email: "{{ gitea_admin_email }}" 
+ follow_redirects: all + status_code: [200, 302, 303] + register: install_result + when: csrf_token is not none + + - name: Wait for installation to complete + ansible.builtin.pause: + seconds: 10 + when: csrf_token is not none + + - name: Verify installation completed + ansible.builtin.command: podman exec -u git {{ gitea_container_name }} cat /data/gitea/conf/app.ini + register: verify_config + changed_when: false + when: csrf_token is not none + + - name: Confirm INSTALL_LOCK is set + ansible.builtin.assert: + that: + - "'INSTALL_LOCK' in verify_config.stdout" + fail_msg: "Gitea installation did not complete successfully" + success_msg: "Gitea installation completed successfully" + when: csrf_token is not none + + - name: Fallback - Complete installation manually + when: csrf_token is none + block: + - name: Set INSTALL_LOCK to true in app.ini + ansible.builtin.command: > + podman exec -u git {{ gitea_container_name }} + sed -i 's/INSTALL_LOCK = false/INSTALL_LOCK = true/' /data/gitea/conf/app.ini + register: set_lock + changed_when: set_lock.rc == 0 + + - name: Run database migration + ansible.builtin.command: > + podman exec -u git {{ gitea_container_name }} + gitea migrate --config /data/gitea/conf/app.ini + register: migrate_result + changed_when: migrate_result.rc == 0 + + - name: Create admin user using Gitea CLI + ansible.builtin.command: > + podman exec -u git {{ gitea_container_name }} + gitea admin user create + --config /data/gitea/conf/app.ini + --username {{ gitea_admin_user }} + --password {{ gitea_admin_password }} + --email {{ gitea_admin_email }} + --admin + --must-change-password=false + register: create_admin_result + failed_when: false + changed_when: "'successfully created' in create_admin_result.stdout" + + - name: Restart Gitea container to apply changes + ansible.builtin.command: podman restart {{ gitea_container_name }} + register: restart_result + changed_when: restart_result.rc == 0 + + - name: Wait for Gitea to restart + 
ansible.builtin.uri: + url: "http://localhost:{{ gitea_http_port }}/" + method: GET + status_code: 200 + register: gitea_health + until: gitea_health.status == 200 + retries: 30 + delay: 2 + + - name: Display admin user creation result + ansible.builtin.debug: + msg: "{{ 'Admin user created and Gitea restarted' if create_admin_result.rc == 0 else 'Admin user may already exist (non-fatal)' }}" + +- name: Ensure admin user exists + block: + - name: Check if admin user exists via API + ansible.builtin.uri: + url: "{{ gitea_api_base }}/users/{{ gitea_admin_user }}" + method: GET + status_code: [200, 404] + register: user_check + failed_when: false + + - name: Create admin user if not exists + when: user_check.status == 404 + block: + - name: Check INSTALL_LOCK status + ansible.builtin.command: > + podman exec -u git {{ gitea_container_name }} + grep INSTALL_LOCK /data/gitea/conf/app.ini + register: install_lock_check + changed_when: false + + - name: Complete installation if needed + when: "'INSTALL_LOCK = false' in install_lock_check.stdout" + block: + - name: Set INSTALL_LOCK to true + ansible.builtin.command: > + podman exec -u git {{ gitea_container_name }} + sed -i 's/INSTALL_LOCK = false/INSTALL_LOCK = true/' /data/gitea/conf/app.ini + changed_when: true + + - name: Run database migration + ansible.builtin.command: > + podman exec -u git {{ gitea_container_name }} + gitea migrate --config /data/gitea/conf/app.ini + changed_when: true + + - name: Create admin user via CLI + ansible.builtin.command: > + podman exec -u git {{ gitea_container_name }} + gitea admin user create + --config /data/gitea/conf/app.ini + --username {{ gitea_admin_user }} + --password {{ gitea_admin_password }} + --email {{ gitea_admin_email }} + --admin + --must-change-password=false + register: create_admin_cli + failed_when: false + changed_when: "'successfully created' in create_admin_cli.stdout" + + - name: Restart Gitea if user was created + ansible.builtin.command: podman restart {{ 
gitea_container_name }} + when: create_admin_cli.changed + changed_when: true + + - name: Wait for Gitea to restart + ansible.builtin.uri: + url: "http://localhost:{{ gitea_http_port }}/" + method: GET + status_code: 200 + register: gitea_health + until: gitea_health.status == 200 + retries: 30 + delay: 2 + when: create_admin_cli.changed + + - name: Display admin user status + ansible.builtin.debug: + msg: "{{ 'Admin user exists' if user_check.status == 200 else 'Admin user created via CLI' }}" + +- name: Create API token for automation + when: gitea_api_token is not defined or gitea_api_token == "" + block: + - name: Generate API token via HTTP Basic Auth + ansible.builtin.uri: + url: "{{ gitea_api_base }}/users/{{ gitea_admin_user }}/tokens" + method: POST + user: "{{ gitea_admin_user }}" + password: "{{ gitea_admin_password }}" + force_basic_auth: true + body_format: json + body: + name: "ansible-automation-{{ ansible_date_time.epoch }}" + scopes: + - write:repository + - write:user + - write:organization + status_code: [201, 422] # 422 if token with same name exists + register: token_result + + - name: Extract API token + ansible.builtin.set_fact: + gitea_api_token: "{{ token_result.json.sha1 }}" + when: + - token_result.status == 201 + - token_result.json.sha1 is defined + + - name: Display API token status + ansible.builtin.debug: + msg: "API Token: {{ 'Created' if token_result.status == 201 else 'Already exists or failed' }}" diff --git a/playbooks/telco-kpis/roles/gitea/tasks/main.yml b/playbooks/telco-kpis/roles/gitea/tasks/main.yml new file mode 100644 index 00000000..748be94f --- /dev/null +++ b/playbooks/telco-kpis/roles/gitea/tasks/main.yml @@ -0,0 +1,26 @@ +--- +# Main entry point for Gitea role +# Handles deployment, initialization, and report publishing + +- name: Validate credentials from vault + ansible.builtin.include_tasks: validate-credentials.yml + when: development_mode | default(false) | bool + +- name: Include deployment tasks + 
ansible.builtin.include_tasks: deploy.yml + when: development_mode | default(false) | bool + +- name: Include initialization tasks + ansible.builtin.include_tasks: initialize.yml + when: development_mode | default(false) | bool + +- name: Include repository creation tasks + ansible.builtin.include_tasks: create-repository.yml + when: development_mode | default(false) | bool + +- name: Include report publishing tasks + ansible.builtin.include_tasks: publish-report.yml + when: + - development_mode | default(false) | bool + - report_file is defined + - report_tarball is defined diff --git a/playbooks/telco-kpis/roles/gitea/tasks/publish-report.yml b/playbooks/telco-kpis/roles/gitea/tasks/publish-report.yml new file mode 100644 index 00000000..e3ec2378 --- /dev/null +++ b/playbooks/telco-kpis/roles/gitea/tasks/publish-report.yml @@ -0,0 +1,261 @@ +--- +# Publish test report and artifacts to Gitea repository + +- name: Set report publication variables + ansible.builtin.set_fact: + report_date: "{{ ansible_date_time.date }}" + report_timestamp: "{{ ansible_date_time.iso8601_basic_short }}" + git_work_dir: "/tmp/gitea-publish-{{ ansible_date_time.epoch }}" + report_dir_name: "{{ ansible_date_time.date }}" + +- name: Display publication info + ansible.builtin.debug: + msg: + - "Publishing report: {{ report_file }}" + - "Artifacts tarball: {{ report_tarball }}" + - "Spoke cluster: {{ spoke_cluster }}" + - "Report date: {{ report_date }}" + - "Report action: {{ report_action | default('update') | upper }}" + - "{{ 'Creating NEW report (environment changed)' if report_action == 'new' else 'Updating EXISTING report (environment stable)' }}" + +- name: Create temporary work directory + ansible.builtin.file: + path: "{{ git_work_dir }}" + state: directory + mode: '0755' + +- name: Clone repository + ansible.builtin.git: + repo: "{{ gitea_repo_http_url }}" + dest: "{{ git_work_dir }}" + version: main + force: true + environment: + GIT_TERMINAL_PROMPT: "0" + +- name: Handle report 
update (remove existing report for today)
+  when: report_action | default('update') == 'update'
+  block:
+    - name: Check if today's report directory exists
+      ansible.builtin.stat:
+        path: "{{ git_work_dir }}/reports/{{ report_dir_name }}/{{ spoke_cluster }}"
+      register: today_report_dir
+
+    - name: Find existing reports for today
+      ansible.builtin.find:
+        paths: "{{ git_work_dir }}/reports/{{ report_dir_name }}/{{ spoke_cluster }}"
+        patterns: "telco-kpis-report-{{ spoke_cluster }}-*.md"
+      register: existing_reports
+      # .isdir only exists when the path exists; 'and' short-circuits safely
+      when: today_report_dir.stat.exists and today_report_dir.stat.isdir
+
+    - name: Remove existing report files (update mode)
+      ansible.builtin.file:
+        path: "{{ item.path }}"
+        state: absent
+      # default([]) guards the loop expression itself: when the find task
+      # above was skipped, its registered result has no 'files' attribute,
+      # and a loop template is rendered BEFORE any 'when' condition is
+      # evaluated — so without the default this task errors out.
+      loop: "{{ existing_reports.files | default([]) }}"
+
+    - name: Display update action message
+      ansible.builtin.debug:
+        msg: "{{ 'Removed ' + (existing_reports.files | length | string) + ' existing report(s) for update' if (existing_reports is defined and existing_reports.files is defined and existing_reports.files | length > 0) else 'No existing reports to remove' }}"
+      when: existing_reports is defined
+
+- name: Display new report action message
+  ansible.builtin.debug:
+    msg: "Creating new report (environment configuration changed - keeping all historical reports)"
+  when: report_action | default('update') == 'new'
+
+# Prune date directories older than the configured retention window so the
+# repository does not grow without bound.
+- name: Apply retention policy to Gitea repository
+  when: gitea_report_retention_days is defined and gitea_report_retention_days | int > 0
+  block:
+    - name: Find all date directories in reports
+      ansible.builtin.find:
+        paths: "{{ git_work_dir }}/reports"
+        file_type: directory
+        recurse: false  # canonical boolean instead of truthy 'no'
+        patterns: "????-??-??"
+ register: date_directories + + - name: Sort date directories and identify old ones for deletion + ansible.builtin.set_fact: + sorted_dates: "{{ date_directories.files | map(attribute='path') | map('basename') | sort | reverse | list }}" + + - name: Determine directories to delete (keep only last N days) + ansible.builtin.set_fact: + dirs_to_delete: "{{ sorted_dates[gitea_report_retention_days | int:] }}" + + - name: Display retention policy info + ansible.builtin.debug: + msg: + - "Retention policy: Keep last {{ gitea_report_retention_days }} days" + - "Total date directories: {{ sorted_dates | length }}" + - "Directories to keep: {{ sorted_dates[:gitea_report_retention_days | int] | join(', ') }}" + - "Directories to delete: {{ dirs_to_delete | join(', ') if dirs_to_delete | length > 0 else 'None' }}" + + - name: Remove old report directories beyond retention period + ansible.builtin.file: + path: "{{ git_work_dir }}/reports/{{ item }}" + state: absent + loop: "{{ dirs_to_delete }}" + when: dirs_to_delete | length > 0 + +- name: Create report directory structure + ansible.builtin.file: + path: "{{ git_work_dir }}/reports/{{ report_dir_name }}/{{ spoke_cluster }}" + state: directory + mode: '0755' + +- name: Copy report markdown file + ansible.builtin.copy: + src: "{{ lookup('env', 'ARTIFACT_DIR') | default('/artifacts', true) }}/reports/{{ report_file }}" + dest: "{{ git_work_dir }}/reports/{{ report_dir_name }}/{{ spoke_cluster }}/{{ report_file }}" + mode: '0644' + +- name: Extract artifacts tarball + ansible.builtin.unarchive: + src: "{{ lookup('env', 'ARTIFACT_DIR') | default('/artifacts', true) }}/reports/{{ report_tarball }}" + dest: "{{ git_work_dir }}/reports/{{ report_dir_name }}/{{ spoke_cluster }}/" + remote_src: false + +- name: Find extracted artifact directories + ansible.builtin.find: + paths: "{{ git_work_dir }}/reports/{{ report_dir_name }}/{{ spoke_cluster }}" + file_type: directory + patterns: "*-{{ spoke_cluster }}-*" + register: 
artifact_dirs + +- name: Keep only latest test run and rename to remove timestamps + ansible.builtin.shell: + cmd: | + cd "{{ git_work_dir }}/reports/{{ report_dir_name }}/{{ spoke_cluster }}" + + # Pattern: {test_name}-{spoke_cluster}-{YYYYMMDD}-{HHMMSS} + # Goal: Keep only the latest run per test_name, rename it to just {test_name} + + # First, collect all directories and group by test name + declare -A latest_dirs + declare -A latest_timestamps + + for dir in *-{{ spoke_cluster }}-*; do + if [ ! -d "$dir" ]; then + continue + fi + + # Extract test name and timestamp + test_name=$(echo "$dir" | sed -E 's/-{{ spoke_cluster }}-[0-9]{8}-[0-9]{6}$//') + timestamp=$(echo "$dir" | grep -oE '[0-9]{8}-[0-9]{6}$') + + # Track latest run for each test name + if [ -z "${latest_timestamps[$test_name]}" ] || [ "$timestamp" \> "${latest_timestamps[$test_name]}" ]; then + # If we already had a latest, mark it for deletion + if [ -n "${latest_dirs[$test_name]}" ]; then + echo "Deleting older run: ${latest_dirs[$test_name]}" + rm -rf "${latest_dirs[$test_name]}" + fi + latest_dirs[$test_name]="$dir" + latest_timestamps[$test_name]="$timestamp" + else + # This is an older run, delete it + echo "Deleting older run: $dir" + rm -rf "$dir" + fi + done + + # Now rename the latest runs to remove timestamps + for test_name in "${!latest_dirs[@]}"; do + dir="${latest_dirs[$test_name]}" + if [ "$dir" != "$test_name" ]; then + # Remove target directory if it exists (from previous tarball extraction) + if [ -d "$test_name" ]; then + echo "Removing existing directory: $test_name" + rm -rf "$test_name" + fi + echo "Renaming latest run: $dir -> $test_name" + mv "$dir" "$test_name" + fi + done + executable: /bin/bash + when: artifact_dirs.files | length > 0 + changed_when: true + +- name: Fix markdown links to point to extracted artifacts + ansible.builtin.replace: + path: "{{ git_work_dir }}/reports/{{ report_dir_name }}/{{ spoke_cluster }}/{{ report_file }}" + regexp: '{{ report_file 
}}\.artifacts/' + replace: '' + +- name: Find all report markdown files + ansible.builtin.find: + paths: "{{ git_work_dir }}/reports" + patterns: "*.md" + recurse: true + register: found_reports + +- name: Build report list for README + ansible.builtin.set_fact: + report_list: "{{ found_reports.files | map(attribute='path') | map('regex_replace', git_work_dir + '/reports/', '') | list }}" + +- name: Update top-level README.md + ansible.builtin.template: + src: README.md.j2 + dest: "{{ git_work_dir }}/README.md" + mode: '0644' + +- name: Configure git user + ansible.builtin.command: "{{ item }}" + args: + chdir: "{{ git_work_dir }}" + loop: + - git config user.email "{{ gitea_admin_email }}" + - git config user.name "Telco KPIs Automation" + changed_when: false + +- name: Add all files to git + ansible.builtin.command: git add -A + args: + chdir: "{{ git_work_dir }}" + changed_when: false + +- name: Check if there are changes to commit + ansible.builtin.command: git diff --cached --exit-code + args: + chdir: "{{ git_work_dir }}" + register: git_diff + failed_when: false + changed_when: git_diff.rc != 0 + +- name: Commit changes + ansible.builtin.command: > + git commit -m "Add test report for {{ spoke_cluster }} - {{ report_date }}" + args: + chdir: "{{ git_work_dir }}" + when: git_diff.rc != 0 + changed_when: true + +- name: Push to Gitea + ansible.builtin.command: git push origin main + args: + chdir: "{{ git_work_dir }}" + when: git_diff.rc != 0 + environment: + GIT_TERMINAL_PROMPT: "0" + changed_when: true + +- name: Cleanup temporary work directory + ansible.builtin.file: + path: "{{ git_work_dir }}" + state: absent + +- name: Display publication success + ansible.builtin.debug: + msg: + - "==========================================" + - "Report Published Successfully" + - "==========================================" + - "Spoke: {{ spoke_cluster }}" + - "Report: {{ report_file }}" + - "Web URL: {{ gitea_repo_web_url }}/src/branch/main/reports/{{ report_dir_name 
}}/{{ spoke_cluster }}" + - "==========================================" diff --git a/playbooks/telco-kpis/roles/gitea/tasks/validate-credentials.yml b/playbooks/telco-kpis/roles/gitea/tasks/validate-credentials.yml new file mode 100644 index 00000000..7d9fb85d --- /dev/null +++ b/playbooks/telco-kpis/roles/gitea/tasks/validate-credentials.yml @@ -0,0 +1,38 @@ +--- +# Validate that Gitea admin credentials are available from vault + +- name: Display credential source + ansible.builtin.debug: + msg: + - "==========================================" + - "Gitea Admin Credentials" + - "==========================================" + - "Username: {{ gitea_admin_user }}" + - "Password: {{ '*' * (gitea_admin_password | length) if gitea_admin_password else '[NOT SET]' }}" + - "Email: {{ gitea_admin_email }}" + - "Source: Vault (ansible_user/ansible_password)" + - "==========================================" + +- name: Validate admin username is set + ansible.builtin.assert: + that: + - gitea_admin_user is defined + - gitea_admin_user | length > 0 + fail_msg: "Gitea admin username not found. Ensure bastion vault provides 'ansible_user' variable." + success_msg: "Gitea admin username: {{ gitea_admin_user }}" + +- name: Validate admin password is set + ansible.builtin.assert: + that: + - gitea_admin_password is defined + - gitea_admin_password | length > 0 + fail_msg: | + Gitea admin password not found in vault. + + Expected vault variable: 'ansible_password' from bastion vault. + + To fix: + 1. Ensure bastion vault (ansible_group_bastions or host_vars) contains 'ansible_password' + 2. OR set gitea_admin_password explicitly in extra vars + 3. 
OR set environment variable BASTION_PASSWORD
+    success_msg: "Gitea admin password retrieved from vault"
diff --git a/playbooks/telco-kpis/roles/gitea/templates/README.md.j2 b/playbooks/telco-kpis/roles/gitea/templates/README.md.j2
new file mode 100644
index 00000000..32421df5
--- /dev/null
+++ b/playbooks/telco-kpis/roles/gitea/templates/README.md.j2
@@ -0,0 +1,46 @@
+# Telco KPIs Test Reports Archive
+
+This repository contains automated test reports from Telco KPIs verification jobs.
+
+## Latest Reports
+
+{% set ns = namespace(reports=[]) %}
+{% for report_path in report_list | sort(reverse=True) %}
+{% set parts = report_path.split('/') %}
+{% if parts | length == 3 %}
+{% set date = parts[0] %}
+{% set spoke = parts[1] %}
+{% set filename = parts[2] %}
+{% set _ = ns.reports.append({'date': date, 'spoke': spoke, 'file': filename, 'path': 'reports/' + report_path}) %}
+{% endif %}
+{% endfor %}
+
+| Date | Spoke Cluster | Report | Artifacts |
+|------|---------------|--------|-----------|
+{% for report in ns.reports %}
+| {{ report.date }} | {{ report.spoke }} | [{{ report.file }}]({{ report.path }}) | [Browse](reports/{{ report.date }}/{{ report.spoke }}/) |
+{% endfor %}
+
+## Report Structure
+
+```
+reports/
+└── YYYY-MM-DD/
+    └── <spoke_cluster>/
+        ├── <spoke_cluster>.md      # Main report (Markdown)
+        ├── oslat/                  # OSLAT test artifacts
+        ├── ptp/                    # PTP test artifacts
+        ├── cyclictest/             # Cyclictest artifacts
+        ├── cpu-util/               # CPU utilization artifacts
+        ├── reboot/                 # Reboot test artifacts
+        ├── rfc2544/                # RFC2544 network test artifacts
+        ├── rds-compare/            # RDS compare artifacts
+        └── node-info-<node>.json   # Hardware metadata
+```
+
+## About
+
+These reports are automatically generated by the Telco KPIs CI/CD pipeline and published via Ansible automation.
+ +**Generated**: {{ ansible_date_time.iso8601 }} +**Repository**: {{ gitea_repo_web_url }} diff --git a/playbooks/telco-kpis/roles/lockdown_hub_config/README.md b/playbooks/telco-kpis/roles/lockdown_hub_config/README.md new file mode 100644 index 00000000..6ab19a79 --- /dev/null +++ b/playbooks/telco-kpis/roles/lockdown_hub_config/README.md @@ -0,0 +1,174 @@ +# lockdown_hub_config Role + +## Purpose + +Parses hub lockdown JSON and sets hub deployment configuration facts for Telco-KPIs testing reproducibility. + +## Description + +This role downloads and parses hub lockdown JSON to extract exact hub platform component versions: +- Hub OCP release image (`hub.ocp.pull_spec`) +- Hub OCP version (`hub.ocp.major_version`, `hub.ocp.minor_version`) +- ACM/MCE configuration (`hub.acm.*`) +- TALM operator catalog index (`hub.talm.pull_spec`) +- GitOps operator catalog index (`hub.gitops.pull_spec`) + +The role is designed to integrate with the parametrized hub lockdown approach, allowing optional lockdown enforcement without breaking existing workflows. 
+ +## Requirements + +- `jq` package installed on bastion (for JSON validation) +- Access to hub lockdown JSON file (via HTTP/HTTPS URL) + +## Role Variables + +### Input Variables + +| Variable | Required | Default | Description | +|----------|----------|---------|-------------| +| `hub_lockdown_uri` | No | `""` | URL to hub lockdown JSON file | + +### Output Facts + +When `hub_lockdown_uri` is provided, the role sets the following facts: + +| Fact | Description | Example | +|------|-------------|---------| +| `use_hub_lockdown` | Whether hub lockdown is enabled | `true` | +| `hub_ocp_pull_spec` | Exact hub OCP release image | `quay.io/openshift-release-dev/ocp-release:4.20.4-x86_64` | +| `hub_ocp_major_version` | Hub OCP major version | `4` | +| `hub_ocp_minor_version` | Hub OCP minor version | `20` | +| `hub_acm_config` | ACM configuration dict | `{"version_override": "2.15", "acm_override": "v2.15.0", ...}` | +| `hub_talm_pull_spec` | TALM operator catalog index | `registry.redhat.io/redhat/redhat-operator-index:v4.20` | +| `hub_gitops_pull_spec` | GitOps operator catalog index | `registry.redhat.io/redhat/redhat-operator-index:v4.20` | + +## Dependencies + +None + +## Example Playbook + +### Basic Usage (Hub OCP Deployment) + +```yaml +--- +- name: Deploy hub with optional lockdown + hosts: bastion + tasks: + # Parse hub lockdown if provided + - name: Parse hub lockdown JSON + ansible.builtin.include_role: + name: lockdown_hub_config + vars: + hub_lockdown_uri: "{{ lookup('env', 'HUB_LOCKDOWN_URI') | default('', true) }}" + when: lookup('env', 'HUB_LOCKDOWN_URI') | length > 0 + + # Override release variable if hub lockdown is active + - name: Set OCP release from hub lockdown + ansible.builtin.set_fact: + release: "{{ hub_ocp_pull_spec }}" + when: use_hub_lockdown | default(false) + + # Rest of deployment playbook... 
+``` + +### With Hub Operators + +```yaml +--- +- name: Install hub operators with lockdown + hosts: bastion + tasks: + - name: Parse hub lockdown JSON + ansible.builtin.include_role: + name: lockdown_hub_config + vars: + hub_lockdown_uri: "{{ hub_lockdown_uri }}" + when: hub_lockdown_uri is defined and hub_lockdown_uri | length > 0 + + - name: Install ACM with lockdown config + ansible.builtin.include_role: + name: ocp_operator_deployment + vars: + operator_name: "advanced-cluster-management" + acm_version_override: "{{ hub_acm_config.version_override | default('') }}" + acm_catalog_override: "{{ hub_acm_config.acm_override | default('') }}" + when: use_hub_lockdown | default(false) +``` + +## Hub Lockdown JSON Format + +Example hub lockdown JSON structure: + +```json +{ + "hub": { + "ocp": { + "major_version": "4", + "minor_version": "20", + "pull_spec": "quay.io/openshift-release-dev/ocp-release:4.20.4-x86_64" + }, + "acm": { + "version_override": "2.15", + "acm_override": "v2.15.0", + "mce_override": "v2.10.0", + "iib_or_snapshot": "konflux" + }, + "talm": { + "pull_spec": "registry.redhat.io/redhat/redhat-operator-index:v4.20" + }, + "gitops": { + "pull_spec": "registry.redhat.io/redhat/redhat-operator-index:v4.20" + } + } +} +``` + +## Backward Compatibility + +This role is **100% backward compatible**: +- If `hub_lockdown_uri` is empty or not provided → role skips processing +- Existing playbooks continue working without modification +- Hub lockdown is **opt-in** via parameter + +## Error Handling + +The role performs the following validations: +1. **Download validation**: Fails if hub lockdown JSON cannot be downloaded (with 3 retries) +2. **JSON structure validation**: Fails if `.hub.ocp` section is missing +3. 
**Graceful defaults**: Optional fields (TALM, GitOps) default to empty string if missing + +## Testing + +### Test with existing lockdown file + +```bash +# Export hub lockdown URI +export HUB_LOCKDOWN_URI="https://gitlab.cee.redhat.com/ran/dev-kpi-pipeline/-/raw/main/lockdown-hub-x86_64.json" + +# Run deploy-ocp-sno playbook +ansible-playbook playbooks/deploy-ocp-sno.yml \ + -i inventories/ocp-deployment/build-inventory.py \ + --extra-vars "cluster_name=kni-qe-71 release=4.21" + +# Verify hub_ocp_pull_spec is used instead of release parameter +``` + +### Test without lockdown (backward compatibility) + +```bash +# No HUB_LOCKDOWN_URI set +ansible-playbook playbooks/deploy-ocp-sno.yml \ + -i inventories/ocp-deployment/build-inventory.py \ + --extra-vars "cluster_name=kni-qe-71 release=4.21" + +# Should use release=4.21 (current behavior) +``` + +## License + +See repository LICENSE file. + +## Author Information + +Telco Verification CI/CD Team diff --git a/playbooks/telco-kpis/roles/lockdown_hub_config/defaults/main.yml b/playbooks/telco-kpis/roles/lockdown_hub_config/defaults/main.yml new file mode 100644 index 00000000..5081171c --- /dev/null +++ b/playbooks/telco-kpis/roles/lockdown_hub_config/defaults/main.yml @@ -0,0 +1,9 @@ +--- +# playbooks/telco-kpis/roles/lockdown_hub_config/defaults/main.yml + +# URL to hub lockdown JSON file +# Example: https://raw.githubusercontent.com/telcov10n-ci/telco-kpis-operator-lockdown/main/hub/lockdown-hub-x86_64.json +hub_lockdown_uri: "" + +# Whether to use hub lockdown (set automatically based on hub_lockdown_uri) +use_hub_lockdown: false diff --git a/playbooks/telco-kpis/roles/lockdown_hub_config/tasks/main.yml b/playbooks/telco-kpis/roles/lockdown_hub_config/tasks/main.yml new file mode 100644 index 00000000..2eb2ab10 --- /dev/null +++ b/playbooks/telco-kpis/roles/lockdown_hub_config/tasks/main.yml @@ -0,0 +1,84 @@ +--- +# playbooks/telco-kpis/roles/lockdown_hub_config/tasks/main.yml +# +# Purpose: Parse hub lockdown JSON 
and set hub deployment configuration
+#
+# This role downloads and parses hub lockdown JSON to extract:
+# - hub.ocp.pull_spec: Exact OCP release image for hub deployment
+# - hub.ocp.major_version, hub.ocp.minor_version: Hub OCP version
+# - hub.acm.*: ACM/MCE configuration for operator deployment
+# - hub.talm.pull_spec, hub.gitops.pull_spec: Operator catalog indexes
+#
+# Usage:
+#   - name: Parse hub lockdown if provided
+#     ansible.builtin.include_role:
+#       name: lockdown_hub_config
+#     vars:
+#       hub_lockdown_uri: "{{ hub_lockdown_uri | default('', true) }}"
+#     when: hub_lockdown_uri is defined and hub_lockdown_uri | length > 0

+- name: Check if hub lockdown URI is provided
+  ansible.builtin.set_fact:
+    use_hub_lockdown: "{{ hub_lockdown_uri is defined and hub_lockdown_uri | length > 0 }}"
+
+- name: Display hub lockdown status
+  ansible.builtin.debug:
+    msg: "Hub lockdown enforcement: {{ 'ENABLED' if use_hub_lockdown else 'DISABLED' }}"
+
+- name: Process hub lockdown
+  when: use_hub_lockdown | bool
+  block:
+    - name: Download hub lockdown JSON
+      ansible.builtin.get_url:
+        url: "{{ hub_lockdown_uri }}"
+        dest: /tmp/lockdown-hub.json
+        mode: '0644'
+        force: true
+      register: hub_lockdown_download
+      # 'retries' only takes effect together with 'until'. The previous
+      # 'failed_when: false' also forced .failed to false, so the explicit
+      # fail task below could never trigger; 'ignore_errors' keeps the play
+      # going while preserving the failure status for that check.
+      until: hub_lockdown_download is succeeded
+      retries: 3
+      delay: 5
+      ignore_errors: true
+
+    - name: Fail if hub lockdown download failed
+      ansible.builtin.fail:
+        msg: "Failed to download hub lockdown JSON from {{ hub_lockdown_uri }}"
+      when: hub_lockdown_download is failed
+
+    - name: Validate hub lockdown JSON structure
+      ansible.builtin.command:
+        cmd: jq -e '.hub.ocp' /tmp/lockdown-hub.json
+      changed_when: false
+      register: hub_lockdown_validate
+      failed_when: hub_lockdown_validate.rc != 0
+
+    - name: Read hub lockdown JSON from bastion
+      ansible.builtin.slurp:
+        src: /tmp/lockdown-hub.json
+      register: hub_lockdown_content
+
+    - name: Parse hub lockdown JSON
+      ansible.builtin.set_fact:
+        hub_lockdown_data: "{{ hub_lockdown_content['content'] | b64decode | from_json }}"
+
+    - name: Set hub
deployment configuration from lockdown + ansible.builtin.set_fact: + hub_ocp_pull_spec: "{{ hub_lockdown_data.hub.ocp.pull_spec }}" + hub_ocp_major_version: "{{ hub_lockdown_data.hub.ocp.major_version }}" + hub_ocp_minor_version: "{{ hub_lockdown_data.hub.ocp.minor_version }}" + hub_acm_config: "{{ hub_lockdown_data.hub.acm | default({}) }}" + hub_talm_pull_spec: "{{ hub_lockdown_data.hub.talm.pull_spec | default('') }}" + hub_gitops_pull_spec: "{{ hub_lockdown_data.hub.gitops.pull_spec | default('') }}" + + - name: Display hub lockdown configuration + ansible.builtin.debug: + msg: + - "==========================================" + - "Hub Lockdown Enforcement: ENABLED" + - "==========================================" + - "Hub Lockdown URI: {{ hub_lockdown_uri }}" + - "Hub OCP Pull Spec: {{ hub_ocp_pull_spec }}" + - "Hub OCP Version: {{ hub_ocp_major_version }}.{{ hub_ocp_minor_version }}" + - "Hub ACM Config: {{ hub_acm_config }}" + - "Hub TALM Pull Spec: {{ hub_talm_pull_spec | default('(not set)') }}" + - "Hub GitOps Pull Spec: {{ hub_gitops_pull_spec | default('(not set)') }}" + - "==========================================" diff --git a/playbooks/telco-kpis/run-test.yml b/playbooks/telco-kpis/run-test.yml new file mode 100644 index 00000000..ab7b0da1 --- /dev/null +++ b/playbooks/telco-kpis/run-test.yml @@ -0,0 +1,68 @@ +--- +# Generic test runner for Telco-KPIs spoke clusters +# Accepts test_name parameter and delegates to specific test task +# +# Usage: +# ansible-playbook run-test.yml \ +# -e test_name=oslat \ +# -e spoke_cluster=spree-02 \ +# -e hub_kubeconfig=/path/to/hub/kubeconfig \ +# -e spoke_kubeconfig=/path/to/spoke/kubeconfig + +- name: Run test on spoke cluster + hosts: bastion + gather_facts: false + + vars: + valid_tests: + - cpu_util + - cyclictest + - oslat + - ptp + - reboot + - rfc2544 + + tasks: + - name: Validate test_name parameter + ansible.builtin.assert: + that: + - test_name is defined + - test_name in valid_tests + fail_msg: 
"test_name must be one of: {{ valid_tests | join(', ') }}" + success_msg: "Running test: {{ test_name }}" + + - name: Validate spoke_cluster parameter + ansible.builtin.assert: + that: + - spoke_cluster is defined + - spoke_cluster | length > 0 + fail_msg: "spoke_cluster parameter is required" + success_msg: "Target spoke cluster: {{ spoke_cluster }}" + + - name: Validate hub_kubeconfig parameter + ansible.builtin.assert: + that: + - hub_kubeconfig is defined + - hub_kubeconfig | length > 0 + fail_msg: "hub_kubeconfig parameter is required" + success_msg: "Hub kubeconfig: {{ hub_kubeconfig }}" + + - name: Validate spoke_kubeconfig parameter + ansible.builtin.assert: + that: + - spoke_kubeconfig is defined + - spoke_kubeconfig | length > 0 + fail_msg: "spoke_kubeconfig parameter is required" + success_msg: "Spoke kubeconfig: {{ spoke_kubeconfig }}" + + - name: Execute test-specific tasks + ansible.builtin.include_tasks: "tasks/run-{{ test_name }}-test.yml" + + - name: Test execution summary + ansible.builtin.debug: + msg: + - "Test execution completed successfully" + - "Test: {{ test_name }}" + - "Spoke cluster: {{ spoke_cluster }}" + - "Hub kubeconfig: {{ hub_kubeconfig }}" + - "Spoke kubeconfig: {{ spoke_kubeconfig }}" diff --git a/playbooks/telco-kpis/tasks/run-cpu_util-test.yml b/playbooks/telco-kpis/tasks/run-cpu_util-test.yml new file mode 100644 index 00000000..d4fcdbe3 --- /dev/null +++ b/playbooks/telco-kpis/tasks/run-cpu_util-test.yml @@ -0,0 +1,77 @@ +--- +# CPU utilization test task for Telco-KPIs +# TODO: Implement actual cpu_util test logic +# For now: validates connection and cluster access + +- name: Display bastion hostname information + ansible.builtin.command: hostnamectl + register: hostnamectl_output + changed_when: false + +- name: Show hostnamectl output + ansible.builtin.debug: + var: hostnamectl_output.stdout_lines + +- name: Verify hub kubeconfig file exists + ansible.builtin.stat: + path: "{{ hub_kubeconfig }}" + register: 
hub_kubeconfig_stat + +- name: Assert hub kubeconfig exists + ansible.builtin.assert: + that: hub_kubeconfig_stat.stat.exists + fail_msg: "Hub kubeconfig file not found: {{ hub_kubeconfig }}" + success_msg: "Hub kubeconfig exists: {{ hub_kubeconfig }}" + +- name: Verify spoke kubeconfig file exists + ansible.builtin.stat: + path: "{{ spoke_kubeconfig }}" + register: spoke_kubeconfig_stat + +- name: Assert spoke kubeconfig exists + ansible.builtin.assert: + that: spoke_kubeconfig_stat.stat.exists + fail_msg: "Spoke kubeconfig file not found: {{ spoke_kubeconfig }}" + success_msg: "Spoke kubeconfig exists: {{ spoke_kubeconfig }}" + +- name: Get hub cluster nodes + ansible.builtin.command: > + oc --kubeconfig {{ hub_kubeconfig }} get nodes -o wide + register: hub_nodes_output + changed_when: false + +- name: Get hub cluster version + ansible.builtin.command: > + oc --kubeconfig {{ hub_kubeconfig }} get clusterversion -o wide + register: hub_version_output + changed_when: false + +- name: Display hub cluster information + ansible.builtin.debug: + msg: + - "=== HUB CLUSTER NODES ===" + - "{{ hub_nodes_output.stdout }}" + - "" + - "=== HUB CLUSTER VERSION ===" + - "{{ hub_version_output.stdout }}" + +- name: Get spoke cluster nodes + ansible.builtin.command: > + oc --kubeconfig {{ spoke_kubeconfig }} get nodes -o wide + register: spoke_nodes_output + changed_when: false + +- name: Get spoke cluster version + ansible.builtin.command: > + oc --kubeconfig {{ spoke_kubeconfig }} get clusterversion -o wide + register: spoke_version_output + changed_when: false + +- name: Display spoke cluster information + ansible.builtin.debug: + msg: + - "=== SPOKE CLUSTER ({{ spoke_cluster }}) NODES ===" + - "{{ spoke_nodes_output.stdout }}" + - "" + - "=== SPOKE CLUSTER VERSION ===" + - "{{ spoke_version_output.stdout }}" diff --git a/playbooks/telco-kpis/tasks/run-cyclictest-test.yml b/playbooks/telco-kpis/tasks/run-cyclictest-test.yml new file mode 100644 index 00000000..f34f9e57 --- 
/dev/null +++ b/playbooks/telco-kpis/tasks/run-cyclictest-test.yml @@ -0,0 +1,116 @@ +--- +# Run Cyclictest latency test +# Test: Cyclictest (Real-time latency measurement) +# Purpose: Measure RT kernel latency on isolated CPUs with PerformanceProfile +# Duration: Configurable via duration parameter (default: 12h) + +- name: Ensure test-runner container image exists on bastion + ansible.builtin.import_tasks: setup-test-runner.yml + +- name: Create artifacts directory on bastion + ansible.builtin.tempfile: + state: directory + suffix: .cyclictest-{{ spoke_cluster }} + path: /tmp + register: artifacts_dir + +- name: Display test configuration + ansible.builtin.debug: + msg: + - "==========================================" + - "Cyclictest Test Configuration" + - "==========================================" + - "Spoke Cluster: {{ spoke_cluster }}" + - "Kubeconfig: {{ spoke_kubeconfig }}" + - "Duration: {{ duration | default(telco_kpis_default_duration_cyclictest) }}" + - "Artifacts: {{ artifacts_dir.path }}" + - "==========================================" + +- name: Ensure kubeconfig is readable by podman container + ansible.builtin.file: + path: "{{ spoke_kubeconfig }}" + mode: '0644' + +- name: Display monitoring instructions for real-time log viewing + ansible.builtin.debug: + msg: + - "==========================================" + - "To monitor test execution in real-time:" + - "==========================================" + - "SSH to bastion: ssh {{ ansible_user }}@{{ ansible_host }}" + - "Tail the log: tail -f {{ artifacts_dir.path }}/podman-run.log" + - "" + - "Or from local:" + - "ssh {{ ansible_user }}@{{ ansible_host }} 'tail -f {{ artifacts_dir.path }}/podman-run.log'" + - "==========================================" + - "Container name: telco-kpis-cyclictest-{{ spoke_cluster }}" + - "Podman logs: podman logs -f telco-kpis-cyclictest-{{ spoke_cluster }}" + - "==========================================" + +- name: Run Cyclictest test in podman container + 
ansible.builtin.shell: | + set -o pipefail + podman run --rm \ + --name telco-kpis-cyclictest-{{ spoke_cluster }} \ + -v {{ spoke_kubeconfig }}:/tmp/kubeconfig-ro:ro,Z \ + -v {{ artifacts_dir.path }}:/workspace/artifacts:rw,Z \ + telco-kpis-test-runner:latest \ + bash -c ' + set -e + cp /tmp/kubeconfig-ro /tmp/kubeconfig + export KUBECONFIG=/tmp/kubeconfig + export GIT_SSL_NO_VERIFY=true + export RAN_METRICS_URL=https://pushgateway.ran-metrics.telco5g.corp.redhat.com/metrics/job/pipeline1/ + + echo "Cloning ran-integration repository..." + git clone --depth 1 {{ telco_kpis_ran_integration_repo | default('https://gitlab.cee.redhat.com/ran/ran-integration.git') }} /workspace/ran-integration + cd /workspace/ran-integration + + echo "Cloning cyclictest repository..." + git clone --depth 1 {{ telco_kpis_cyclictest_repo | default('https://gitlab.cee.redhat.com/ran/cyclictest.git') }} cyclictest + + export WORKSPACE=/workspace/ran-integration + export SCRIPTS_DIR=/workspace/ran-integration/scripts + + echo "Creating artifacts symlink..." + ln -sfn /workspace/artifacts /workspace/ran-integration/artifacts + + echo "Starting Cyclictest (duration={{ duration | default(telco_kpis_default_duration_cyclictest) }})..." + set +e + bash scripts/test_cyclictest.sh cyclictest={{ duration | default(telco_kpis_default_duration_cyclictest) }} + exit_code=$? 
+ + echo "Cyclictest completed with exit code: $exit_code" + exit $exit_code + ' 2>&1 | tee {{ artifacts_dir.path }}/podman-run.log + register: cyclictest_result + failed_when: false + args: + executable: /bin/bash + +- name: Display test execution result + ansible.builtin.debug: + msg: + - "Test exit code: {{ cyclictest_result.rc }}" + - "{{ 'Test PASSED' if cyclictest_result.rc == 0 else 'Test FAILED' }}" + +- name: Manage test artifacts (collect, share, report) + ansible.builtin.include_role: + name: artifact-management + vars: + artifact_management_test_name: "cyclictest" + artifact_management_spoke_cluster: "{{ spoke_cluster }}" + artifact_management_temp_dir: "{{ artifacts_dir.path }}" + artifact_management_test_rc: "{{ cyclictest_result.rc }}" + artifact_management_test_duration: "{{ duration_seconds }}" + artifact_management_junit_source_pattern: "cyclictest_report.xml" + +- name: Cleanup artifacts directory on bastion + ansible.builtin.file: + path: "{{ artifacts_dir.path }}" + state: absent + +- name: Fail if test failed + ansible.builtin.fail: + msg: "Cyclictest failed with exit code {{ cyclictest_result.rc }}" + when: cyclictest_result.rc != 0 diff --git a/playbooks/telco-kpis/tasks/run-oslat-test.yml b/playbooks/telco-kpis/tasks/run-oslat-test.yml new file mode 100644 index 00000000..6bd910b2 --- /dev/null +++ b/playbooks/telco-kpis/tasks/run-oslat-test.yml @@ -0,0 +1,116 @@ +--- +# Run OSLAT latency test +# Test: OSLAT (OS-level latency test) +# Purpose: Measure OS latency on isolated CPUs with PerformanceProfile +# Duration: Configurable via duration parameter (default: 10m) + +- name: Ensure test-runner container image exists on bastion + ansible.builtin.import_tasks: setup-test-runner.yml + +- name: Create artifacts directory on bastion + ansible.builtin.tempfile: + state: directory + suffix: .oslat-{{ spoke_cluster }} + path: /tmp + register: artifacts_dir + +- name: Display test configuration + ansible.builtin.debug: + msg: + - 
"==========================================" + - "OSLAT Test Configuration" + - "==========================================" + - "Spoke Cluster: {{ spoke_cluster }}" + - "Kubeconfig: {{ spoke_kubeconfig }}" + - "Duration: {{ duration | default(telco_kpis_default_duration_oslat) }}" + - "Artifacts: {{ artifacts_dir.path }}" + - "==========================================" + +- name: Ensure kubeconfig is readable by podman container + ansible.builtin.file: + path: "{{ spoke_kubeconfig }}" + mode: '0644' + +- name: Display monitoring instructions for real-time log viewing + ansible.builtin.debug: + msg: + - "==========================================" + - "To monitor test execution in real-time:" + - "==========================================" + - "SSH to bastion: ssh {{ ansible_user }}@{{ ansible_host }}" + - "Tail the log: tail -f {{ artifacts_dir.path }}/podman-run.log" + - "" + - "Or from local:" + - "ssh {{ ansible_user }}@{{ ansible_host }} 'tail -f {{ artifacts_dir.path }}/podman-run.log'" + - "==========================================" + - "Container name: telco-kpis-oslat-{{ spoke_cluster }}" + - "Podman logs: podman logs -f telco-kpis-oslat-{{ spoke_cluster }}" + - "==========================================" + +- name: Run OSLAT test in podman container + ansible.builtin.shell: | + set -o pipefail + podman run --rm \ + --name telco-kpis-oslat-{{ spoke_cluster }} \ + -v {{ spoke_kubeconfig }}:/tmp/kubeconfig-ro:ro,Z \ + -v {{ artifacts_dir.path }}:/workspace/artifacts:rw,Z \ + telco-kpis-test-runner:latest \ + bash -c ' + set -e + cp /tmp/kubeconfig-ro /tmp/kubeconfig + export KUBECONFIG=/tmp/kubeconfig + export GIT_SSL_NO_VERIFY=true + export RAN_METRICS_URL=https://pushgateway.ran-metrics.telco5g.corp.redhat.com/metrics/job/pipeline1/ + + echo "Cloning ran-integration repository..." 
+      git clone --depth 1 {{ telco_kpis_ran_integration_repo | default('https://gitlab.cee.redhat.com/ran/ran-integration.git') }} /workspace/ran-integration
+      cd /workspace/ran-integration
+
+      echo "Cloning oslat repository..."
+      git clone --depth 1 {{ telco_kpis_oslat_repo | default('https://gitlab.cee.redhat.com/ran/oslat.git') }} oslat
+
+      export WORKSPACE=/workspace/ran-integration
+      export SCRIPTS_DIR=/workspace/ran-integration/scripts
+
+      echo "Creating artifacts symlink..."
+      ln -sfn /workspace/artifacts /workspace/ran-integration/artifacts
+
+      echo "Starting OSLAT test (duration={{ duration | default(telco_kpis_default_duration_oslat) }})..."
+      set +e
+      bash scripts/test_oslat.sh oslat={{ duration | default(telco_kpis_default_duration_oslat) }}
+      exit_code=$?
+
+      echo "OSLAT test completed with exit code: $exit_code"
+      exit $exit_code
+      ' 2>&1 | tee {{ artifacts_dir.path }}/podman-run.log
+  register: oslat_result
+  failed_when: false  # defer failure: artifacts are collected and cleaned up first, then the explicit fail task below fires
+  args:
+    executable: /bin/bash
+
+- name: Display test execution result
+  ansible.builtin.debug:
+    msg:
+      - "Test exit code: {{ oslat_result.rc }}"
+      - "{{ 'Test PASSED' if oslat_result.rc == 0 else 'Test FAILED' }}"
+
+# Hand artifacts to the shared artifact-management role (collection, sharing
+# and reporting all happen inside that role).
+- name: Manage test artifacts (collect, share, report)
+  ansible.builtin.include_role:
+    name: artifact-management
+  vars:
+    artifact_management_test_name: "oslat"
+    artifact_management_spoke_cluster: "{{ spoke_cluster }}"
+    artifact_management_temp_dir: "{{ artifacts_dir.path }}"
+    artifact_management_test_rc: "{{ oslat_result.rc }}"
+    # NOTE(review): 'duration_seconds' is not defined anywhere in this file
+    # (the test itself uses 'duration') — confirm the calling play sets it.
+    artifact_management_test_duration: "{{ duration_seconds }}"
+    artifact_management_junit_source_pattern: "oslat_report.xml"
+
+- name: Cleanup artifacts directory on bastion
+  ansible.builtin.file:
+    path: "{{ artifacts_dir.path }}"
+    state: absent
+
+# Surface the deferred test failure now that artifacts are safe.
+- name: Fail if test failed
+  ansible.builtin.fail:
+    msg: "OSLAT test failed with exit code {{ oslat_result.rc }}"
+  when: oslat_result.rc != 0
diff --git a/playbooks/telco-kpis/tasks/run-ptp-test.yml b/playbooks/telco-kpis/tasks/run-ptp-test.yml
new file mode 100644
index 00000000..d8393da9
--- /dev/null
+++ b/playbooks/telco-kpis/tasks/run-ptp-test.yml
@@ -0,0 +1,113 @@
+---
+# Run PTP (Precision Time Protocol) test
+# Test: PTP validation
+# Purpose: Validate PTP synchronization and offset metrics on spoke cluster
+# Duration: Configurable via duration parameter (default: 10m)
+
+- name: Ensure test-runner container image exists on bastion
+  ansible.builtin.import_tasks: setup-test-runner.yml
+
+# Per-run scratch dir; removed again by the cleanup task at the end of this file.
+- name: Create artifacts directory on bastion
+  ansible.builtin.tempfile:
+    state: directory
+    suffix: .ptp-{{ spoke_cluster }}
+    path: /tmp
+  register: artifacts_dir
+
+- name: Display test configuration
+  ansible.builtin.debug:
+    msg:
+      - "=========================================="
+      - "PTP Test Configuration"
+      - "=========================================="
+      - "Spoke Cluster: {{ spoke_cluster }}"
+      - "Kubeconfig: {{ spoke_kubeconfig }}"
+      - "Duration: {{ duration | default(telco_kpis_default_duration_ptp) }}"
+      - "Artifacts: {{ artifacts_dir.path }}"
+      - "=========================================="
+
+# NOTE(review): 0644 makes the kubeconfig world-readable on the bastion so the
+# container bind mount can read it — confirm that is acceptable for these
+# credentials.
+- name: Ensure kubeconfig is readable by podman container
+  ansible.builtin.file:
+    path: "{{ spoke_kubeconfig }}"
+    mode: '0644'
+
+- name: Display monitoring instructions for real-time log viewing
+  ansible.builtin.debug:
+    msg:
+      - "=========================================="
+      - "To monitor test execution in real-time:"
+      - "=========================================="
+      - "SSH to bastion: ssh {{ ansible_user }}@{{ ansible_host }}"
+      - "Tail the log: tail -f {{ artifacts_dir.path }}/podman-run.log"
+      - ""
+      - "Or from local:"
+      - "ssh {{ ansible_user }}@{{ ansible_host }} 'tail -f {{ artifacts_dir.path }}/podman-run.log'"
+      - "=========================================="
+      - "Container name: telco-kpis-ptp-{{ spoke_cluster }}"
+      - "Podman logs: podman logs -f telco-kpis-ptp-{{ spoke_cluster }}"
+      - "=========================================="
+
+# Runs the test inside a throwaway test-runner container; all container output
+# is teed to podman-run.log in the artifacts dir. Unlike cyclictest/oslat, no
+# second test repo is cloned here — test_ptp.sh lives in ran-integration.
+# NOTE(review): GIT_SSL_NO_VERIFY=true disables TLS verification for the
+# internal GitLab clone — confirm this is intentional and scoped to the lab.
+- name: Run PTP test in podman container
+  ansible.builtin.shell: |
+    set -o pipefail
+    podman run --rm \
+      --name telco-kpis-ptp-{{ spoke_cluster }} \
+      -v {{ spoke_kubeconfig }}:/tmp/kubeconfig-ro:ro,Z \
+      -v {{ artifacts_dir.path }}:/workspace/artifacts:rw,Z \
+      telco-kpis-test-runner:latest \
+      bash -c '
+      set -e
+      cp /tmp/kubeconfig-ro /tmp/kubeconfig
+      export KUBECONFIG=/tmp/kubeconfig
+      export GIT_SSL_NO_VERIFY=true
+      export RAN_METRICS_URL=https://pushgateway.ran-metrics.telco5g.corp.redhat.com/metrics/job/pipeline1/
+
+      echo "Cloning ran-integration repository..."
+      git clone --depth 1 {{ telco_kpis_ran_integration_repo | default('https://gitlab.cee.redhat.com/ran/ran-integration.git') }} /workspace/ran-integration
+      cd /workspace/ran-integration
+
+      export WORKSPACE=/workspace/ran-integration
+      export SCRIPTS_DIR=/workspace/ran-integration/scripts
+
+      echo "Creating artifacts symlink..."
+      ln -sfn /workspace/artifacts /workspace/ran-integration/artifacts
+
+      echo "Starting PTP test (duration={{ duration | default(telco_kpis_default_duration_ptp) }})..."
+      set +e
+      bash scripts/test_ptp.sh ptp={{ duration | default(telco_kpis_default_duration_ptp) }}
+      exit_code=$?
+ + echo "PTP test completed with exit code: $exit_code" + exit $exit_code + ' 2>&1 | tee {{ artifacts_dir.path }}/podman-run.log + register: ptp_result + failed_when: false + args: + executable: /bin/bash + +- name: Display test execution result + ansible.builtin.debug: + msg: + - "Test exit code: {{ ptp_result.rc }}" + - "{{ 'Test PASSED' if ptp_result.rc == 0 else 'Test FAILED' }}" + +- name: Manage test artifacts (collect, share, report) + ansible.builtin.include_role: + name: artifact-management + vars: + artifact_management_test_name: "ptp" + artifact_management_spoke_cluster: "{{ spoke_cluster }}" + artifact_management_temp_dir: "{{ artifacts_dir.path }}" + artifact_management_test_rc: "{{ ptp_result.rc }}" + artifact_management_test_duration: "{{ duration_seconds }}" + artifact_management_junit_source_pattern: "*ptp*.xml,*report*.xml" + +- name: Cleanup artifacts directory on bastion + ansible.builtin.file: + path: "{{ artifacts_dir.path }}" + state: absent + +- name: Fail if test failed + ansible.builtin.fail: + msg: "PTP test failed with exit code {{ ptp_result.rc }}" + when: ptp_result.rc != 0 diff --git a/playbooks/telco-kpis/tasks/run-reboot-test.yml b/playbooks/telco-kpis/tasks/run-reboot-test.yml new file mode 100644 index 00000000..32d4842d --- /dev/null +++ b/playbooks/telco-kpis/tasks/run-reboot-test.yml @@ -0,0 +1,77 @@ +--- +# Reboot test task for Telco-KPIs +# TODO: Implement actual reboot test logic +# For now: validates connection and cluster access + +- name: Display bastion hostname information + ansible.builtin.command: hostnamectl + register: hostnamectl_output + changed_when: false + +- name: Show hostnamectl output + ansible.builtin.debug: + var: hostnamectl_output.stdout_lines + +- name: Verify hub kubeconfig file exists + ansible.builtin.stat: + path: "{{ hub_kubeconfig }}" + register: hub_kubeconfig_stat + +- name: Assert hub kubeconfig exists + ansible.builtin.assert: + that: hub_kubeconfig_stat.stat.exists + fail_msg: "Hub 
kubeconfig file not found: {{ hub_kubeconfig }}" + success_msg: "Hub kubeconfig exists: {{ hub_kubeconfig }}" + +- name: Verify spoke kubeconfig file exists + ansible.builtin.stat: + path: "{{ spoke_kubeconfig }}" + register: spoke_kubeconfig_stat + +- name: Assert spoke kubeconfig exists + ansible.builtin.assert: + that: spoke_kubeconfig_stat.stat.exists + fail_msg: "Spoke kubeconfig file not found: {{ spoke_kubeconfig }}" + success_msg: "Spoke kubeconfig exists: {{ spoke_kubeconfig }}" + +- name: Get hub cluster nodes + ansible.builtin.command: > + oc --kubeconfig {{ hub_kubeconfig }} get nodes -o wide + register: hub_nodes_output + changed_when: false + +- name: Get hub cluster version + ansible.builtin.command: > + oc --kubeconfig {{ hub_kubeconfig }} get clusterversion -o wide + register: hub_version_output + changed_when: false + +- name: Display hub cluster information + ansible.builtin.debug: + msg: + - "=== HUB CLUSTER NODES ===" + - "{{ hub_nodes_output.stdout }}" + - "" + - "=== HUB CLUSTER VERSION ===" + - "{{ hub_version_output.stdout }}" + +- name: Get spoke cluster nodes + ansible.builtin.command: > + oc --kubeconfig {{ spoke_kubeconfig }} get nodes -o wide + register: spoke_nodes_output + changed_when: false + +- name: Get spoke cluster version + ansible.builtin.command: > + oc --kubeconfig {{ spoke_kubeconfig }} get clusterversion -o wide + register: spoke_version_output + changed_when: false + +- name: Display spoke cluster information + ansible.builtin.debug: + msg: + - "=== SPOKE CLUSTER ({{ spoke_cluster }}) NODES ===" + - "{{ spoke_nodes_output.stdout }}" + - "" + - "=== SPOKE CLUSTER VERSION ===" + - "{{ spoke_version_output.stdout }}" diff --git a/playbooks/telco-kpis/tasks/run-rfc2544-test.yml b/playbooks/telco-kpis/tasks/run-rfc2544-test.yml new file mode 100644 index 00000000..95cd3962 --- /dev/null +++ b/playbooks/telco-kpis/tasks/run-rfc2544-test.yml @@ -0,0 +1,77 @@ +--- +# RFC2544 test task for Telco-KPIs +# TODO: Implement actual 
rfc2544 test logic +# For now: validates connection and cluster access + +- name: Display bastion hostname information + ansible.builtin.command: hostnamectl + register: hostnamectl_output + changed_when: false + +- name: Show hostnamectl output + ansible.builtin.debug: + var: hostnamectl_output.stdout_lines + +- name: Verify hub kubeconfig file exists + ansible.builtin.stat: + path: "{{ hub_kubeconfig }}" + register: hub_kubeconfig_stat + +- name: Assert hub kubeconfig exists + ansible.builtin.assert: + that: hub_kubeconfig_stat.stat.exists + fail_msg: "Hub kubeconfig file not found: {{ hub_kubeconfig }}" + success_msg: "Hub kubeconfig exists: {{ hub_kubeconfig }}" + +- name: Verify spoke kubeconfig file exists + ansible.builtin.stat: + path: "{{ spoke_kubeconfig }}" + register: spoke_kubeconfig_stat + +- name: Assert spoke kubeconfig exists + ansible.builtin.assert: + that: spoke_kubeconfig_stat.stat.exists + fail_msg: "Spoke kubeconfig file not found: {{ spoke_kubeconfig }}" + success_msg: "Spoke kubeconfig exists: {{ spoke_kubeconfig }}" + +- name: Get hub cluster nodes + ansible.builtin.command: > + oc --kubeconfig {{ hub_kubeconfig }} get nodes -o wide + register: hub_nodes_output + changed_when: false + +- name: Get hub cluster version + ansible.builtin.command: > + oc --kubeconfig {{ hub_kubeconfig }} get clusterversion -o wide + register: hub_version_output + changed_when: false + +- name: Display hub cluster information + ansible.builtin.debug: + msg: + - "=== HUB CLUSTER NODES ===" + - "{{ hub_nodes_output.stdout }}" + - "" + - "=== HUB CLUSTER VERSION ===" + - "{{ hub_version_output.stdout }}" + +- name: Get spoke cluster nodes + ansible.builtin.command: > + oc --kubeconfig {{ spoke_kubeconfig }} get nodes -o wide + register: spoke_nodes_output + changed_when: false + +- name: Get spoke cluster version + ansible.builtin.command: > + oc --kubeconfig {{ spoke_kubeconfig }} get clusterversion -o wide + register: spoke_version_output + changed_when: false + 
+# Print the spoke node/version output captured by the preceding query tasks.
+- name: Display spoke cluster information
+  ansible.builtin.debug:
+    msg:
+      - "=== SPOKE CLUSTER ({{ spoke_cluster }}) NODES ==="
+      - "{{ spoke_nodes_output.stdout }}"
+      - ""
+      - "=== SPOKE CLUSTER VERSION ==="
+      - "{{ spoke_version_output.stdout }}"