diff --git a/.github/workflows/generate_matrix_page.yaml b/.github/workflows/generate_matrix_page.yaml
index 0b614398b..10b122508 100644
--- a/.github/workflows/generate_matrix_page.yaml
+++ b/.github/workflows/generate_matrix_page.yaml
@@ -32,8 +32,12 @@ jobs:
     steps:
       - name: Set dynamic env vars
         run: |
+          # GPU Operator dashboard paths
           echo "DASHBOARD_DATA_FILEPATH=${DASHBOARD_OUTPUT_DIR}/gpu_operator_matrix.json" >> "$GITHUB_ENV"
           echo "DASHBOARD_HTML_FILEPATH=${DASHBOARD_OUTPUT_DIR}/gpu_operator_matrix.html" >> "$GITHUB_ENV"
+          # Network Operator dashboard paths
+          echo "NNO_DASHBOARD_DATA_FILEPATH=${DASHBOARD_OUTPUT_DIR}/network_operator_matrix.json" >> "$GITHUB_ENV"
+          echo "NNO_DASHBOARD_HTML_FILEPATH=${DASHBOARD_OUTPUT_DIR}/network_operator_matrix.html" >> "$GITHUB_ENV"
          echo "GH_PAGES_BRANCH=${{ github.event.inputs.gh_pages_branch || 'gh-pages' }}" >> "$GITHUB_ENV"
        env:
          DASHBOARD_OUTPUT_DIR: ${{ env.DASHBOARD_OUTPUT_DIR }}
@@ -67,27 +71,62 @@ jobs:
       - name: Install Dependencies
         run: |
           pip install -r workflows/gpu_operator_dashboard/requirements.txt
+          pip install -r workflows/nno_dashboard/requirements.txt
       - name: Fetch CI Data
         run: |
           echo "Processing PR: ${{ steps.determine_pr.outputs.PR_NUMBER }}"
+          # GPU Operator
           python -m workflows.gpu_operator_dashboard.fetch_ci_data \
             --pr_number "${{ steps.determine_pr.outputs.PR_NUMBER }}" \
             --baseline_data_filepath "${{ env.DASHBOARD_DATA_FILEPATH }}" \
             --merged_data_filepath "${{ env.DASHBOARD_DATA_FILEPATH }}"
+          # Network Operator
+          python -m workflows.nno_dashboard.fetch_ci_data \
+            --pr_number "${{ steps.determine_pr.outputs.PR_NUMBER }}" \
+            --baseline_data_filepath "${{ env.NNO_DASHBOARD_DATA_FILEPATH }}" \
+            --merged_data_filepath "${{ env.NNO_DASHBOARD_DATA_FILEPATH }}"
       - name: Generate HTML Dashboard (only if JSON changed)
         run: |
           cd "${{ env.DASHBOARD_OUTPUT_DIR }}"
+
+          # Check if GPU Operator JSON changed
+          GPU_CHANGED=false
           if [[ ${{ github.event_name }} == "pull_request_target" ]] && git diff --exit-code gpu_operator_matrix.json; then
-            echo "no changes"
+            echo "GPU Operator: no changes"
+          else
+            echo "GPU Operator: changes detected"
+            GPU_CHANGED=true
+          fi
+
+          # Check if Network Operator JSON changed
+          NNO_CHANGED=false
+          if [[ ${{ github.event_name }} == "pull_request_target" ]] && git diff --exit-code network_operator_matrix.json; then
+            echo "Network Operator: no changes"
           else
-            cd "${{ github.workspace }}"
+            echo "Network Operator: changes detected"
+            NNO_CHANGED=true
+          fi
+
+          cd "${{ github.workspace }}"
+
+          # Generate GPU Operator dashboard if changed
+          if [ "$GPU_CHANGED" = true ]; then
+            echo "Generating GPU Operator dashboard..."
             python -m workflows.gpu_operator_dashboard.generate_ci_dashboard \
               --dashboard_data_filepath "${{ env.DASHBOARD_DATA_FILEPATH }}" \
               --dashboard_html_filepath "${{ env.DASHBOARD_HTML_FILEPATH }}"
           fi
+
+          # Generate Network Operator dashboard if changed
+          if [ "$NNO_CHANGED" = true ]; then
+            echo "Generating Network Operator dashboard..."
+            python -m workflows.nno_dashboard.generate_ci_dashboard \
+              --dashboard_data_filepath "${{ env.NNO_DASHBOARD_DATA_FILEPATH }}" \
+              --dashboard_html_filepath "${{ env.NNO_DASHBOARD_HTML_FILEPATH }}"
+          fi
       - name: Deploy HTML to GitHub Pages
         uses: JamesIves/github-pages-deploy-action@v4
diff --git a/.gitignore b/.gitignore
index 06cf92b78..ac23b99e1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,5 @@
 __pycache__/
 venv/
 *.pyc
+.DS_Store
+.vscode/
\ No newline at end of file
diff --git a/workflows/README.md b/workflows/README.md
index 768d3978e..e84979d73 100644
--- a/workflows/README.md
+++ b/workflows/README.md
@@ -6,8 +6,9 @@ This directory contains multiple workflows for automating various aspects of the
 
 - [gpu_operator_versions/](./gpu_operator_versions/) — Automation for updating versions and triggering CI jobs
 - [gpu_operator_dashboard/](./gpu_operator_dashboard/) — CI dashboard generation for NVIDIA GPU Operator test results
+- [nno_dashboard/](./nno_dashboard/) — CI dashboard generation for NVIDIA Network Operator test results
 - [microshift_dashboard/](./microshift_dashboard/) — MicroShift NVIDIA Device Plugin testing dashboard
-- Shared modules: [utils.py](./utils.py), [templates.py](./templates.py)
+- [common/](./common/) — Shared utilities: logging, templates, GCS access, HTML builders, data structures
 
 See the individual README files in each subdirectory for detailed information.
diff --git a/workflows/common/__init__.py b/workflows/common/__init__.py
index e69de29bb..09661a8dd 100644
--- a/workflows/common/__init__.py
+++ b/workflows/common/__init__.py
@@ -0,0 +1,94 @@
+"""
+Common utilities shared across NVIDIA CI workflows.
+"""
+
+from workflows.common.utils import get_logger, logger
+from workflows.common.templates import load_template
+from workflows.common.data_structures import (
+    TestResult,
+    OCP_FULL_VERSION,
+    OPERATOR_VERSION,
+    GPU_OPERATOR_VERSION,
+    STATUS_SUCCESS,
+    STATUS_FAILURE,
+    STATUS_ABORTED,
+)
+from workflows.common.gcs_utils import (
+    http_get_json,
+    fetch_gcs_file_content,
+    build_prow_job_url,
+    fetch_filtered_files,
+    build_job_history_url,
+    GCS_API_BASE_URL,
+    GCS_MAX_RESULTS_PER_REQUEST,
+)
+from workflows.common.html_builders import (
+    build_toc,
+    build_notes,
+    build_history_bar,
+    build_last_updated_footer,
+    sanitize_id,
+)
+from workflows.common.validation import (
+    is_valid_ocp_version,
+    has_valid_semantic_versions,
+    is_infrastructure_type,
+)
+from workflows.common.data_fetching import (
+    build_version_lookups,
+    build_finished_lookup,
+    extract_test_status,
+    extract_timestamp,
+    determine_repo_from_job_name,
+    convert_sets_to_lists_recursive,
+    merge_job_history_links,
+    int_or_none,
+)
+
+__all__ = [
+    # Utils
+    "get_logger",
+    "logger",
+    "load_template",
+
+    # Data structures
+    "TestResult",
+    "OCP_FULL_VERSION",
+    "OPERATOR_VERSION",
+    "GPU_OPERATOR_VERSION",
+    "STATUS_SUCCESS",
+    "STATUS_FAILURE",
+    "STATUS_ABORTED",
+
+    # GCS utilities
+    "http_get_json",
+    "fetch_gcs_file_content",
+    "build_prow_job_url",
+    "fetch_filtered_files",
+    "build_job_history_url",
+    "GCS_API_BASE_URL",
+    "GCS_MAX_RESULTS_PER_REQUEST",
+
+    # HTML builders
+    "build_toc",
+    "build_notes",
+    "build_history_bar",
+    "build_last_updated_footer",
+    "sanitize_id",
+
+    # Validation
+    "is_valid_ocp_version",
+    "has_valid_semantic_versions",
+    "is_infrastructure_type",
+
+    # Data fetching
+    "build_version_lookups",
+    "build_finished_lookup",
+    "extract_test_status",
+    "extract_timestamp",
+    "determine_repo_from_job_name",
+    "convert_sets_to_lists_recursive",
+    "merge_job_history_links",
+    "int_or_none",
+]
+
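A minimal, hedged sketch of consuming the new package facade (not part of the PR; it assumes the repository root is on PYTHONPATH, as in the workflow steps above):

    # Illustrative only: exercise two helpers re-exported by workflows.common.
    from workflows.common import convert_sets_to_lists_recursive, int_or_none

    assert int_or_none("unlimited") is None and int_or_none("5") == 5
    assert convert_sets_to_lists_recursive({"links": {"b", "a"}}) == {"links": ["a", "b"]}
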
"convert_sets_to_lists_recursive", + "merge_job_history_links", + "int_or_none", +] + diff --git a/workflows/common/data_fetching.py b/workflows/common/data_fetching.py new file mode 100644 index 000000000..e69484815 --- /dev/null +++ b/workflows/common/data_fetching.py @@ -0,0 +1,177 @@ +""" +Common data fetching patterns for CI dashboards. +Shared logic for building file lookups and processing builds. +""" + +import json +from typing import Dict, Any, List, Tuple, Optional +from workflows.common.utils import logger +from workflows.common.gcs_utils import fetch_gcs_file_content + + +def build_version_lookups( + version_files_list: List[Tuple[str, List[Dict[str, Any]]]] +) -> Dict[str, Dict[str, str]]: + """ + Build lookup dictionaries for version files organized by build directory. + + Args: + version_files_list: List of tuples (file_type, file_items) + e.g., [("ocp", ocp_files), ("operator", operator_files)] + + Returns: + Dict mapping file_type to {build_dir: content} + e.g., {"ocp": {build_dir: "4.17.16"}, "operator": {build_dir: "25.4.0"}} + """ + version_lookups = {} + + for file_type, file_items in version_files_list: + lookup = {} + for file_item in file_items: + path = file_item["name"] + build_dir = path.rsplit("/", 1)[0] + try: + content = fetch_gcs_file_content(path) + lookup[build_dir] = content.strip() + except Exception as e: + logger.warning(f"Failed to fetch {file_type} from {path}: {e}") + version_lookups[file_type] = lookup + + return version_lookups + + +def build_finished_lookup( + finished_files: List[Dict[str, Any]] +) -> Dict[str, Dict[str, Any]]: + """ + Build lookup dictionary for finished.json files by build directory. + + Args: + finished_files: List of finished.json file items from GCS + + Returns: + Dict mapping build_dir to parsed finished.json content + """ + finished_lookup = {} + + for finished_item in finished_files: + finished_path = finished_item["name"] + build_dir = finished_path.rsplit("/", 1)[0] + try: + content = fetch_gcs_file_content(finished_path) + finished_lookup[build_dir] = json.loads(content) + except Exception as e: + logger.warning(f"Failed to fetch/parse finished.json from {finished_path}: {e}") + + return finished_lookup + + +def extract_test_status( + finished_json: Dict[str, Any], + status_success: str, + status_failure: str, + status_aborted: str +) -> str: + """ + Extract and normalize test status from finished.json. + + Args: + finished_json: Parsed finished.json content + status_success: String constant for success status + status_failure: String constant for failure status + status_aborted: String constant for aborted status + + Returns: + Normalized test status string + """ + result_str = finished_json.get("result", "UNKNOWN").upper() + if result_str in [status_success, status_failure, status_aborted]: + return result_str + return status_failure + + +def extract_timestamp(finished_json: Dict[str, Any]) -> int: + """ + Extract timestamp from finished.json. + + Args: + finished_json: Parsed finished.json content + + Returns: + Unix timestamp (defaults to 0 if not found) + """ + return finished_json.get("timestamp", 0) + + +def determine_repo_from_job_name(job_name: str) -> str: + """ + Determine repository from job name pattern. 
diff --git a/workflows/common/data_structures.py b/workflows/common/data_structures.py
new file mode 100644
index 000000000..6348d8b9e
--- /dev/null
+++ b/workflows/common/data_structures.py
@@ -0,0 +1,59 @@
+"""
+Shared data structures and constants for CI dashboards.
+"""
+
+from dataclasses import dataclass
+from typing import Any, Dict, Optional
+
+# Constants for version field names (shared across dashboards)
+OCP_FULL_VERSION = "ocp_full_version"
+OPERATOR_VERSION = "operator_version"  # Generic operator version field
+
+# GPU Operator specific (for backward compatibility)
+GPU_OPERATOR_VERSION = "gpu_operator_version"
+
+# Constants for job statuses
+STATUS_SUCCESS = "SUCCESS"
+STATUS_FAILURE = "FAILURE"
+STATUS_ABORTED = "ABORTED"
+
+
+@dataclass(frozen=True)
+class TestResult:
+    """Represents a single test run result (shared data structure)."""
+    ocp_full_version: str
+    operator_version: str  # Can be a GPU or Network Operator version
+    test_status: str
+    prow_job_url: str
+    job_timestamp: str
+    test_flavor: Optional[str] = None  # Optional: for dashboards with test flavors (NNO)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert TestResult to dictionary format for JSON serialization."""
+        result = {
+            OCP_FULL_VERSION: self.ocp_full_version,
+            OPERATOR_VERSION: self.operator_version,
+            "test_status": self.test_status,
+            "prow_job_url": self.prow_job_url,
+            "job_timestamp": self.job_timestamp,
+        }
+        # Include test_flavor only if it's set
+        if self.test_flavor is not None:
+            result["test_flavor"] = self.test_flavor
+        return result
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "TestResult":
+        """Create a TestResult from a dictionary."""
+        # Handle backward compatibility with GPU Operator data
+        operator_version = data.get(OPERATOR_VERSION) or data.get(GPU_OPERATOR_VERSION)
+
+        return cls(
+            ocp_full_version=data[OCP_FULL_VERSION],
+            operator_version=operator_version,
+            test_status=data["test_status"],
+            prow_job_url=data["prow_job_url"],
+            job_timestamp=data["job_timestamp"],
+            test_flavor=data.get("test_flavor"),
+        )
+
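A hedged round-trip sketch for TestResult (field values are placeholders shaped like the docstring examples above):

    from workflows.common.data_structures import TestResult

    result = TestResult(
        ocp_full_version="4.17.16",        # placeholder versions
        operator_version="25.4.0",
        test_status="SUCCESS",
        prow_job_url="https://example.invalid/prow/job",  # placeholder URL
        job_timestamp="1735689600",
    )
    payload = result.to_dict()             # "test_flavor" is omitted while None
    assert TestResult.from_dict(payload) == result  # frozen dataclass gives value equality
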
+""" + +from dataclasses import dataclass +from typing import Any, Dict, Optional + +# Constants for version field names (shared across dashboards) +OCP_FULL_VERSION = "ocp_full_version" +OPERATOR_VERSION = "operator_version" # Generic operator version field + +# GPU Operator specific (for backward compatibility) +GPU_OPERATOR_VERSION = "gpu_operator_version" + +# Constants for job statuses +STATUS_SUCCESS = "SUCCESS" +STATUS_FAILURE = "FAILURE" +STATUS_ABORTED = "ABORTED" + + +@dataclass(frozen=True) +class TestResult: + """Represents a single test run result (shared data structure).""" + ocp_full_version: str + operator_version: str # Can be GPU or Network operator version + test_status: str + prow_job_url: str + job_timestamp: str + test_flavor: Optional[str] = None # Optional: for dashboards with test flavors (NNO) + + def to_dict(self) -> Dict[str, Any]: + """Convert TestResult to dictionary format for JSON serialization.""" + result = { + OCP_FULL_VERSION: self.ocp_full_version, + "operator_version": self.operator_version, + "test_status": self.test_status, + "prow_job_url": self.prow_job_url, + "job_timestamp": self.job_timestamp, + } + # Include test_flavor only if it's set + if self.test_flavor is not None: + result["test_flavor"] = self.test_flavor + return result + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "TestResult": + """Create TestResult from dictionary.""" + # Handle backward compatibility with GPU operator data + operator_version = data.get("operator_version") or data.get(GPU_OPERATOR_VERSION) + + return cls( + ocp_full_version=data[OCP_FULL_VERSION], + operator_version=operator_version, + test_status=data["test_status"], + prow_job_url=data["prow_job_url"], + job_timestamp=data["job_timestamp"], + test_flavor=data.get("test_flavor"), + ) + diff --git a/workflows/common/gcs_utils.py b/workflows/common/gcs_utils.py new file mode 100644 index 000000000..5e28d9494 --- /dev/null +++ b/workflows/common/gcs_utils.py @@ -0,0 +1,130 @@ +""" +GCS (Google Cloud Storage) utilities for fetching CI test artifacts. +Shared across GPU Operator and Network Operator dashboards. +""" + +import re +import urllib.parse +from typing import Dict, Any + +import requests + +from workflows.common.utils import logger + +# GCS API base URL for test-platform-results bucket +GCS_API_BASE_URL = "https://storage.googleapis.com/storage/v1/b/test-platform-results/o" + +# Maximum number of results per GCS API request for pagination +GCS_MAX_RESULTS_PER_REQUEST = 1000 + + +def http_get_json(url: str, params: Dict[str, Any] = None, headers: Dict[str, str] = None) -> Dict[str, Any]: + """ + Send an HTTP GET request and return the JSON response. + + Args: + url: URL to fetch + params: Optional query parameters + headers: Optional HTTP headers + + Returns: + Parsed JSON response + + Raises: + requests.HTTPError: If the request fails + """ + response = requests.get(url, params=params, headers=headers, timeout=30) + response.raise_for_status() + return response.json() + + +def fetch_gcs_file_content(file_path: str) -> str: + """ + Fetch the raw text content from a file in GCS. 
diff --git a/workflows/common/html_builders.py b/workflows/common/html_builders.py
new file mode 100644
index 000000000..7c31b3605
--- /dev/null
+++ b/workflows/common/html_builders.py
@@ -0,0 +1,162 @@
+"""
+Shared HTML building utilities for CI dashboards.
+"""
+
+import html
+from typing import List, Dict, Any
+from datetime import datetime, timezone
+
+
+def build_toc(ocp_keys: List[str]) -> str:
+    """
+    Build a Table of Contents (TOC) for OpenShift versions.
+
+    Args:
+        ocp_keys: List of OCP version strings to include in the TOC
+
+    Returns:
+        HTML string containing the TOC
+    """
+    toc_links = ", ".join(
+        f'<a href="#{sanitize_id(ocp_version)}">{ocp_version}</a>' for ocp_version in ocp_keys
+    )
+    return f"""