# Copyright (C) 2026 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

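"""Pre-analysis step for the CI Doctor agent.

Downloads the logs of a failed GitHub Actions workflow run, strips known
noise, extracts error "hints" from the failed jobs' logs, and writes a
summary file the agent can start its investigation from.

Requires the GITHUB_TOKEN environment variable; LOGLEVEL optionally
controls verbosity. Example invocation (the script's file name is an
assumption here):

    GITHUB_TOKEN=... python pre_analysis.py -r OWNER/REPO --run-id 1234567890
"""
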
import argparse
import logging
import os
import re
import tempfile
from pathlib import Path
from zipfile import ZipFile

import requests
from github import Auth, Github
from github.WorkflowRun import WorkflowRun
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


def init_logger():
    LOGLEVEL = os.environ.get("LOGLEVEL", "INFO").upper()
    logging.basicConfig(
        level=LOGLEVEL, format="%(asctime)s %(name)-12s %(levelname)-8s %(message)s", datefmt="%m-%d-%Y %H:%M:%S"
    )


init_logger()

LOGGER = logging.getLogger("ci-doctor-preanalysis")

CI_DOCTOR_DIR = Path("/tmp/ci-doctor/")
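# Everything this script produces ends up under /tmp/ci-doctor/run_<run_id>/.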


def get_arguments() -> argparse.Namespace:
    def repository_name(value: str) -> str:
        if not re.match(r"^[A-Za-z0-9._-]+/[A-Za-z0-9._-]+$", value):
            raise argparse.ArgumentTypeError(f"Invalid format (expected 'owner/name'): {value}")
        return value

    def run_id(value: str) -> int:
        if not re.match(r"^[0-9]+$", value):
            raise argparse.ArgumentTypeError(f"Run ID must be a positive integer: {value}")
        return int(value)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-r",
        "--repository-name",
        type=repository_name,
        required=True,
        help="Repository name in the OWNER/REPOSITORY format",
    )
    parser.add_argument("--run-id", type=run_id, required=True, help="Workflow Run ID")
    return parser.parse_args()


def _safe_extract(archive_path: Path, dest_dir: Path) -> None:
    """Extract a zip archive, rejecting entries that would escape dest_dir (zip-slip protection)."""
    dest_dir = dest_dir.resolve()
    with ZipFile(file=archive_path, mode="r") as zip_file:
        for member in zip_file.namelist():
            member_path = (dest_dir / member).resolve()
            if not str(member_path).startswith(str(dest_dir) + os.sep):
                raise ValueError(f"Zip entry escapes target directory: {member}")
        # Every member name was validated above, so a bulk extract is safe here
        zip_file.extractall(dest_dir)


def collect_logs_for_run(run: WorkflowRun, logs_dir: Path, github_token: str, session: requests.Session) -> None:
    """
    Download the log archive of a given workflow run and keep the failed
    jobs' logs in logs_dir.

    We don't need the successful jobs' logs, so we discard them. We could
    have downloaded logs for the failed jobs only, but when you download
    all logs of a workflow run, GitHub also includes a "system.txt" file
    for each job, which can contain errors we might want to trigger a
    rerun on.

    Example log archive structure:
    .
    ├── 10_Pytorch Layer Tests _ PyTorch Layer Tests.txt
    ├── 11_CPU functional tests _ CPU functional tests.txt
    ├── 12_C++ unit tests _ C++ unit tests.txt
    ├── 13_OpenVINO tokenizers extension _ OpenVINO tokenizers extension.txt
    ├── C++ unit tests _ C++ unit tests
    │   └── system.txt
    ├── CPU functional tests _ CPU functional tests
    │   └── system.txt
    ├── OpenVINO tokenizers extension _ OpenVINO tokenizers extension
    │   └── system.txt
    └── Pytorch Layer Tests _ PyTorch Layer Tests
        └── system.txt

    Sometimes, though, a job directory also contains a log file for each
    individual step, in addition to the full job log in the archive root:
    .
    ├── 1_Build.txt
    └── Build
        ├── 13_Upload build logs.txt
        ├── 1_Set up job.txt
        ├── 24_Post Clone vcpkg.txt
        ├── 25_Post Clone OpenVINO.txt
        ├── 26_Stop containers.txt
        ├── 27_Complete job.txt
        ├── 2_Initialize containers.txt
        ├── 3_Clone OpenVINO.txt
        ├── 4_Get VCPKG version and put it into GitHub ENV.txt
        ├── 5_Init submodules for non vcpkg dependencies.txt
        ├── 6_Clone vcpkg.txt
        ├── 7_System info.txt
        ├── 8_Build vcpkg.txt
        ├── 9_CMake - configure.txt
        └── system.txt

    In that case, we only need the "system.txt" file from each directory.
    """
    # Only jobs that failed or were cancelled are worth inspecting
    failed_jobs = [job for job in run.jobs() if job.conclusion in ("failure", "cancelled")]
    LOGGER.info(f"FAILED JOBS: {[job.name for job in failed_jobs]}")

    with tempfile.NamedTemporaryFile(suffix=".zip") as temp_file:
        log_archive_path = Path(temp_file.name)

        # Download the logs archive
        with open(file=log_archive_path, mode="wb") as log_archive:
            LOGGER.info(f"DOWNLOADING LOGS FOR RUN ID {run.id}")
            # PyGitHub does not expose the "/repos/{owner}/{repo}/actions/runs/{run_id}/logs"
            # endpoint, so we have to use requests directly
            LOGGER.debug(f"Downloading logs from {run.logs_url}")
            response = session.get(url=run.logs_url, headers={"Authorization": f"Bearer {github_token}"})
            response.raise_for_status()
            log_archive.write(response.content)

        # Unpack it
        with tempfile.TemporaryDirectory() as temp_dir:
            logs_temp_dir = Path(temp_dir).resolve()
            _safe_extract(log_archive_path, logs_temp_dir)

            # Traverse the unpacked logs to find the ones of failed jobs
            for job in failed_jobs:
                # Job names may contain "/", which the log archive replaces with "_" in file names
                job_filename = job.name.replace("/", "_")
                LOGGER.debug(f"Looking for failed job logs with filename: {job_filename}")

                for p in logs_temp_dir.iterdir():
                    # Move failed jobs' logs to the final destination
                    if p.is_dir() and p.name == job_filename:
                        system_log_path = p / "system.txt"
                        if system_log_path.is_file():
                            LOGGER.debug(f"Keeping system.txt from directory {p} for failed job {job.name}")
                            system_log_path.rename(logs_dir / f"{job_filename}__system.txt")
                    elif p.is_file() and p.name.endswith(f"{job_filename}.txt"):
                        LOGGER.debug(f"Keeping file {p} for failed job {job.name}")
                        p.rename(logs_dir / p.name)

    LOGGER.info(f"COLLECTED LOGS FOR {run.id} IN {logs_dir}")


# Lines that match ERROR_PATTERN but are known false positives.
NOISE_PATTERN = re.compile(
    r"(-o pipefail|xfail|XFAIL|Defaulting to unsafe serialization|SCCACHE_IGNORE_SERVER_IO_ERROR)",
)
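# extract_hints() drops any line containing one of these substrings, even when
# the line also matches ERROR_PATTERN.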

# Case-insensitive pattern matching common CI error indicators.
ERROR_PATTERN = re.compile(
    r"("
    r"\berror[\s:\[)]"
    r"|\bfail(?:ed|ure|ing|s)?\b"
    r"|panic:"
    r"|\bfatal[\s:]"
    r"|\bundefined[\s:]"
    r"|\bexception\b"
    r"|exit status [^0]"
    r")",
    re.IGNORECASE,
)
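# Sample lines this matches: "error: undefined reference", "Build FAILED.",
# "exit status 1" (but not "exit status 0").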

MAX_HINT_LINES = 30
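# The cap is applied per log file in extract_hints() so that one noisy log
# cannot flood the hints.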


def extract_hints(logs_dir: Path, hints_dir: Path) -> None:
    """Extract lines matching ERROR_PATTERN from each log file and write them to a per-file hints file."""

    for log_file in logs_dir.iterdir():
        if not log_file.is_file() or not log_file.name.endswith(".txt"):
            continue
        hints: list[str] = []
        with log_file.open() as f:
            for lineno, line in enumerate(f, start=1):
                if NOISE_PATTERN.search(line) or not ERROR_PATTERN.search(line):
                    continue
                hints.append(f"{lineno}:{line.strip()}")
                if len(hints) >= MAX_HINT_LINES:
                    break

        hints_file_path = hints_dir / f"{log_file.name}-hints.txt"
        if hints:
            hints_file_path.write_text("\n".join(hints))


def count_lines(file_path: Path) -> int:
    with file_path.open() as f:
        return sum(1 for _ in f)


def write_summary(run: WorkflowRun, logs_dir: Path, hints_dir: Path) -> None:
    """Write a consolidated summary file for the CI Doctor agent."""
    lines: list[str] = [
        "=== Failed Jobs Summary ===",
        f"Run ID: {run.id}",
        "",
    ]

    failed_jobs = [job for job in run.jobs() if job.conclusion in ("failure", "cancelled")]
    for job in failed_jobs:
        failed_steps = ", ".join([step.name for step in job.steps if step.conclusion in ("failure", "cancelled")])
        lines.append(f"  Job {job.id} {job.name} {job.url}:")
        lines.append(f"    Failed steps: {failed_steps if failed_steps else '(none)'}")

    lines.append("")
    lines.append(f"Downloaded log files ({logs_dir}):")
    for log_file in sorted(logs_dir.glob("*.txt")):
        lines.append(f"  {log_file}")

    lines.append("")
    lines.append(f"Hint files ({hints_dir}):")
    for hints_file in sorted(hints_dir.glob("*-hints.txt")):
        if not hints_file.stat().st_size:
            continue
        hint_count = count_lines(hints_file)
        lines.append(f"  {hints_file} ({hint_count} matches)")
        # Show the first 3 hint lines as a preview.
        try:
            with hints_file.open() as f:
                for i, line in enumerate(f):
                    if i >= 3:
                        break
                    lines.append(f"    {line.rstrip()}")
        except OSError:
            pass

    summary_text = "\n".join(lines) + "\n"

    SUMMARY_FILE = logs_dir.parent / "summary.txt"
    SUMMARY_FILE.write_text(summary_text)
    print(summary_text)
    print(f"Pre-analysis complete. Agent should start with {SUMMARY_FILE}")


PATTERNS_TO_FILTER_OUT = [
    # 2026-03-13T13:42:55.9786288Z Received 35870 data chunks (chunk size: 16384 bytes), time passed: 30784ms
    re.compile(r"Received \d+ data chunks \(chunk size: \d+ bytes\), time passed: \d+ms"),
]
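# Such download-progress lines can appear many times in a single log and add
# no diagnostic value, so filter_logs() strips them before hint extraction.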


def filter_logs(job_logs_dir: Path) -> None:
    """Remove lines matching PATTERNS_TO_FILTER_OUT from the log files in job_logs_dir."""
    for log_file in job_logs_dir.glob("*.txt"):
        filtered_lines: list[str] = []
        with log_file.open() as f:
            for line in f:
                if any(pattern.search(line) for pattern in PATTERNS_TO_FILTER_OUT):
                    continue
                filtered_lines.append(line.rstrip())

        log_file.write_text("\n".join(filtered_lines) + "\n")


def main():
    args = get_arguments()
    run_id = args.run_id
    repository_name = args.repository_name

    GITHUB_TOKEN = os.environ["GITHUB_TOKEN"]

    session = requests.Session()
    retry_strategy = Retry(total=5, backoff_factor=3, backoff_jitter=1, status_forcelist=[429, 500, 502, 503, 504])
    session.mount("https://api.github.com", HTTPAdapter(max_retries=retry_strategy))
    session.mount("https://results-receiver.actions.githubusercontent.com", HTTPAdapter(max_retries=retry_strategy))
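    # The second mount covers the host that the log-archive download from
    # run.logs_url apparently redirects to, so both origins share the retry policy.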

    github = Github(auth=Auth.Token(token=GITHUB_TOKEN))
    gh_repo = github.get_repo(full_name_or_id=repository_name)
    run = gh_repo.get_workflow_run(id_=run_id)

    if run.conclusion not in ("failure", "cancelled"):
        LOGGER.warning(
            f"Run {run_id} in {repository_name} has conclusion '{run.conclusion}'. "
            "Expected conclusion is 'failure' or 'cancelled'. No logs will be collected."
        )
        return

    RUN_DIR = CI_DOCTOR_DIR / f"run_{run_id}"

    # Refuse to overwrite leftovers from a previous invocation
    if RUN_DIR.exists() and any(RUN_DIR.iterdir()):
        raise RuntimeError(f"Run directory {RUN_DIR} is not empty. Clean it up before running the script.")

    LOGS_DIR = RUN_DIR / "logs"
    LOGS_DIR.mkdir(parents=True, exist_ok=True)

    collect_logs_for_run(run=run, logs_dir=LOGS_DIR, github_token=GITHUB_TOKEN, session=session)
    filter_logs(job_logs_dir=LOGS_DIR)

    HINTS_DIR = RUN_DIR / "hints"
    HINTS_DIR.mkdir(exist_ok=True, parents=True)

    extract_hints(logs_dir=LOGS_DIR, hints_dir=HINTS_DIR)

    write_summary(run=run, logs_dir=LOGS_DIR, hints_dir=HINTS_DIR)


if __name__ == "__main__":
    main()