From 57b6f6617d56eaede0b460b862f07bafcbcb3e7f Mon Sep 17 00:00:00 2001
From: Tianlei Wu
Date: Sat, 7 Feb 2026 10:52:43 -0800
Subject: [PATCH 1/5] Compile Contributors Script

---
 tools/python/compile_contributors.py | 457 +++++++++++++++++++++++++++
 1 file changed, 457 insertions(+)
 create mode 100644 tools/python/compile_contributors.py

diff --git a/tools/python/compile_contributors.py b/tools/python/compile_contributors.py
new file mode 100644
index 0000000000000..f2e788ac2512b
--- /dev/null
+++ b/tools/python/compile_contributors.py
@@ -0,0 +1,457 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+"""
+Compile Contributors Script
+---------------------------
+Description:
+    This script compiles contributor information by comparing two git branches/commits.
+    It identifies Pull Requests, handles cherry-picked commits (including "Cherry-pick round" meta-PRs),
+    and consolidates author identities (full names vs. GitHub usernames).
+
+Usage:
+    python compile_contributors.py [--base <base>] [--target <target>] [--dir <dir>]
+
+Example:
+    python compile_contributors.py --base origin/rel-1.23.2 --target origin/rel-1.24.1 --dir rel-1.24.1_report
+
+Outputs:
+    - detail.csv: Detailed breakdown of PRs, authors, and commit links.
+    - logs.txt: Processing logs and summary (a humans-only contributor list suitable for release notes).
+
+Requirements:
+    - GitHub CLI (gh) logged in.
+"""
+
+import argparse
+import csv
+import datetime
+import json
+import os
+import re
+import subprocess
+
+
+def log_event(message, log_file=None):
+    """Log a message to the console and an optional log file."""
+    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    full_message = f"[{timestamp}] {message}"
+    print(message)  # Clean print for console UI
+    if log_file:
+        log_file.write(full_message + "\n")
+
+
+# Constants
+MAX_CHERRY_PICK_SCAN = 50
+PR_CACHE = {}  # Cache of PR details so PRs referenced multiple times are fetched only once
+NAME_TO_LOGIN = {}  # Map full names to GitHub logins for consolidation
+
+# Bots to exclude from contributor lists
+BOT_NAMES = {
+    "Copilot",
+    "dependabot[bot]",
+    "app/dependabot",
+    "github-actions[bot]",
+    "app/copilot-swe-agent",
+    "CI Bot",
+    "github-advanced-security[bot]",
+    "GitHub Actions",
+    "dependabot",
+    "github-actions",
+    "Gemini",
+    "CI",
+}
+
+
+def is_bot(name):
+    if not name:
+        return True
+    name_clean = name.strip().lstrip("@")
+    # Known bots and patterns
+    if name_clean in BOT_NAMES:
+        return True
+    if "[bot]" in name_clean.lower():
+        return True
+    if name_clean.lower().startswith("app/"):
+        return True
+    return False
+
+
+def is_invalid(name):
+    if not name:
+        return True
+    # If it's a bot, it's considered a valid identity for the CSV
+    if is_bot(name):
+        return False
+
+    name_clean = name.strip().lstrip("@")
+    # Paths, brackets, and code extensions
+    if "/" in name_clean or "\\" in name_clean or "[" in name_clean or "]" in name_clean:
+        return True
+    if any(name_clean.lower().endswith(ext) for ext in [".cmake", ".py", ".h", ".cc", ".cpp", ".txt", ".md"]):
+        return True
+    return False
+
+
+def run_command(command, cwd=".", silent=False):
+    result = subprocess.run(command, check=False, shell=True, capture_output=True, text=True, cwd=cwd, encoding="utf-8")
+    if result.returncode != 0:
+        if not silent:
+            print(f"Error running command: {command}\n{result.stderr}")
+        return None
+    return result.stdout
+
+
+def get_pr_number(subject):
+    match = re.search(r"\(#(\d+)\)$", subject.strip())
+    if match:
+        return match.group(1)
+    return None
+
+
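+# Example (hypothetical PR number): get_pr_number("Fix build (#12345)") returns
+# "12345", while a subject without the trailing "(#N)" suffix that squash merges
+# append returns None.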
+def get_pr_details(pr_number):
+    if pr_number in PR_CACHE:
+        return PR_CACHE[pr_number]
+
+    # Try as a PR first - fetch author and commits to get all contributors
+    output = run_command(f"gh pr view {pr_number} --json number,title,author,body,commits", silent=True)
+    if output:
+        details = json.loads(output)
+        PR_CACHE[pr_number] = details
+        return details
+
+    PR_CACHE[pr_number] = None
+    return None
+
+
+def extract_authors_from_pr(details):
+    authors = set()
+    if not details:
+        return authors
+
+    # Add main PR author
+    pr_login = None
+    if details.get("author"):
+        pr_login = details["author"]["login"]
+        authors.add(pr_login)
+
+    # Add authors from all commits in the PR
+    if "commits" in details:
+        for commit in details["commits"]:
+            for author_info in commit.get("authors", []):
+                login = author_info.get("login")
+                name = author_info.get("name")
+                if login:
+                    authors.add(login)
+                    if name:
+                        NAME_TO_LOGIN[name] = login
+                elif name:
+                    # HEURISTIC: If there is no login but there is a name,
+                    # and we have a PR author login, associate this name with the PR author.
+                    # This handles squash-merged PRs where Git name != GitHub handle.
+                    if pr_login:
+                        authors.add(pr_login)
+                        NAME_TO_LOGIN[name] = pr_login
+                    else:
+                        authors.add(name)
+
+    return authors
+
+
+def extract_authors_from_commit(commit_id):
+    authors = set()
+    # Format: AuthorName \n Body
+    info = run_command(f'git show -s --format="%an%n%B" {commit_id}', silent=True)
+    if not info:
+        return authors
+
+    lines = info.strip().splitlines()
+    if lines:
+        authors.add(lines[0])  # Main author name
+
+    # Look for Co-authored-by trailers
+    for line in lines:
+        if "co-authored-by:" in line.lower():
+            # Pattern: Co-authored-by: Name <email> or Co-authored-by: login <email>
+            match = re.search(r"co-authored-by:\s*(.*?)\s*<", line, re.IGNORECASE)
+            if match:
+                authors.add(match.group(1).strip())
+
+    return authors
+
+
+def extract_pr_numbers(text, strict=False):
+    if not text:
+        return []
+
+    if strict:
+        # Strict mode: Only look for (#123) with closing paren or full onnxruntime URLs
+        # This avoids noise from version numbers or external repo PRs
+        # And it avoids matching truncated headlines like (#25... as PR #25
+        patterns = [
+            r"\(#(\d+)\)",  # (#123)
+            r"microsoft/onnxruntime/pull/(\d+)",
+        ]
+        results = []
+        for p in patterns:
+            results.extend(re.findall(p, text))
+        return [int(x) for x in set(results)]
+
+    # Matches patterns like #123 or https://github.com/microsoft/onnxruntime/pull/123
+    # Also handles ( #123) or similar in titles
+    prs = re.findall(r"(?:#|/pull/)(\d+)", text)
+    return [int(x) for x in set(prs)]
+
+
+def get_prs_from_log(log_output, prs_base=None, log_file=None):
+    if not log_output:
+        return {}
+
+    all_prs = {}  # pr_number -> {title, authors, cherry_pick_commit, cherry_pick_pr}
+    lines = log_output.splitlines()
+    total_commits = len(lines)
+    commit_count = 0
+
+    log_event(f"Processing {total_commits} commits...", log_file)
+
+    for line in lines:
+        commit_count += 1
+        parts = line.split(" ", 1)
+        if len(parts) < 2:
+            continue
+        commit_id = parts[0]
+        subject = parts[1]
+
+        # Concise progress indicator
+        pr_num_str = get_pr_number(subject)
+        display_id = f"PR #{pr_num_str}" if pr_num_str else f"commit {commit_id}"
+        log_event(f"[{commit_count}/{total_commits}] Processing {display_id}...", log_file)
+
+        details = None
+        if pr_num_str:
+            if prs_base and pr_num_str in prs_base:
+                log_event(f" - PR #{pr_num_str} already in base branch, skipping.", log_file)
+                continue
+            details = get_pr_details(pr_num_str)
+
+        if details:
+            # Check if it's a cherry-pick round PR - scan deep to identify meta-PRs
+            is_meta_pr = (
+                "cherry pick" in subject.lower() or "cherry-pick" in subject.lower() or "cherrypick" in subject.lower()
+            )
+
+            if is_meta_pr and commit_count < MAX_CHERRY_PICK_SCAN:
+                log_event(f" - Meta-PR detected, expanding: {details['title']}", log_file)
+                # Collect Original PRs from Title, Body, and Commits
+                all_extracted_nums = []
+                all_extracted_nums.extend(extract_pr_numbers(details["title"]))
+                all_extracted_nums.extend(extract_pr_numbers(details["body"], strict=True))
+
+                commits_output = run_command(f"gh pr view {pr_num_str} --json commits", silent=True)
+                if commits_output:
+                    commits_data = json.loads(commits_output)
+                    for commit in commits_data.get("commits", []):
+                        all_extracted_nums.extend(extract_pr_numbers(commit.get("messageHeadline", ""), strict=True))
+                        all_extracted_nums.extend(extract_pr_numbers(commit.get("messageBody", ""), strict=True))
+
+                # Filter and Normalize
+                current_pr_int = int(pr_num_str)
+                valid_pr_nums = []
+                for op_num in set(all_extracted_nums):
+                    if op_num == current_pr_int:
+                        continue
+                    if abs(op_num - current_pr_int) < 5000:
+                        valid_pr_nums.append(str(op_num))
+
+                original_pr_nums = sorted(valid_pr_nums)
+                log_event(f" - Extracted sub-PR candidates: {original_pr_nums}", log_file)
+
+                if original_pr_nums:
+                    log_event(f" -> Found {len(original_pr_nums)} sub-PRs for expansion.", log_file)
+                    for op_num_str in original_pr_nums:
+                        if prs_base and op_num_str in prs_base:
+                            log_event(f" - Sub-PR #{op_num_str} already in base branch, skipping.", log_file)
+                            continue
+
+                        op_details = get_pr_details(op_num_str)
+                        if op_details:
+                            log_event(f" - Added Sub-PR #{op_num_str}: {op_details['title']}", log_file)
+                            all_prs[op_num_str] = {
+                                "title": op_details["title"],
+                                "authors": list(extract_authors_from_pr(op_details)),
+                                "cherry_pick_commit": commit_id,
+                                "cherry_pick_pr": pr_num_str,
+                            }
+                        else:
+                            # FALLBACK: Use Meta-PR authors if sub-PR fetch fails
+                            log_event(
+                                f" - Warning: Fetch failed for PR #{op_num_str}, using meta-PR authors fallback.",
+                                log_file,
+                            )
+                            meta_authors = extract_authors_from_pr(details)
+                            all_prs[op_num_str] = {
+                                "title": f"Original PR #{op_num_str} (details missing)",
f"Original PR #{op_num_str} (details missing)", + "authors": list(meta_authors), + "cherry_pick_commit": commit_id, + "cherry_pick_pr": pr_num_str, + } + else: + log_event(" - No sub-PRs found, treating meta-PR as a normal PR.", log_file) + all_prs[pr_num_str] = { + "title": details["title"], + "authors": list(extract_authors_from_pr(details)), + "cherry_pick_commit": commit_id, + "cherry_pick_pr": None, + } + else: + log_event(f" - Added PR #{pr_num_str}: {details['title']}", log_file) + all_prs[pr_num_str] = { + "title": details["title"], + "authors": list(extract_authors_from_pr(details)), + "cherry_pick_commit": commit_id, + "cherry_pick_pr": None, + } + else: + # Not a PR OR PR detail fetch failed (e.g. it was an issue or deleted PR) + # Use git commit author as the reliable fallback + if pr_num_str: + log_event( + f" - PR #{pr_num_str} lookup failed (possibly issue or deleted). Falling back to commit author.", + log_file, + ) + authors = extract_authors_from_commit(commit_id) + if authors: + log_event(f" - Added commit {commit_id} with authors: {list(authors)}", log_file) + all_prs[f"commit_{commit_id}"] = { + "title": subject, + "authors": list(authors), + "cherry_pick_commit": commit_id, + "cherry_pick_pr": None, + } + + return all_prs + + +def main(): + parser = argparse.ArgumentParser(description="Compile contributor list from Git log comparison.") + parser.add_argument("--base", default="origin/rel-1.23.2", help="Base branch/commit to compare from") + parser.add_argument("--target", default="origin/rel-1.24.1", help="Target branch/commit to compare to") + parser.add_argument("--dir", default="contributors", help="Output directory for reports and logs") + args = parser.parse_args() + + branch_base = args.base + branch_target = args.target + output_dir = args.dir + + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + logs_path = os.path.join(output_dir, "logs.txt") + with open(logs_path, "w", encoding="utf-8") as log_file: + log_event(f"Starting comparison: {branch_base} -> {branch_target}", log_file) + + # 1. Fetch base branch PRs (scan depth controlled by MAX_CHERRY_PICK_SCAN) + log_event(f"Fetching base branch history for {branch_base} (last {MAX_CHERRY_PICK_SCAN})...", log_file) + log_base = run_command(f"git log {branch_base} -n {MAX_CHERRY_PICK_SCAN} --oneline") + prs_base_dict = get_prs_from_log(log_base, prs_base=None, log_file=log_file) + prs_base = set(prs_base_dict.keys()) + + # 2. 
+        log_event(f"Fetching target branch history: {branch_base}..{branch_target}...", log_file)
+        log_target = run_command(f"git log {branch_base}..{branch_target} --oneline")
+        prs_target = get_prs_from_log(log_target, prs_base=prs_base, log_file=log_file)
+
+        # All PRs in target but not in base (deduplicated by key)
+        new_pr_keys = set(prs_target.keys())
+
+        contributors = {}  # username -> count
+        details = []
+
+        for key in sorted(new_pr_keys, key=lambda x: str(x)):
+            info = prs_target[key]
+            authors = info.get("authors", [])
+
+            # Count each author separately
+            for author in authors:
+                contributors[author] = contributors.get(author, 0) + 1
+
+            details.append(
+                {
+                    "original_pr": key,
+                    "title": info["title"],
+                    "authors": "; ".join(authors),
+                    "target_commit": info["cherry_pick_commit"],
+                    "cherry_pick_pr": info["cherry_pick_pr"],
+                }
+            )
+
+        # Consolidation Pass
+        consolidated_contributors = {}  # login_lower -> count
+        display_names = {}  # login_lower -> original_casing
+        raw_contributors = {}  # login_lower -> count
+
+        for contributor, count in contributors.items():
+            # Map to final identity
+            final_author = NAME_TO_LOGIN.get(contributor, contributor)
+            author_lower = final_author.lower()
+
+            if author_lower not in display_names:
+                display_names[author_lower] = final_author
+
+            raw_contributors[author_lower] = raw_contributors.get(author_lower, 0) + count
+
+            # Human-only for summary: Filter bots AND invalid/path strings
+            if not is_bot(contributor) and not is_invalid(contributor):
+                if not is_bot(final_author) and not is_invalid(final_author):
+                    consolidated_contributors[author_lower] = consolidated_contributors.get(author_lower, 0) + count
+
+        # Sort human contributors by count descending for summary
+        sorted_contributors = sorted(consolidated_contributors.items(), key=lambda x: x[1], reverse=True)
+
+        log_event("\n--- Summary ---", log_file)
+        # Prefix only identified GitHub logins (no spaces) and format as markdown links
+        output_users = []
+        for login_lower, _count in sorted_contributors:
+            u = display_names[login_lower]
+            if " " not in u:
+                output_users.append(f"[@{u}](https://github.com/{u})")
+            else:
+                output_users.append(u)
+
+        # Summary text as a single line for best copy-paste behavior in GitHub
+        summary_text = ", ".join(output_users)
+        log_event(summary_text, log_file)
+
+        # Write details to CSV in the output directory
+        csv_path = os.path.join(output_dir, "detail.csv")
+        with open(csv_path, "w", newline="", encoding="utf-8") as f:
+            writer = csv.DictWriter(
+                f, fieldnames=["original_pr", "authors", "title", "target_commit", "cherry_pick_pr"]
+            )
+            writer.writeheader()
+            for row in details:
+                # Consolidate authors in CSV as well
+                authors_list = [a.strip() for a in row["authors"].split(";")]
+                consolidated_authors = []
+                for a in authors_list:
+                    # In CSV, we KEEP bots but filter out truly invalid entries (paths)
+                    final_a = NAME_TO_LOGIN.get(a, a)
+                    # Normalize for uniqueness check
+                    if not is_invalid(final_a):
+                        consolidated_authors.append(final_a)
+
+                # Deduplicate while preserving case-insensitive uniqueness
+                unique_authors = {}
+                for a in consolidated_authors:
+                    unique_authors[a.lower()] = a
+                row["authors"] = "; ".join(sorted(unique_authors.values(), key=lambda x: x.lower()))
+                writer.writerow(row)
+
+        log_event(
+            f"\nDetailed information written to {csv_path}. Total human contributors: {len(consolidated_contributors)}",
+            log_file,
+        )
+
+
+if __name__ == "__main__":
+    main()

From 1bdc09b83a63cf66d3d8c496edb60ce60932ebc2 Mon Sep 17 00:00:00 2001
From: Tianlei Wu
Date: Sat, 7 Feb 2026 11:00:44 -0800
Subject: [PATCH 2/5] Update tools/python/compile_contributors.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 tools/python/compile_contributors.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/python/compile_contributors.py b/tools/python/compile_contributors.py
index f2e788ac2512b..b1eae5617ea96 100644
--- a/tools/python/compile_contributors.py
+++ b/tools/python/compile_contributors.py
@@ -367,7 +367,7 @@ def main():
         contributors = {}  # username -> count
         details = []
 
-        for key in sorted(new_pr_keys, key=lambda x: str(x)):
+        for key in sorted(new_pr_keys, key=str):
             info = prs_target[key]
             authors = info.get("authors", [])
 

From 6a4d3d871019b5c4cf0bd5dc5d217f92a06c3361 Mon Sep 17 00:00:00 2001
From: Tianlei Wu
Date: Sat, 7 Feb 2026 11:24:52 -0800
Subject: [PATCH 3/5] review feedback

---
 tools/python/compile_contributors.py | 118 +++++++++++++++++----------
 1 file changed, 74 insertions(+), 44 deletions(-)

diff --git a/tools/python/compile_contributors.py b/tools/python/compile_contributors.py
index b1eae5617ea96..cd2cddfead272 100644
--- a/tools/python/compile_contributors.py
+++ b/tools/python/compile_contributors.py
@@ -20,6 +20,7 @@
     - logs.txt: Processing logs and summary (a humans-only contributor list suitable for release notes).
 
 Requirements:
+    - Python 3.7+
     - GitHub CLI (gh) logged in.
 """
 
@@ -41,8 +42,42 @@ def log_event(message, log_file=None):
         log_file.write(full_message + "\n")
 
 
+def run_command(command_list, cwd=".", silent=False):
+    """Run a command using a list of arguments for security (no shell=True)."""
+    result = subprocess.run(command_list, check=False, capture_output=True, text=True, cwd=cwd, encoding="utf-8")
+    if result.returncode != 0:
+        if not silent:
+            log_str = " ".join(command_list)
+            print(f"Error running command: {log_str}")
+            if result.stderr:
+                print(f"Stderr: {result.stderr.strip()}")
+        return None
+    return result.stdout
+
+
+def check_preflight():
+    """Verify early that the GitHub CLI is available and that we are inside a git repository."""
+    # Check git
+    git_check = run_command(["git", "rev-parse", "--is-inside-work-tree"], silent=True)
+    if not git_check:
+        print("Error: This script must be run inside a git repository.")
+        return False
+
+    # Check gh
+    gh_check = run_command(["gh", "--version"], silent=True)
+    if not gh_check:
+        print("Error: GitHub CLI (gh) not found or not in PATH.")
+        return False
+
+    gh_auth = run_command(["gh", "auth", "status"], silent=True)
+    if not gh_auth:
+        print("Error: GitHub CLI not authenticated. Please run 'gh auth login'.")
+        return False
+
+    return True
+
+
 # Constants
-MAX_CHERRY_PICK_SCAN = 50
 PR_CACHE = {}  # Cache of PR details so PRs referenced multiple times are fetched only once
 NAME_TO_LOGIN = {}  # Map full names to GitHub logins for consolidation
 
 # Bots to exclude from contributor lists
 BOT_NAMES = {
@@ -93,15 +128,6 @@ def is_invalid(name):
     return False
 
 
-def run_command(command, cwd=".", silent=False):
-    result = subprocess.run(command, check=False, shell=True, capture_output=True, text=True, cwd=cwd, encoding="utf-8")
-    if result.returncode != 0:
-        if not silent:
-            print(f"Error running command: {command}\n{result.stderr}")
-        return None
-    return result.stdout
-
-
 def get_pr_number(subject):
     match = re.search(r"\(#(\d+)\)$", subject.strip())
     if match:
@@ -114,7 +140,7 @@ def get_pr_details(pr_number):
         return PR_CACHE[pr_number]
 
     # Try as a PR first - fetch author and commits to get all contributors
-    output = run_command(f"gh pr view {pr_number} --json number,title,author,body,commits", silent=True)
+    output = run_command(["gh", "pr", "view", pr_number, "--json", "number,title,author,body,commits"], silent=True)
     if output:
         details = json.loads(output)
         PR_CACHE[pr_number] = details
@@ -161,7 +187,7 @@ def extract_authors_from_pr(details):
 def extract_authors_from_commit(commit_id):
     authors = set()
     # Format: AuthorName \n Body
-    info = run_command(f'git show -s --format="%an%n%B" {commit_id}', silent=True)
+    info = run_command(["git", "show", "-s", "--format=%an%n%B", commit_id], silent=True)
     if not info:
         return authors
 
@@ -203,7 +229,7 @@ def extract_pr_numbers(text, strict=False):
     return [int(x) for x in set(prs)]
 
 
-def get_prs_from_log(log_output, prs_base=None, log_file=None):
+def get_prs_from_log(log_output, prs_base=None, log_file=None, scan_depth=100):
     if not log_output:
         return {}
 
@@ -235,40 +261,42 @@ def get_prs_from_log(log_output, prs_base=None, log_file=None):
             details = get_pr_details(pr_num_str)
 
         if details:
-            # Check if it's a cherry-pick round PR - scan deep to identify meta-PRs
+            # Check if it's a cherry-pick round PR
             is_meta_pr = (
                 "cherry pick" in subject.lower() or "cherry-pick" in subject.lower() or "cherrypick" in subject.lower()
             )
 
-            if is_meta_pr and commit_count < MAX_CHERRY_PICK_SCAN:
+            if is_meta_pr and commit_count < scan_depth:
                 log_event(f" - Meta-PR detected, expanding: {details['title']}", log_file)
                 # Collect Original PRs from Title, Body, and Commits
                 all_extracted_nums = []
-                all_extracted_nums.extend(extract_pr_numbers(details["title"]))
+                # Use strict extraction even for titles to avoid matching plain issue references like #26985
+                all_extracted_nums.extend(extract_pr_numbers(details["title"], strict=True))
                 all_extracted_nums.extend(extract_pr_numbers(details["body"], strict=True))
 
-                commits_output = run_command(f"gh pr view {pr_num_str} --json commits", silent=True)
-                if commits_output:
-                    commits_data = json.loads(commits_output)
-                    for commit in commits_data.get("commits", []):
-                        all_extracted_nums.extend(extract_pr_numbers(commit.get("messageHeadline", ""), strict=True))
-                        all_extracted_nums.extend(extract_pr_numbers(commit.get("messageBody", ""), strict=True))
+                # Reuse commits already fetched in get_pr_details to avoid an extra gh CLI call
+                for commit in details.get("commits", []):
+                    all_extracted_nums.extend(extract_pr_numbers(commit.get("messageHeadline", ""), strict=True))
+                    all_extracted_nums.extend(extract_pr_numbers(commit.get("messageBody", ""), strict=True))
 
                 # Filter and Normalize
                 current_pr_int = int(pr_num_str)
-                valid_pr_nums = []
+                valid_pr_ints = []
                 for op_num in set(all_extracted_nums):
                     if op_num == current_pr_int:
                         continue
+                    # Only accept reasonably recent PRs to avoid noise
                     if abs(op_num - current_pr_int) < 5000:
-                        valid_pr_nums.append(str(op_num))
+                        valid_pr_ints.append(op_num)
 
-                original_pr_nums = sorted(valid_pr_nums)
-                log_event(f" - Extracted sub-PR candidates: {original_pr_nums}", log_file)
+                # Sort numerically so that 100 sorts after 99 (a string sort would put "100" first)
+                original_pr_ints = sorted(valid_pr_ints)
+                log_event(f" - Extracted sub-PR candidates: {original_pr_ints}", log_file)
 
-                if original_pr_nums:
-                    log_event(f" -> Found {len(original_pr_nums)} sub-PRs for expansion.", log_file)
-                    for op_num_str in original_pr_nums:
+                if original_pr_ints:
+                    log_event(f" -> Found {len(original_pr_ints)} sub-PRs for expansion.", log_file)
+                    for op_num in original_pr_ints:
+                        op_num_str = str(op_num)
                         if prs_base and op_num_str in prs_base:
                             log_event(f" - Sub-PR #{op_num_str} already in base branch, skipping.", log_file)
                             continue
@@ -283,18 +311,12 @@ def get_prs_from_log(log_output, prs_base=None, log_file=None, scan_depth=100):
                                 "cherry_pick_pr": pr_num_str,
                             }
                         else:
-                            # FALLBACK: Use Meta-PR authors if sub-PR fetch fails
+                            # If we can't resolve this number as a PR, do not fabricate an entry.
+                            # It may be an issue reference or an inaccessible/deleted PR.
                             log_event(
-                                f" - Warning: Fetch failed for PR #{op_num_str}, using meta-PR authors fallback.",
+                                f" - Warning: Unable to resolve PR #{op_num_str} via GitHub CLI; skipping.",
                                 log_file,
                             )
-                            meta_authors = extract_authors_from_pr(details)
-                            all_prs[op_num_str] = {
-                                "title": f"Original PR #{op_num_str} (details missing)",
-                                "authors": list(meta_authors),
-                                "cherry_pick_commit": commit_id,
-                                "cherry_pick_pr": pr_num_str,
-                            }
                 else:
                     log_event(" - No sub-PRs found, treating meta-PR as a normal PR.", log_file)
                     all_prs[pr_num_str] = {
@@ -337,11 +359,17 @@ def main():
     parser.add_argument("--base", default="origin/rel-1.23.2", help="Base branch/commit to compare from")
     parser.add_argument("--target", default="origin/rel-1.24.1", help="Target branch/commit to compare to")
     parser.add_argument("--dir", default="contributors", help="Output directory for reports and logs")
+    parser.add_argument("--scan-depth", type=int, default=100, help="Depth to scan base/meta-PRs for deduplication")
     args = parser.parse_args()
 
+    # Early validation
+    if not check_preflight():
+        return
+
     branch_base = args.base
     branch_target = args.target
     output_dir = args.dir
+    scan_depth = args.scan_depth
 
     if not os.path.exists(output_dir):
         os.makedirs(output_dir)
@@ -350,16 +378,17 @@ def main():
     with open(logs_path, "w", encoding="utf-8") as log_file:
         log_event(f"Starting comparison: {branch_base} -> {branch_target}", log_file)
 
-        # 1. Fetch base branch PRs (scan depth controlled by MAX_CHERRY_PICK_SCAN)
-        log_event(f"Fetching base branch history for {branch_base} (last {MAX_CHERRY_PICK_SCAN})...", log_file)
-        log_base = run_command(f"git log {branch_base} -n {MAX_CHERRY_PICK_SCAN} --oneline")
-        prs_base_dict = get_prs_from_log(log_base, prs_base=None, log_file=log_file)
+        # 1. Fetch base branch PRs (scan depth controlled by scan_depth)
+        log_event(f"Fetching base branch history for {branch_base} (last {scan_depth})...", log_file)
+        log_base = run_command(["git", "log", branch_base, "-n", str(scan_depth), "--oneline"])
+        prs_base_dict = get_prs_from_log(log_base, prs_base=None, log_file=log_file, scan_depth=scan_depth)
         prs_base = set(prs_base_dict.keys())
 
         # 2. Fetch target branch PRs (only those not in base)
         log_event(f"Fetching target branch history: {branch_base}..{branch_target}...", log_file)
-        log_target = run_command(f"git log {branch_base}..{branch_target} --oneline")
-        prs_target = get_prs_from_log(log_target, prs_base=prs_base, log_file=log_file)
+        # A..B lists commits reachable from the target but not from the base
+        log_target = run_command(["git", "log", f"{branch_base}..{branch_target}", "--oneline"])
+        prs_target = get_prs_from_log(log_target, prs_base=prs_base, log_file=log_file, scan_depth=scan_depth)
 
         # All PRs in target but not in base (deduplicated by key)
         new_pr_keys = set(prs_target.keys())
@@ -367,6 +396,7 @@ def main():
         contributors = {}  # username -> count
         details = []
 
+        # Sort keys as strings for a stable, deterministic order
         for key in sorted(new_pr_keys, key=str):
             info = prs_target[key]
             authors = info.get("authors", [])

From d02b8db509600dfb6a15a2e8a77d1475ce4e20e7 Mon Sep 17 00:00:00 2001
From: Tianlei Wu
Date: Sat, 7 Feb 2026 11:35:31 -0800
Subject: [PATCH 4/5] error handling

---
 tools/python/compile_contributors.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/tools/python/compile_contributors.py b/tools/python/compile_contributors.py
index cd2cddfead272..e1fa899bd2452 100644
--- a/tools/python/compile_contributors.py
+++ b/tools/python/compile_contributors.py
@@ -381,6 +381,13 @@ def main():
         # 1. Fetch base branch PRs (scan depth controlled by scan_depth)
         log_event(f"Fetching base branch history for {branch_base} (last {scan_depth})...", log_file)
         log_base = run_command(["git", "log", branch_base, "-n", str(scan_depth), "--oneline"])
+        if log_base is None:
+            log_event(
+                f"Error: Could not fetch history for base ref '{branch_base}'. Please check if the ref exists.",
+                log_file,
+            )
+            return
+
         prs_base_dict = get_prs_from_log(log_base, prs_base=None, log_file=log_file, scan_depth=scan_depth)
         prs_base = set(prs_base_dict.keys())
 
@@ -388,6 +395,13 @@ def main():
         log_event(f"Fetching target branch history: {branch_base}..{branch_target}...", log_file)
         # A..B lists commits reachable from the target but not from the base
         log_target = run_command(["git", "log", f"{branch_base}..{branch_target}", "--oneline"])
+        if log_target is None:
+            log_event(
+                f"Error: Could not fetch history for range '{branch_base}..{branch_target}'. Please check if the refs exist.",
+                log_file,
+            )
+            return
+
         prs_target = get_prs_from_log(log_target, prs_base=prs_base, log_file=log_file, scan_depth=scan_depth)
 
         # All PRs in target but not in base (deduplicated by key)

From 2dd7aef5d3f09321637727dc09a9e93eeb895fc1 Mon Sep 17 00:00:00 2001
From: Tianlei Wu
Date: Sat, 7 Feb 2026 15:17:11 -0800
Subject: [PATCH 5/5] deterministic

---
 tools/python/compile_contributors.py | 33 ++++++++++++++++++----------
 1 file changed, 22 insertions(+), 11 deletions(-)

diff --git a/tools/python/compile_contributors.py b/tools/python/compile_contributors.py
index e1fa899bd2452..494b0f91c5381 100644
--- a/tools/python/compile_contributors.py
+++ b/tools/python/compile_contributors.py
@@ -80,6 +80,7 @@
 # Constants
 PR_CACHE = {}  # Cache of PR details so PRs referenced multiple times are fetched only once
 NAME_TO_LOGIN = {}  # Map full names to GitHub logins for consolidation
+VERIFIED_LOGINS = set()  # Track IDs known to be valid GitHub logins (vs free-form names)
 
 # Bots to exclude from contributor lists
 BOT_NAMES = {
@@ -160,6 +161,7 @@ def extract_authors_from_pr(details):
     if details.get("author"):
         pr_login = details["author"]["login"]
         authors.add(pr_login)
+        VERIFIED_LOGINS.add(pr_login.lower())
 
     # Add authors from all commits in the PR
     if "commits" in details:
@@ -169,6 +171,7 @@ def extract_authors_from_pr(details):
             name = author_info.get("name")
             if login:
                 authors.add(login)
+                VERIFIED_LOGINS.add(login.lower())
                 if name:
                     NAME_TO_LOGIN[name] = login
             elif name:
@@ -285,9 +288,7 @@ def get_prs_from_log(log_output, prs_base=None, log_file=None, scan_depth=100):
                 for op_num in set(all_extracted_nums):
                     if op_num == current_pr_int:
                         continue
-                    # Only accept reasonably recent PRs to avoid noise
-                    if abs(op_num - current_pr_int) < 5000:
-                        valid_pr_ints.append(op_num)
+                    valid_pr_ints.append(op_num)
 
                 # Sort numerically so that 100 sorts after 99 (a string sort would put "100" first)
                 original_pr_ints = sorted(valid_pr_ints)
@@ -311,12 +312,22 @@ def get_prs_from_log(log_output, prs_base=None, log_file=None, scan_depth=100):
                                 "cherry_pick_pr": pr_num_str,
                             }
                         else:
-                            # If we can't resolve this number as a PR, do not fabricate an entry.
-                            # It may be an issue reference or an inaccessible/deleted PR.
+                            # If we can't resolve this number as a PR (e.g., issue reference or inaccessible/deleted PR),
+                            # do not invent new authors, but still attribute it to the known meta-PR to avoid losing credit.
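+                            # Example (hypothetical numbers): if meta-PR #26000 lists "(#25990)" in its
+                            # body but `gh pr view 25990` fails, #25990 is still recorded below, credited
+                            # to the meta-PR's own authors.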
                             log_event(
-                                f" - Warning: Unable to resolve PR #{op_num_str} via GitHub CLI; skipping.",
+                                f" - Warning: Unable to resolve PR #{op_num_str} via GitHub CLI; attributing via meta-PR #{pr_num_str}.",
                                 log_file,
                             )
+                            if op_num_str not in all_prs:
+                                fallback_title = (
+                                    f"Unresolved sub-PR #{op_num_str} (attributed via meta-PR #{pr_num_str})"
+                                )
+                                all_prs[op_num_str] = {
+                                    "title": fallback_title,
+                                    "authors": list(extract_authors_from_pr(details)),
+                                    "cherry_pick_commit": commit_id,
+                                    "cherry_pick_pr": pr_num_str,
+                                }
                 else:
                     log_event(" - No sub-PRs found, treating meta-PR as a normal PR.", log_file)
                     all_prs[pr_num_str] = {
@@ -359,7 +370,7 @@ def main():
     parser.add_argument("--base", default="origin/rel-1.23.2", help="Base branch/commit to compare from")
     parser.add_argument("--target", default="origin/rel-1.24.1", help="Target branch/commit to compare to")
    parser.add_argument("--dir", default="contributors", help="Output directory for reports and logs")
-    parser.add_argument("--scan-depth", type=int, default=100, help="Depth to scan base/meta-PRs for deduplication")
+    parser.add_argument("--scan-depth", type=int, default=200, help="Depth to scan base/meta-PRs for deduplication")
     args = parser.parse_args()
 
     # Early validation
@@ -449,15 +460,15 @@ def main():
             if not is_bot(final_author) and not is_invalid(final_author):
                 consolidated_contributors[author_lower] = consolidated_contributors.get(author_lower, 0) + count
 
-        # Sort human contributors by count descending for summary
-        sorted_contributors = sorted(consolidated_contributors.items(), key=lambda x: x[1], reverse=True)
+        # Sort human contributors by count descending, then alphabetically by identity for determinism
+        sorted_contributors = sorted(consolidated_contributors.items(), key=lambda x: (-x[1], x[0]))
 
         log_event("\n--- Summary ---", log_file)
-        # Prefix only identified GitHub logins (no spaces) and format as markdown links
+        # Prefix only identified GitHub logins and format as markdown links
         output_users = []
         for login_lower, _count in sorted_contributors:
             u = display_names[login_lower]
-            if " " not in u:
+            if login_lower in VERIFIED_LOGINS:
                 output_users.append(f"[@{u}](https://github.com/{u})")
             else:
                 output_users.append(u)