Skip to content

Commit 6a4d3d8

Browse files
committed
review feedback
1 parent 1bdc09b commit 6a4d3d8

File tree

1 file changed

+74
-44
lines changed

1 file changed

+74
-44
lines changed

tools/python/compile_contributors.py

Lines changed: 74 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
- logs.txt: Processing logs and summary (professional humans-only contributor list for release notes).
2121
2222
Requirements:
23+
- Python 3.7+
2324
- GitHub CLI (gh) logged in.
2425
"""
2526

@@ -41,8 +42,42 @@ def log_event(message, log_file=None):
4142
log_file.write(full_message + "\n")
4243

4344

45+
def run_command(command_list, cwd=".", silent=False):
46+
"""Run a command using a list of arguments for security (no shell=True)."""
47+
result = subprocess.run(command_list, check=False, capture_output=True, text=True, cwd=cwd, encoding="utf-8")
48+
if result.returncode != 0:
49+
if not silent:
50+
log_str = " ".join(command_list)
51+
print(f"Error running command: {log_str}")
52+
if result.stderr:
53+
print(f"Stderr: {result.stderr.strip()}")
54+
return None
55+
return result.stdout
56+
57+
58+
def check_preflight():
59+
"""Verify gh CLI and git repository early."""
60+
# Check git
61+
git_check = run_command(["git", "rev-parse", "--is-inside-work-tree"], silent=True)
62+
if not git_check:
63+
print("Error: This script must be run inside a git repository.")
64+
return False
65+
66+
# Check gh
67+
gh_check = run_command(["gh", "--version"], silent=True)
68+
if not gh_check:
69+
print("Error: GitHub CLI (gh) not found or not in PATH.")
70+
return False
71+
72+
gh_auth = run_command(["gh", "auth", "status"], silent=True)
73+
if not gh_auth:
74+
print("Error: GitHub CLI not authenticated. Please run 'gh auth login'.")
75+
return False
76+
77+
return True
78+
79+
4480
# Constants
45-
MAX_CHERRY_PICK_SCAN = 50
4681
PR_CACHE = {} # Cache for PR details to speed up multiple rounds referencing same PRs
4782
NAME_TO_LOGIN = {} # Map full names to GitHub logins for consolidation
4883

@@ -93,15 +128,6 @@ def is_invalid(name):
93128
return False
94129

95130

96-
def run_command(command, cwd=".", silent=False):
97-
result = subprocess.run(command, check=False, shell=True, capture_output=True, text=True, cwd=cwd, encoding="utf-8")
98-
if result.returncode != 0:
99-
if not silent:
100-
print(f"Error running command: {command}\n{result.stderr}")
101-
return None
102-
return result.stdout
103-
104-
105131
def get_pr_number(subject):
106132
match = re.search(r"\(#(\d+)\)$", subject.strip())
107133
if match:
@@ -114,7 +140,7 @@ def get_pr_details(pr_number):
114140
return PR_CACHE[pr_number]
115141

116142
# Try as a PR first - fetch author and commits to get all contributors
117-
output = run_command(f"gh pr view {pr_number} --json number,title,author,body,commits", silent=True)
143+
output = run_command(["gh", "pr", "view", pr_number, "--json", "number,title,author,body,commits"], silent=True)
118144
if output:
119145
details = json.loads(output)
120146
PR_CACHE[pr_number] = details
@@ -161,7 +187,7 @@ def extract_authors_from_pr(details):
161187
def extract_authors_from_commit(commit_id):
162188
authors = set()
163189
# Format: AuthorName \n Body
164-
info = run_command(f'git show -s --format="%an%n%B" {commit_id}', silent=True)
190+
info = run_command(["git", "show", "-s", "--format=%an%n%B", commit_id], silent=True)
165191
if not info:
166192
return authors
167193

@@ -203,7 +229,7 @@ def extract_pr_numbers(text, strict=False):
203229
return [int(x) for x in set(prs)]
204230

205231

206-
def get_prs_from_log(log_output, prs_base=None, log_file=None):
232+
def get_prs_from_log(log_output, prs_base=None, log_file=None, scan_depth=100):
207233
if not log_output:
208234
return {}
209235

@@ -235,40 +261,42 @@ def get_prs_from_log(log_output, prs_base=None, log_file=None):
235261
details = get_pr_details(pr_num_str)
236262

237263
if details:
238-
# Check if it's a cherry-pick round PR - scan deep to identify meta-PRs
264+
# Check if it's a cherry-pick round PR
239265
is_meta_pr = (
240266
"cherry pick" in subject.lower() or "cherry-pick" in subject.lower() or "cherrypick" in subject.lower()
241267
)
242268

243-
if is_meta_pr and commit_count < MAX_CHERRY_PICK_SCAN:
269+
if is_meta_pr and commit_count < scan_depth:
244270
log_event(f" - Meta-PR detected, expanding: {details['title']}", log_file)
245271
# Collect Original PRs from Title, Body, and Commits
246272
all_extracted_nums = []
247-
all_extracted_nums.extend(extract_pr_numbers(details["title"]))
273+
# Use strict extraction even for titles to avoid matching issues like #26985
274+
all_extracted_nums.extend(extract_pr_numbers(details["title"], strict=True))
248275
all_extracted_nums.extend(extract_pr_numbers(details["body"], strict=True))
249276

250-
commits_output = run_command(f"gh pr view {pr_num_str} --json commits", silent=True)
251-
if commits_output:
252-
commits_data = json.loads(commits_output)
253-
for commit in commits_data.get("commits", []):
254-
all_extracted_nums.extend(extract_pr_numbers(commit.get("messageHeadline", ""), strict=True))
255-
all_extracted_nums.extend(extract_pr_numbers(commit.get("messageBody", ""), strict=True))
277+
# Reuse commits already fetched in get_pr_details to avoid an extra gh CLI call
278+
for commit in details.get("commits", []):
279+
all_extracted_nums.extend(extract_pr_numbers(commit.get("messageHeadline", ""), strict=True))
280+
all_extracted_nums.extend(extract_pr_numbers(commit.get("messageBody", ""), strict=True))
256281

257282
# Filter and Normalize
258283
current_pr_int = int(pr_num_str)
259-
valid_pr_nums = []
284+
valid_pr_ints = []
260285
for op_num in set(all_extracted_nums):
261286
if op_num == current_pr_int:
262287
continue
288+
# Only accept reasonably recent PRs to avoid noise
263289
if abs(op_num - current_pr_int) < 5000:
264-
valid_pr_nums.append(str(op_num))
290+
valid_pr_ints.append(op_num)
265291

266-
original_pr_nums = sorted(valid_pr_nums)
267-
log_event(f" - Extracted sub-PR candidates: {original_pr_nums}", log_file)
292+
# Sorting results numerically (100 > 99)
293+
original_pr_ints = sorted(valid_pr_ints)
294+
log_event(f" - Extracted sub-PR candidates: {original_pr_ints}", log_file)
268295

269-
if original_pr_nums:
270-
log_event(f" -> Found {len(original_pr_nums)} sub-PRs for expansion.", log_file)
271-
for op_num_str in original_pr_nums:
296+
if original_pr_ints:
297+
log_event(f" -> Found {len(original_pr_ints)} sub-PRs for expansion.", log_file)
298+
for op_num in original_pr_ints:
299+
op_num_str = str(op_num)
272300
if prs_base and op_num_str in prs_base:
273301
log_event(f" - Sub-PR #{op_num_str} already in base branch, skipping.", log_file)
274302
continue
@@ -283,18 +311,12 @@ def get_prs_from_log(log_output, prs_base=None, log_file=None):
283311
"cherry_pick_pr": pr_num_str,
284312
}
285313
else:
286-
# FALLBACK: Use Meta-PR authors if sub-PR fetch fails
314+
# If we can't resolve this number as a PR, do not fabricate an entry.
315+
# It may be an issue reference or an inaccessible/deleted PR.
287316
log_event(
288-
f" - Warning: Fetch failed for PR #{op_num_str}, using meta-PR authors fallback.",
317+
f" - Warning: Unable to resolve PR #{op_num_str} via GitHub CLI; skipping.",
289318
log_file,
290319
)
291-
meta_authors = extract_authors_from_pr(details)
292-
all_prs[op_num_str] = {
293-
"title": f"Original PR #{op_num_str} (details missing)",
294-
"authors": list(meta_authors),
295-
"cherry_pick_commit": commit_id,
296-
"cherry_pick_pr": pr_num_str,
297-
}
298320
else:
299321
log_event(" - No sub-PRs found, treating meta-PR as a normal PR.", log_file)
300322
all_prs[pr_num_str] = {
@@ -337,11 +359,17 @@ def main():
337359
parser.add_argument("--base", default="origin/rel-1.23.2", help="Base branch/commit to compare from")
338360
parser.add_argument("--target", default="origin/rel-1.24.1", help="Target branch/commit to compare to")
339361
parser.add_argument("--dir", default="contributors", help="Output directory for reports and logs")
362+
parser.add_argument("--scan-depth", type=int, default=100, help="Depth to scan base/meta-PRs for deduplication")
340363
args = parser.parse_args()
341364

365+
# Early validation
366+
if not check_preflight():
367+
return
368+
342369
branch_base = args.base
343370
branch_target = args.target
344371
output_dir = args.dir
372+
scan_depth = args.scan_depth
345373

346374
if not os.path.exists(output_dir):
347375
os.makedirs(output_dir)
@@ -350,23 +378,25 @@ def main():
350378
with open(logs_path, "w", encoding="utf-8") as log_file:
351379
log_event(f"Starting comparison: {branch_base} -> {branch_target}", log_file)
352380

353-
# 1. Fetch base branch PRs (scan depth controlled by MAX_CHERRY_PICK_SCAN)
354-
log_event(f"Fetching base branch history for {branch_base} (last {MAX_CHERRY_PICK_SCAN})...", log_file)
355-
log_base = run_command(f"git log {branch_base} -n {MAX_CHERRY_PICK_SCAN} --oneline")
356-
prs_base_dict = get_prs_from_log(log_base, prs_base=None, log_file=log_file)
381+
# 1. Fetch base branch PRs (scan depth controlled by scan_depth)
382+
log_event(f"Fetching base branch history for {branch_base} (last {scan_depth})...", log_file)
383+
log_base = run_command(["git", "log", branch_base, "-n", str(scan_depth), "--oneline"])
384+
prs_base_dict = get_prs_from_log(log_base, prs_base=None, log_file=log_file, scan_depth=scan_depth)
357385
prs_base = set(prs_base_dict.keys())
358386

359387
# 2. Fetch target branch PRs (only those not in base)
360388
log_event(f"Fetching target branch history: {branch_base}..{branch_target}...", log_file)
361-
log_target = run_command(f"git log {branch_base}..{branch_target} --oneline")
362-
prs_target = get_prs_from_log(log_target, prs_base=prs_base, log_file=log_file)
389+
# Using A..B syntax for git log
390+
log_target = run_command(["git", "log", f"{branch_base}..{branch_target}", "--oneline"])
391+
prs_target = get_prs_from_log(log_target, prs_base=prs_base, log_file=log_file, scan_depth=scan_depth)
363392

364393
# All PRs in target but not in base (deduplicated by key)
365394
new_pr_keys = set(prs_target.keys())
366395

367396
contributors = {} # username -> count
368397
details = []
369398

399+
# Use str directly as key for sorting
370400
for key in sorted(new_pr_keys, key=str):
371401
info = prs_target[key]
372402
authors = info.get("authors", [])

0 commit comments

Comments
 (0)