2020 - logs.txt: Processing logs and summary (professional humans-only contributor list for release notes).
2121
2222Requirements:
23+ - Python 3.7+
2324 - GitHub CLI (gh) logged in.
2425"""
2526
@@ -41,8 +42,42 @@ def log_event(message, log_file=None):
4142 log_file .write (full_message + "\n " )
4243
4344
45+ def run_command (command_list , cwd = "." , silent = False ):
46+ """Run a command using a list of arguments for security (no shell=True)."""
47+ result = subprocess .run (command_list , check = False , capture_output = True , text = True , cwd = cwd , encoding = "utf-8" )
48+ if result .returncode != 0 :
49+ if not silent :
50+ log_str = " " .join (command_list )
51+ print (f"Error running command: { log_str } " )
52+ if result .stderr :
53+ print (f"Stderr: { result .stderr .strip ()} " )
54+ return None
55+ return result .stdout
56+
57+
58+ def check_preflight ():
59+ """Verify gh CLI and git repository early."""
60+ # Check git
61+ git_check = run_command (["git" , "rev-parse" , "--is-inside-work-tree" ], silent = True )
62+ if not git_check :
63+ print ("Error: This script must be run inside a git repository." )
64+ return False
65+
66+ # Check gh
67+ gh_check = run_command (["gh" , "--version" ], silent = True )
68+ if not gh_check :
69+ print ("Error: GitHub CLI (gh) not found or not in PATH." )
70+ return False
71+
72+ gh_auth = run_command (["gh" , "auth" , "status" ], silent = True )
73+ if not gh_auth :
74+ print ("Error: GitHub CLI not authenticated. Please run 'gh auth login'." )
75+ return False
76+
77+ return True
78+
79+
4480# Constants
45- MAX_CHERRY_PICK_SCAN = 50
4681PR_CACHE = {} # Cache for PR details to speed up multiple rounds referencing same PRs
4782NAME_TO_LOGIN = {} # Map full names to GitHub logins for consolidation
4883
@@ -93,15 +128,6 @@ def is_invalid(name):
93128 return False
94129
95130
96- def run_command (command , cwd = "." , silent = False ):
97- result = subprocess .run (command , check = False , shell = True , capture_output = True , text = True , cwd = cwd , encoding = "utf-8" )
98- if result .returncode != 0 :
99- if not silent :
100- print (f"Error running command: { command } \n { result .stderr } " )
101- return None
102- return result .stdout
103-
104-
105131def get_pr_number (subject ):
106132 match = re .search (r"\(#(\d+)\)$" , subject .strip ())
107133 if match :
@@ -114,7 +140,7 @@ def get_pr_details(pr_number):
114140 return PR_CACHE [pr_number ]
115141
116142 # Try as a PR first - fetch author and commits to get all contributors
117- output = run_command (f "gh pr view { pr_number } --json number,title,author,body,commits" , silent = True )
143+ output = run_command ([ "gh" , "pr" , " view" , pr_number , " --json" , " number,title,author,body,commits"] , silent = True )
118144 if output :
119145 details = json .loads (output )
120146 PR_CACHE [pr_number ] = details
@@ -161,7 +187,7 @@ def extract_authors_from_pr(details):
161187def extract_authors_from_commit (commit_id ):
162188 authors = set ()
163189 # Format: AuthorName \n Body
164- info = run_command (f' git show -s --format=" %an%n%B" { commit_id } ' , silent = True )
190+ info = run_command ([ " git" , " show" , "-s" , " --format=%an%n%B", commit_id ] , silent = True )
165191 if not info :
166192 return authors
167193
@@ -203,7 +229,7 @@ def extract_pr_numbers(text, strict=False):
203229 return [int (x ) for x in set (prs )]
204230
205231
206- def get_prs_from_log (log_output , prs_base = None , log_file = None ):
232+ def get_prs_from_log (log_output , prs_base = None , log_file = None , scan_depth = 100 ):
207233 if not log_output :
208234 return {}
209235
@@ -235,40 +261,42 @@ def get_prs_from_log(log_output, prs_base=None, log_file=None):
235261 details = get_pr_details (pr_num_str )
236262
237263 if details :
238- # Check if it's a cherry-pick round PR - scan deep to identify meta-PRs
264+ # Check if it's a cherry-pick round PR
239265 is_meta_pr = (
240266 "cherry pick" in subject .lower () or "cherry-pick" in subject .lower () or "cherrypick" in subject .lower ()
241267 )
242268
243- if is_meta_pr and commit_count < MAX_CHERRY_PICK_SCAN :
269+ if is_meta_pr and commit_count < scan_depth :
244270 log_event (f" - Meta-PR detected, expanding: { details ['title' ]} " , log_file )
245271 # Collect Original PRs from Title, Body, and Commits
246272 all_extracted_nums = []
247- all_extracted_nums .extend (extract_pr_numbers (details ["title" ]))
273+ # Use strict extraction even for titles to avoid matching issues like #26985
274+ all_extracted_nums .extend (extract_pr_numbers (details ["title" ], strict = True ))
248275 all_extracted_nums .extend (extract_pr_numbers (details ["body" ], strict = True ))
249276
250- commits_output = run_command (f"gh pr view { pr_num_str } --json commits" , silent = True )
251- if commits_output :
252- commits_data = json .loads (commits_output )
253- for commit in commits_data .get ("commits" , []):
254- all_extracted_nums .extend (extract_pr_numbers (commit .get ("messageHeadline" , "" ), strict = True ))
255- all_extracted_nums .extend (extract_pr_numbers (commit .get ("messageBody" , "" ), strict = True ))
277+ # Reuse commits already fetched in get_pr_details to avoid an extra gh CLI call
278+ for commit in details .get ("commits" , []):
279+ all_extracted_nums .extend (extract_pr_numbers (commit .get ("messageHeadline" , "" ), strict = True ))
280+ all_extracted_nums .extend (extract_pr_numbers (commit .get ("messageBody" , "" ), strict = True ))
256281
257282 # Filter and Normalize
258283 current_pr_int = int (pr_num_str )
259- valid_pr_nums = []
284+ valid_pr_ints = []
260285 for op_num in set (all_extracted_nums ):
261286 if op_num == current_pr_int :
262287 continue
288+ # Only accept reasonably recent PRs to avoid noise
263289 if abs (op_num - current_pr_int ) < 5000 :
264- valid_pr_nums .append (str ( op_num ) )
290+ valid_pr_ints .append (op_num )
265291
266- original_pr_nums = sorted (valid_pr_nums )
267- log_event (f" - Extracted sub-PR candidates: { original_pr_nums } " , log_file )
292+ # Sorting results numerically (100 > 99)
293+ original_pr_ints = sorted (valid_pr_ints )
294+ log_event (f" - Extracted sub-PR candidates: { original_pr_ints } " , log_file )
268295
269- if original_pr_nums :
270- log_event (f" -> Found { len (original_pr_nums )} sub-PRs for expansion." , log_file )
271- for op_num_str in original_pr_nums :
296+ if original_pr_ints :
297+ log_event (f" -> Found { len (original_pr_ints )} sub-PRs for expansion." , log_file )
298+ for op_num in original_pr_ints :
299+ op_num_str = str (op_num )
272300 if prs_base and op_num_str in prs_base :
273301 log_event (f" - Sub-PR #{ op_num_str } already in base branch, skipping." , log_file )
274302 continue
@@ -283,18 +311,12 @@ def get_prs_from_log(log_output, prs_base=None, log_file=None):
283311 "cherry_pick_pr" : pr_num_str ,
284312 }
285313 else :
286- # FALLBACK: Use Meta-PR authors if sub-PR fetch fails
314+ # If we can't resolve this number as a PR, do not fabricate an entry.
315+ # It may be an issue reference or an inaccessible/deleted PR.
287316 log_event (
288- f" - Warning: Fetch failed for PR #{ op_num_str } , using meta-PR authors fallback ." ,
317+ f" - Warning: Unable to resolve PR #{ op_num_str } via GitHub CLI; skipping ." ,
289318 log_file ,
290319 )
291- meta_authors = extract_authors_from_pr (details )
292- all_prs [op_num_str ] = {
293- "title" : f"Original PR #{ op_num_str } (details missing)" ,
294- "authors" : list (meta_authors ),
295- "cherry_pick_commit" : commit_id ,
296- "cherry_pick_pr" : pr_num_str ,
297- }
298320 else :
299321 log_event (" - No sub-PRs found, treating meta-PR as a normal PR." , log_file )
300322 all_prs [pr_num_str ] = {
@@ -337,11 +359,17 @@ def main():
337359 parser .add_argument ("--base" , default = "origin/rel-1.23.2" , help = "Base branch/commit to compare from" )
338360 parser .add_argument ("--target" , default = "origin/rel-1.24.1" , help = "Target branch/commit to compare to" )
339361 parser .add_argument ("--dir" , default = "contributors" , help = "Output directory for reports and logs" )
362+ parser .add_argument ("--scan-depth" , type = int , default = 100 , help = "Depth to scan base/meta-PRs for deduplication" )
340363 args = parser .parse_args ()
341364
365+ # Early validation
366+ if not check_preflight ():
367+ return
368+
342369 branch_base = args .base
343370 branch_target = args .target
344371 output_dir = args .dir
372+ scan_depth = args .scan_depth
345373
346374 if not os .path .exists (output_dir ):
347375 os .makedirs (output_dir )
@@ -350,23 +378,25 @@ def main():
350378 with open (logs_path , "w" , encoding = "utf-8" ) as log_file :
351379 log_event (f"Starting comparison: { branch_base } -> { branch_target } " , log_file )
352380
353- # 1. Fetch base branch PRs (scan depth controlled by MAX_CHERRY_PICK_SCAN )
354- log_event (f"Fetching base branch history for { branch_base } (last { MAX_CHERRY_PICK_SCAN } )..." , log_file )
355- log_base = run_command (f "git log { branch_base } -n { MAX_CHERRY_PICK_SCAN } --oneline" )
356- prs_base_dict = get_prs_from_log (log_base , prs_base = None , log_file = log_file )
381+ # 1. Fetch base branch PRs (scan depth controlled by scan_depth )
382+ log_event (f"Fetching base branch history for { branch_base } (last { scan_depth } )..." , log_file )
383+ log_base = run_command ([ "git" , " log" , branch_base , "-n" , str ( scan_depth ), " --oneline"] )
384+ prs_base_dict = get_prs_from_log (log_base , prs_base = None , log_file = log_file , scan_depth = scan_depth )
357385 prs_base = set (prs_base_dict .keys ())
358386
359387 # 2. Fetch target branch PRs (only those not in base)
360388 log_event (f"Fetching target branch history: { branch_base } ..{ branch_target } ..." , log_file )
361- log_target = run_command (f"git log { branch_base } ..{ branch_target } --oneline" )
362- prs_target = get_prs_from_log (log_target , prs_base = prs_base , log_file = log_file )
389+ # Using A..B syntax for git log
390+ log_target = run_command (["git" , "log" , f"{ branch_base } ..{ branch_target } " , "--oneline" ])
391+ prs_target = get_prs_from_log (log_target , prs_base = prs_base , log_file = log_file , scan_depth = scan_depth )
363392
364393 # All PRs in target but not in base (deduplicated by key)
365394 new_pr_keys = set (prs_target .keys ())
366395
367396 contributors = {} # username -> count
368397 details = []
369398
399+ # Use str directly as key for sorting
370400 for key in sorted (new_pr_keys , key = str ):
371401 info = prs_target [key ]
372402 authors = info .get ("authors" , [])
0 commit comments