2424from pathlib import Path
2525from packageurl import PackageURL
2626
27- from minecode_pipelines .miners .swift import fetch_git_tags_raw
28- from minecode_pipelines .miners .swift import get_tags_and_commits_from_git_output
29- from minecode_pipelines .miners .swift import split_org_repo
3027from minecode_pipelines .utils import cycle_from_index , grouper
28+ import shutil
29+ import subprocess
30+ from urllib .parse import urlparse
3131
3232PACKAGE_BATCH_SIZE = 100
3333
@@ -39,10 +39,10 @@ def mine_swift_packageurls(packages_urls, start_index, logger):
3939 for batch_index , package_batch in enumerate (
4040 grouper (n = PACKAGE_BATCH_SIZE , iterable = packages_iter )
4141 ):
42- for item in package_batch :
43- if not item :
42+ for package_repo_url in package_batch :
43+ if not package_repo_url :
4444 continue
45- package_repo_url = item
45+ logger ( f"Processing package repo URL: { package_repo_url } " )
4646 git_ls_remote = fetch_git_tags_raw (package_repo_url , 60 , logger )
4747 if not git_ls_remote :
4848 continue
@@ -83,3 +83,81 @@ def generate_package_urls(package_repo_url, tags_and_commits):
8383 updated_purls .append (purl )
8484
8585 return base_purl , updated_purls
86+
87+
def is_safe_repo_url(repo_url: str) -> bool:
    """
    Return True if ``repo_url`` is an HTTPS URL whose network location is
    exactly "github.com" and whose path ends with ".git". Return False
    otherwise.

    For example:
    >>> is_safe_repo_url('https://github.com/foo/bar.git')
    True
    >>> is_safe_repo_url('https://github.com/foo/bar')
    False
    >>> is_safe_repo_url('git://github.com/foo/bar.git')
    False
    """
    url = urlparse(repo_url)
    # Guard clauses: reject as soon as one requirement is not met.
    if url.scheme != "https":
        return False
    if url.netloc != "github.com":
        return False
    return url.path.endswith(".git")
94+
95+
def fetch_git_tags_raw(repo_url: str, timeout: int = 60, logger=None) -> str | None:
    """
    Run `git ls-remote` on the ``repo_url`` GitHub repo and return its raw
    standard output stripped of surrounding whitespace, or None on error.

    ``timeout`` is the maximum number of seconds to wait for the git command.
    ``logger`` is an optional callable that accepts a message string. When it
    is None (the default), messages are discarded.

    Return None when the git executable is not found in the PATH, when git
    exits with a non-zero status, or when the command times out.
    Raise a ValueError when git is available and ``repo_url`` is rejected by
    is_safe_repo_url().
    """

    def log(message):
        # ``logger`` defaults to None: guard each call so that reporting a
        # failure cannot itself fail with a "NoneType is not callable" error.
        if logger is not None:
            logger(message)

    git_executable = shutil.which("git")
    if git_executable is None:
        log("Git executable not found in PATH")
        return None

    # Only vetted URLs are ever handed to the git command line.
    if not is_safe_repo_url(repo_url):
        raise ValueError(f"Unsafe repo URL: {repo_url}")

    try:
        result = subprocess.run(  # NOQA
            [git_executable, "ls-remote", repo_url],
            capture_output=True,
            text=True,
            check=True,
            timeout=timeout,
        )
        return result.stdout.strip()
    except subprocess.CalledProcessError as e:
        log(f"Failed to fetch tags for {repo_url}: {e}")
    except subprocess.TimeoutExpired:
        log(f"Timeout fetching tags for {repo_url}")
    return None
120+
121+
122+ # FIXME duplicated with miners github
def split_org_repo(url_like):
    """
    Return an (org, name) tuple from a ``url_like`` string that is either a
    URL-like string to a GitHub repo or a repo name as in org/name.
    Raise a ValueError if there are fewer than two non-empty path segments.

    For example:
    >>> split_org_repo('foo/bar')
    ('foo', 'bar')
    >>> split_org_repo('https://api.github.com/repos/foo/bar/')
    ('foo', 'bar')
    >>> split_org_repo('github.com/foo/bar/')
    ('foo', 'bar')
    >>> split_org_repo('git://github.com/foo/bar.git')
    ('foo', 'bar')
    """
    # Keep only the non-blank, whitespace-trimmed pieces between slashes.
    parts = []
    for piece in url_like.split("/"):
        piece = piece.strip()
        if piece:
            parts.append(piece)

    if len(parts) < 2:
        raise ValueError(f"Not a GitHub-like URL: {url_like}")

    # The org and repo name are always the last two segments.
    org, name = parts[-2:]
    git_suffix = ".git"
    if name.endswith(git_suffix):
        name = name[: -len(git_suffix)]
    return org, name
146+
147+
def get_tags_and_commits_from_git_output(git_ls_remote):
    """
    Return a list of (tag, commit) tuples, given a ``git_ls_remote`` string
    of git ls-remote output.

    Each line is expected to be a commit and a ref name separated by a tab.
    Only the "HEAD" ref and refs under "refs/tags/" are kept. The leading
    "refs/tags/" is removed from tag refs, and "HEAD" is returned as-is.
    Lines without a tab and any other refs are ignored.

    For example:
    >>> get_tags_and_commits_from_git_output('abc\\tHEAD\\ndef\\trefs/tags/1.2.3')
    [('HEAD', 'abc'), ('1.2.3', 'def')]
    """
    tag_prefix = "refs/tags/"
    tags_and_commits = []
    for line in git_ls_remote.split("\n"):
        # line: kjwfgeklngelkfjofjeo123<TAB>refs/tags/1.2.3
        line_segments = line.split("\t")
        # segments: ["kjwfgeklngelkfjofjeo123", "refs/tags/1.2.3"]
        if len(line_segments) < 2:
            continue
        commit, ref = line_segments[0], line_segments[1]
        if ref == "HEAD":
            tags_and_commits.append((ref, commit))
        elif ref.startswith(tag_prefix):
            # Strip only the leading prefix: a tag name may itself contain
            # the "refs/tags/" substring and must not be altered.
            # NOTE(review): git documents peeled "refs/tags/<tag>^{}" entries
            # for annotated tags; they are kept unchanged here. Confirm that
            # callers expect them.
            tags_and_commits.append((ref[len(tag_prefix):], commit))
    return tags_and_commits
0 commit comments