Skip to content

Commit 7508706

Browse files
authored
Merge pull request #103 from sw360/martin/fix-github-tag-matching
GitHub tag matching
2 parents 9924eaf + d05917c commit 7508706

File tree

6 files changed

+1074
-165
lines changed

6 files changed

+1074
-165
lines changed

capycli/bom/findsources.py

Lines changed: 230 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,9 @@
1111
import re
1212
import sys
1313
import time
14-
from typing import Any, Dict, List, Tuple
14+
from collections.abc import Iterable
15+
from typing import Any, Dict, List, Set, Tuple
16+
from urllib.parse import parse_qs, urlparse
1517

1618
import requests
1719
import semver
@@ -36,13 +38,59 @@
3638
class FindSources(capycli.common.script_base.ScriptBase):
3739
"""Go through the list of SBOM items and try to determine the source code."""
3840

41+
class TagCache:
42+
"""A key task performed in this module is fetching tags from GitHub
43+
and matching tags to (component) versions. This task includes many
44+
calls to the GitHub API, which we seek to limit by implementing
45+
an internal cache and a logic to guess tags, instead of
46+
performing exhaustive searches.
47+
"""
48+
def __init__(self) -> None:
49+
self.data: Dict[Tuple[str, str], Set[str]] = {}
50+
51+
def __getitem__(self, key: Any) -> Set[str]:
52+
"""Get the set of all cached tags for a key."""
53+
return self.data[self._validate_key(key)]
54+
55+
def _validate_key(self, key: Tuple[str, str]) -> Tuple[str, str]:
56+
"""Ensure the key is a hashable (project, version) pair of strings."""
57+
if len(key) != 2 or key != (str(key[0]), str(key[1])):
58+
raise KeyError(f'{self.__class__.__name__} key must consist of'
59+
'a project name and a version string')
60+
return key
61+
62+
def add(self, project: str, version: str, tag: str) -> None:
63+
"""Cache a tag for a specific project and version."""
64+
key = self._validate_key((project, version))
65+
tags = self.data.setdefault(key, set())
66+
tags.add(tag)
67+
68+
def filter(self, project: str, version: str, data: Any) -> List[str]:
69+
"""Return the non-empty items of @data that are not cached yet."""
70+
if isinstance(data, str):
71+
data = [data]
72+
elif not isinstance(data, Iterable):
73+
raise ValueError('Expecting an iterable of tags!')
74+
key = self._validate_key((project, version))
75+
return [item for item in data
76+
if item not in self.data.get(key, [])
77+
and len(item) > 0]
78+
79+
def filter_and_cache(self, project: str, version: str, data: Any) -> List[str]:
80+
"""Convenience method to do filtering and adding in one run."""
81+
candidates = set(self.filter(project, version, data))
82+
for tag in candidates:
83+
self.add(project, version, tag)
84+
return list(candidates)
85+
3986
def __init__(self) -> None:
4087
self.verbose: bool = False
41-
self.version_regex = re.compile(r"[\d+\.|_]+[\d+]")
88+
self.version_regex = re.compile(r"(\d+[._])+\d+")
4289
self.github_project_name_regex = re.compile(r"^[a-zA-Z0-9-]+(/[a-zA-Z0-9-]+)*$")
4390
self.github_name: str = ""
4491
self.github_token: str = ""
4592
self.sw360_url: str = os.environ.get("SW360ServerUrl", "")
93+
self.tag_cache = self.TagCache()
4694

4795
def is_sourcefile_accessible(self, sourcefile_url: str) -> bool:
4896
"""Check if the URL is accessible."""
@@ -70,34 +118,61 @@ def is_sourcefile_accessible(self, sourcefile_url: str) -> bool:
70118
return False
71119

72120
@staticmethod
73-
def github_request(url: str, username: str = "", token: str = "") -> Any:
121+
def github_request(url: str, username: str = "", token: str = "",
122+
return_response: bool = False,
123+
allow_redirects: bool = True, # default in requests
124+
) -> Any:
74125
try:
75126
headers = {}
76127
if token:
77128
headers["Authorization"] = "token " + token
78129
if username:
79130
headers["Username"] = username
80-
response = requests.get(url, headers=headers)
81-
if not response.ok:
82-
if response.status_code == 429 or \
83-
'rate limit exceeded' in response.reason or \
84-
"API rate limit exceeded" in response.json().get("message"):
85-
print(
86-
Fore.LIGHTYELLOW_EX +
87-
" Github API rate limit exceeded - wait 60s and retry ... " +
88-
Style.RESET_ALL)
89-
time.sleep(60)
90-
return FindSources.github_request(url, username, token)
91-
92-
return response.json()
131+
response = requests.get(url, headers=headers,
132+
allow_redirects=allow_redirects)
133+
if response.status_code == 429 \
134+
or 'rate limit exceeded' in response.reason \
135+
or 'API rate limit exceeded' in response.json().get('message', ''):
136+
print(
137+
Fore.LIGHTYELLOW_EX +
138+
" Github API rate limit exceeded - wait 60s and retry ... " +
139+
Style.RESET_ALL)
140+
time.sleep(60)
141+
return FindSources.github_request(url, username, token, return_response=return_response)
142+
if response.json().get('message', '').startswith("Bad credentials"):
143+
print_red("Invalid GitHub credential provided - aborting!")
144+
sys.exit(ResultCode.RESULT_ERROR_ACCESSING_SERVICE)
145+
146+
except AttributeError as err:
147+
# response.json() did not return a dictionary
148+
if hasattr(err, 'name'):
149+
name = err.name
150+
else: # Python prior to 3.10
151+
name = err.args[0].split("'")[3]
152+
if not name == 'get':
153+
raise
154+
155+
except requests.exceptions.JSONDecodeError:
156+
response._content = b'{}'
157+
158+
except requests.exceptions.ConnectionError as ex:
159+
print(
160+
Fore.LIGHTYELLOW_EX +
161+
f" Connection issues accessing {url} " + repr(ex) +
162+
"\n Retrying in 60 seconds!" +
163+
Style.RESET_ALL)
164+
time.sleep(60)
165+
return FindSources.github_request(url, username, token, return_response=return_response)
93166

94167
except Exception as ex:
95168
print(
96169
Fore.LIGHTYELLOW_EX +
97170
" Error accessing GitHub: " + repr(ex) +
98171
Style.RESET_ALL)
99-
100-
return {}
172+
response = requests.Response()
173+
response._content = \
174+
b'{' + f'"exception": "{repr(ex)}"'.encode() + b'}'
175+
return response if return_response else response.json()
101176

102177
@staticmethod
103178
def get_repositories(name: str, language: str, username: str = "", token: str = "") -> Any:
@@ -135,27 +210,133 @@ def get_repo_name(github_url: str) -> str:
135210
@staticmethod
136211
def get_github_info(repository_url: str, username: str = "",
137212
token: str = "") -> get_github_info_type:
213+
"""This method used to iterate through all resource pages of
214+
GitHub's /tags API, collect the results, then return a huge
215+
list with all results.
216+
Removed because this approach does not scale well and we did
217+
encounter projects with tens of thousands of tags.
138218
"""
139-
Query tag infos from GitHub.
140-
141-
In the good case a list of tags entries (= dictionaries) is returned.
142-
In the bad case a JSON error message is returned.
219+
raise NotImplementedError(
220+
"Removed with introduction of get_matchting_source_tag!")
221+
222+
def _get_github_repo(self, github_ref: str) -> Dict[str, Any]:
223+
"""Fetch GitHub API object identified by @github_ref.
224+
@github_ref can be a simple "<owner>/<repo>" string or any
225+
from the plethora of links that refer to a
226+
project on GitHub.
227+
By using urlparse() we save ourselves a little bit of work
228+
with trailing queries and fragments, but any @github_ref with
229+
colons, where the first colon is not part of '://' will not
230+
yield viable results,
231+
e.g. 'api.github.com:443/repos/sw360/capycli'.
232+
"""
233+
url = 'api.github.com/repos/'
234+
gh_ref = urlparse(github_ref, scheme='no_scheme')
235+
if gh_ref.scheme == 'no_scheme': # interpret @github_ref as OWNER/REPO
236+
url += gh_ref.path
237+
elif not gh_ref.netloc.endswith('github.com'):
238+
raise ValueError(f'{github_ref} is not an expected @github_ref!')
239+
elif gh_ref.path.startswith('/repos'):
240+
url += gh_ref.path[6:]
241+
else:
242+
url += gh_ref.path
243+
if url.endswith('.git'):
244+
url = url[0:-4]
245+
url = 'https://' + url.replace('//', '/')
246+
repo = {}
247+
while 'tags_url' not in repo and 'github.com' in url:
248+
repo = self.github_request(url, self.github_name, self.github_token)
249+
url = url.rsplit('/', 1)[0] # remove last path segment
250+
if 'tags_url' not in repo:
251+
raise ValueError(f"Unable to make @github_ref {github_ref} work!")
252+
return repo
253+
254+
def _get_link_page(self, res: requests.Response, which: str = 'next') -> int:
255+
"""Fetch only page number from link-header."""
256+
try:
257+
url = urlparse(res.links[which]['url'])
258+
return int(parse_qs(url.query)['page'][0])
259+
except KeyError: # GitHub gave us only one results page
260+
return 1
261+
262+
def get_matching_source_url(self, version: Any, github_ref: str,
263+
version_prefix: Any = None
264+
) -> str:
265+
"""Find a URL to download source code from GitHub. We are
266+
looking for the source code in @github_ref at @version.
267+
268+
We expect to match @version to an existing tag in the repo
269+
identified by @github_ref. We want to have the source
270+
code download URL of that existing tag!
271+
272+
In order to perform this matching, we must retrieve the tags
273+
from GitHub and then analyse them. First, we use
274+
get_matching_tag(). If that doesn't yield a positive result,
275+
we try to infer a tag for @version, to prevent an exhaustive
276+
search over all tags.
143277
"""
144-
length_per_page = 100
145-
page = 1
146-
tags: List[Dict[str, Any]] = []
147-
tag_url = "https://api.github.com/repos/" + repository_url + "/tags"
148-
query = "?per_page=%s&page=%s" % (length_per_page, page)
149-
tmp = FindSources.github_request(tag_url + query, username, token)
150-
if not isinstance(tmp, list):
151-
return tags
152-
tags.extend(tmp)
153-
while len(tmp) == length_per_page:
154-
page += 1
155-
query = "?per_page=%s&page=%s" % (length_per_page, page)
156-
tmp = FindSources.github_request(tag_url + query, username, token)
157-
tags.extend(tmp)
158-
return tags
278+
try:
279+
repo = self._get_github_repo(github_ref)
280+
except ValueError as err:
281+
print_yellow(" " + str(err))
282+
return ""
283+
284+
tags_url = repo['tags_url'] + '?per_page=100'
285+
git_refs_url_tpl = repo['git_refs_url'].replace('{/sha}', '{sha}', 1)
286+
287+
res = self.github_request(tags_url, self.github_name,
288+
self.github_token, return_response=True)
289+
pages = self._get_link_page(res, 'last')
290+
for _ in range(pages): # we prefer this over "while True"
291+
# note: in res.json() we already have the first results page
292+
try:
293+
tags = [tag for tag in res.json()
294+
if version_prefix is None
295+
or tag['name'].startswith(version_prefix)]
296+
source_url = self.get_matching_tag(tags, version, tags_url)
297+
if len(source_url) > 0: # we found what we believe is
298+
return source_url # the correct source_url
299+
300+
except (TypeError, KeyError, AttributeError):
301+
# res.json() did not give us an iterable of things where
302+
# 'name' is a viable index, for instance an error message
303+
tags = []
304+
305+
new_prefixes = self.tag_cache.filter_and_cache(
306+
repo['full_name'], version, # cache key
307+
[self.version_regex.split(tag['name'], 1)[0]
308+
for tag in tags
309+
if self.version_regex.search(tag['name']) is not None])
310+
311+
for prefix in new_prefixes:
312+
url = git_refs_url_tpl.format(sha=f'/tags/{prefix}')
313+
w_prefix = self.github_request(url, self.github_name,
314+
self.github_token)
315+
if isinstance(w_prefix, dict): # exact match
316+
w_prefix = [w_prefix]
317+
318+
# ORDER BY tag-name-length DESC
319+
by_size = sorted([(len(tag['ref']), tag) for tag in w_prefix],
320+
key=lambda x: x[0])
321+
w_prefix = [itm[1] for itm in reversed(by_size)]
322+
323+
transformed_for_get_matching_tags = [
324+
{'name': tag['ref'].replace('refs/tags/', '', 1),
325+
'zipball_url': tag['url'].replace(
326+
'/git/refs/tags/', '/zipball/refs/tags/', 1),
327+
} for tag in w_prefix]
328+
source_url = self.get_matching_tag(
329+
transformed_for_get_matching_tags, version, tags_url)
330+
if len(source_url) > 0: # we found what we believe is
331+
return source_url # the correct source_url
332+
try:
333+
url = res.links['next']['url']
334+
res = self.github_request(url, self.github_name,
335+
self.github_token, return_response=True)
336+
except KeyError: # no more result pages
337+
break
338+
print_yellow(" No matching tag for version " + version + " found")
339+
return ""
159340

160341
def to_semver_string(self, version: str) -> str:
161342
"""Bring all version information to a format we can compare."""
@@ -193,8 +374,7 @@ def find_github_url(self, component: Component, use_language: bool = True) -> st
193374
name_match = [r for r in repositories.get("items") if component_name in r.get("name", "")]
194375
if len(name_match):
195376
for match in name_match:
196-
tag_info = self.github_request(match["tags_url"], self.github_name, self.github_token)
197-
source_url = self.get_matching_tag(tag_info, component.version or "", match["html_url"])
377+
source_url = self.get_matching_source_url(component.version, match["tags_url"])
198378
if len(name_match) == 1:
199379
return source_url
200380
elif source_url:
@@ -261,10 +441,7 @@ def find_golang_url(self, component: Component) -> str:
261441

262442
if repository_name.startswith("https://github.com/"):
263443
repository_name = repository_name[len("https://github.com/"):]
264-
tag_info = self.get_github_info(repository_name, self.github_name, self.github_token)
265-
tag_info_checked = self.check_for_github_error(tag_info)
266-
source_url = self.get_matching_tag(tag_info_checked, component_version,
267-
repository_name, version_prefix or "")
444+
source_url = self.get_matching_source_url(component_version, repository_name, version_prefix)
268445

269446
# component["RepositoryUrl"] = repository_name
270447
return source_url
@@ -284,26 +461,15 @@ def get_github_source_url(self, github_url: str, version: str) -> str:
284461

285462
if self.verbose:
286463
print_text(" repo_name:", repo_name)
287-
288-
tag_info = self.get_github_info(repo_name, self.github_name, self.github_token)
289-
tag_info_checked = self.check_for_github_error(tag_info)
290-
return self.get_matching_tag(tag_info_checked, version, github_url)
464+
return self.get_matching_source_url(version, repo_name)
291465

292466
def check_for_github_error(self, tag_info: get_github_info_type) -> List[Dict[str, Any]]:
293-
if isinstance(tag_info, list):
294-
# assume valid answer
295-
return tag_info
296-
297-
# check for 'rate limit exceeded' message
298-
if "message" in tag_info:
299-
if tag_info["message"].startswith("API rate limit exceeded"):
300-
print_red("GitHub API rate limit exceeded - aborting!")
301-
sys.exit(ResultCode.RESULT_ERROR_ACCESSING_SERVICE)
302-
if tag_info["message"].startswith("Bad credentials"):
303-
print_red("Invalid GitHub credential provided - aborting!")
304-
sys.exit(ResultCode.RESULT_ERROR_ACCESSING_SERVICE)
305-
306-
return []
467+
"""This method was introduced to check the output of
468+
get_github_info() for errors.
469+
Removed, because get_github_info was removed.
470+
"""
471+
raise NotImplementedError(
472+
"Removed with introduction of get_matchting_source_tag!")
307473

308474
def get_matching_tag(self, tag_info: List[Dict[str, Any]], version: str, github_url: str,
309475
version_prefix: str = "") -> str:
@@ -369,7 +535,7 @@ def get_source_url_from_release(self, release_id: str) -> str:
369535
if release_details:
370536
source_url = release_details.get("sourceCodeDownloadurl", "")
371537
if self.verbose:
372-
print("getting source url from get from sw360 for release_id " + release_id)
538+
print(" getting source url from get from sw360 for release_id " + release_id)
373539
if source_url != "":
374540
return source_url
375541
break
@@ -468,7 +634,8 @@ def find_source_url_recursive_by_sw360(self, component: Component) -> str:
468634

469635
@staticmethod
470636
def find_source_url_by_language(component: Component) -> str:
471-
capycli.dependencies.javascript.GetJavascriptDependencies().try_find_component_metadata(component, "")
637+
if hasattr(capycli, 'dependencies'):
638+
capycli.dependencies.javascript.GetJavascriptDependencies().try_find_component_metadata(component, "")
472639
url = CycloneDxSupport.get_ext_ref_source_url(component)
473640
if isinstance(url, XsUri):
474641
return url._uri

0 commit comments

Comments
 (0)