11
11
import re
12
12
import sys
13
13
import time
14
- from typing import Any , Dict , List , Tuple
14
+ from collections .abc import Iterable
15
+ from typing import Any , Dict , List , Set , Tuple
16
+ from urllib .parse import parse_qs , urlparse
15
17
16
18
import requests
17
19
import semver
36
38
class FindSources (capycli .common .script_base .ScriptBase ):
37
39
"""Go through the list of SBOM items and try to determine the source code."""
38
40
41
class TagCache:
    """Per-(project, version) memory of tags that were already examined.

    Matching GitHub tags to component versions involves many calls to
    the GitHub API.  To limit those calls, tags (or tag prefixes) that
    were already tried for a project/version pair are remembered here,
    so callers can guess tags instead of searching exhaustively.
    """

    def __init__(self) -> None:
        # (project, version) -> tags already cached for that pair
        self.data: Dict[Tuple[str, str], Set[str]] = {}

    def __getitem__(self, key: Any) -> Set[str]:
        """Return the set of all cached tags for ``key``.

        Raises:
            KeyError: if ``key`` is malformed or nothing is cached for it.
        """
        return self.data[self._validate_key(key)]

    def _validate_key(self, key: Tuple[str, str]) -> Tuple[str, str]:
        """Return ``key`` unchanged if it is a (project, version) string pair.

        Raises:
            KeyError: if ``key`` is not a 2-tuple of strings.
        """
        try:
            valid = len(key) == 2 and key == (str(key[0]), str(key[1]))
        except (TypeError, KeyError, IndexError):
            # not sized / not indexable by position: certainly not a pair
            valid = False
        if not valid:
            raise KeyError(f'{self.__class__.__name__} key must consist of '
                           'a project name and a version string')
        return key

    def add(self, project: str, version: str, tag: str) -> None:
        """Cache ``tag`` for a specific project and version."""
        key = self._validate_key((project, version))
        self.data.setdefault(key, set()).add(tag)

    def filter(self, project: str, version: str, data: Any) -> List[str]:
        """Return the items of ``data`` that are non-empty and not yet cached.

        ``data`` may be a single string or an iterable of strings; the
        order of ``data`` is preserved.

        Raises:
            ValueError: if ``data`` is neither a string nor an iterable.
            KeyError: if project/version do not form a valid key.
        """
        if isinstance(data, str):
            data = [data]
        elif not isinstance(data, Iterable):
            raise ValueError('Expecting an iterable of tags!')
        key = self._validate_key((project, version))
        cached = self.data.get(key, ())  # looked up once, not per item
        return [item for item in data
                if item not in cached and len(item) > 0]

    def filter_and_cache(self, project: str, version: str, data: Any) -> List[str]:
        """Filter ``data`` and cache the remaining tags in one run.

        Returns the newly cached tags without duplicates, in the order
        in which they first appeared in ``data``.
        """
        # dict.fromkeys() de-duplicates while keeping first-seen order,
        # so the result does not depend on set/hash ordering
        candidates = list(dict.fromkeys(self.filter(project, version, data)))
        for tag in candidates:
            self.add(project, version, tag)
        return candidates
+
39
86
def __init__ (self ) -> None :
40
87
self .verbose : bool = False
41
- self .version_regex = re .compile (r"[ \d+\.|_]+[ \d+] " )
88
+ self .version_regex = re .compile (r"( \d+[._])+ \d+" )
42
89
self .github_project_name_regex = re .compile (r"^[a-zA-Z0-9-]+(/[a-zA-Z0-9-]+)*$" )
43
90
self .github_name : str = ""
44
91
self .github_token : str = ""
45
92
self .sw360_url : str = os .environ .get ("SW360ServerUrl" , "" )
93
+ self .tag_cache = self .TagCache ()
46
94
47
95
def is_sourcefile_accessible (self , sourcefile_url : str ) -> bool :
48
96
"""Check if the URL is accessible."""
@@ -70,34 +118,61 @@ def is_sourcefile_accessible(self, sourcefile_url: str) -> bool:
70
118
return False
71
119
72
120
@staticmethod
def github_request(url: str, username: str = "", token: str = "",
                   return_response: bool = False,
                   allow_redirects: bool = True,  # default in requests
                   ) -> Any:
    """Send a GET request to GitHub and return the answer.

    ``token`` and ``username``, when given, are sent as "Authorization"
    and "Username" headers.

    Handling of special situations:
    * rate limit hit (status 429, or a matching reason/message): wait
      60 seconds and retry with the same arguments
    * connection error: wait 60 seconds and retry with the same arguments
    * "Bad credentials" message: exit the program with
      ResultCode.RESULT_ERROR_ACCESSING_SERVICE
    * body that is not JSON: the body is replaced by an empty JSON object
    * any other exception: it is printed and a synthetic response with
      the JSON body {"exception": "<repr of the exception>"} is used

    Returns the ``requests.Response`` if ``return_response`` is true,
    otherwise the decoded JSON body.
    """
    import json  # local import: only needed for the fallback payload

    try:
        headers = {}
        if token:
            headers["Authorization"] = "token " + token
        if username:
            headers["Username"] = username
        response = requests.get(url, headers=headers,
                                allow_redirects=allow_redirects)
        # keep this order: a 429 must trigger the retry even when the
        # body is not JSON, so response.json() is evaluated last;
        # reason may be None, hence the "or ''"
        if response.status_code == 429 \
                or 'rate limit exceeded' in (response.reason or '') \
                or 'API rate limit exceeded' in response.json().get('message', ''):
            print(
                Fore.LIGHTYELLOW_EX +
                " Github API rate limit exceeded - wait 60s and retry ... " +
                Style.RESET_ALL)
            time.sleep(60)
            # pass allow_redirects along, otherwise the retry silently
            # falls back to following redirects
            return FindSources.github_request(
                url, username, token,
                return_response=return_response,
                allow_redirects=allow_redirects)
        if response.json().get('message', '').startswith("Bad credentials"):
            print_red("Invalid GitHub credential provided - aborting!")
            sys.exit(ResultCode.RESULT_ERROR_ACCESSING_SERVICE)

    except AttributeError as err:
        # response.json() did not return a dictionary, so ".get" failed;
        # that is fine, any other AttributeError is re-raised
        if hasattr(err, 'name'):
            name = err.name
        else:  # Python prior to 3.10
            name = err.args[0].split("'")[3]
        if not name == 'get':
            raise

    except requests.exceptions.JSONDecodeError:
        # body is not JSON: present it as an empty JSON object
        response._content = b'{}'

    except requests.exceptions.ConnectionError as ex:
        print(
            Fore.LIGHTYELLOW_EX +
            f" Connection issues accessing {url} " + repr(ex) +
            "\n Retrying in 60 seconds!" +
            Style.RESET_ALL)
        time.sleep(60)
        return FindSources.github_request(
            url, username, token,
            return_response=return_response,
            allow_redirects=allow_redirects)

    except Exception as ex:
        print(
            Fore.LIGHTYELLOW_EX +
            " Error accessing GitHub: " + repr(ex) +
            Style.RESET_ALL)
        response = requests.Response()
        # json.dumps() escapes quotes/backslashes inside repr(ex); a
        # hand-built string would make response.json() below fail
        response._content = json.dumps({"exception": repr(ex)}).encode()
    return response if return_response else response.json()
101
176
102
177
@staticmethod
103
178
def get_repositories (name : str , language : str , username : str = "" , token : str = "" ) -> Any :
@@ -135,27 +210,133 @@ def get_repo_name(github_url: str) -> str:
135
210
@staticmethod
def get_github_info(repository_url: str, username: str = "",
                    token: str = "") -> get_github_info_type:
    """Removed; kept only so that old callers fail with a clear message.

    This method used to iterate through all result pages of GitHub's
    /tags API, collect the results, then return one huge list with all
    of them.  It was removed because this approach does not scale well:
    projects with tens of thousands of tags were encountered.
    Use get_matching_source_url() instead.

    Raises:
        NotImplementedError: always.
    """
    # the message names the method that actually replaced this one
    raise NotImplementedError(
        "Removed with introduction of get_matching_source_url!")
221
+
222
def _get_github_repo(self, github_ref: str) -> Dict[str, Any]:
    """Fetch the GitHub API repository object identified by @github_ref.

    @github_ref can be a simple "<owner>/<repo>" string or any of the
    many kinds of links that refer to a project on GitHub (html link,
    API link, link to a sub-resource such as ".../tags", "....git").

    urlparse() takes care of trailing queries and fragments, but any
    @github_ref with colons, where the first colon is not part of
    '://', will not yield viable results,
    e.g. 'api.github.com:443/repos/sw360/capycli'.

    The path is tried as given; as long as the answer is not a
    repository object (recognised by its 'tags_url' entry) the last
    path segment is dropped and the request repeated.

    Raises:
        ValueError: if @github_ref does not point to github.com or no
            repository object could be retrieved for it.
    """
    gh_ref = urlparse(github_ref, scheme='no_scheme')
    path = gh_ref.path
    if gh_ref.scheme != 'no_scheme':
        # compare the host name itself: a plain suffix test on netloc
        # would also accept foreign hosts such as "notgithub.com"
        host = gh_ref.hostname or ''
        if host != 'github.com' and not host.endswith('.github.com'):
            raise ValueError(f'{github_ref} is not an expected @github_ref!')
        # strip the API prefix only when it is a complete path segment
        if path == '/repos' or path.startswith('/repos/'):
            path = path[len('/repos'):]
    # else: no scheme given, interpret @github_ref as OWNER/REPO
    if path.endswith('.git'):
        path = path[:-4]

    # empty segments (leading, trailing or doubled slashes) are dropped
    segments = [part for part in path.split('/') if part]
    # a repository needs at least <owner>/<repo>; requesting anything
    # shorter can never succeed and only spends API quota
    while len(segments) >= 2:
        url = 'https://api.github.com/repos/' + '/'.join(segments)
        repo = self.github_request(url, self.github_name, self.github_token)
        if isinstance(repo, dict) and 'tags_url' in repo:
            return repo
        segments.pop()  # remove last path segment and try again
    raise ValueError(f"Unable to make @github_ref {github_ref} work!")
253
+
254
def _get_link_page(self, res: requests.Response, which: str = 'next') -> int:
    """Return the page number the @which relation of the link header points to.

    If the response carries no such link, or the link has no 'page'
    query parameter, GitHub gave us only one results page and 1 is
    returned.
    """
    try:
        target = res.links[which]['url']
        query = parse_qs(urlparse(target).query)
        page = query['page'][0]
    except KeyError:  # GitHub gave us only one results page
        return 1
    return int(page)
261
+
262
def get_matching_source_url(self, version: Any, github_ref: str,
                            version_prefix: Any = None
                            ) -> str:
    """Find a URL to download source code from GitHub. We are
    looking for the source code in @github_ref at @version.

    We expect to match @version to an existing tag in the repo
    identified by @github_ref. We want to have the source
    code download URL of that existing tag!

    In order to perform this matching, we must retrieve the tags
    from GitHub and then analyse them. First, we use
    get_matching_tag(). If that doesn't yield a positive result,
    we try to infer a tag for @version, to prevent an exhaustive
    search over all tags: the prefix in front of the version number
    of each tag seen is looked up once via the git refs API and the
    refs found are matched against @version.

    If @version_prefix is given, only tags starting with it are
    considered on the tag result pages.

    Returns the source URL, or "" if nothing matched.
    """
    # a missing version arrives as None; the tag cache only accepts
    # string keys and the final message concatenates strings
    version = "" if version is None else str(version)

    try:
        repo = self._get_github_repo(github_ref)
    except ValueError as err:
        print_yellow(" " + str(err))
        return ""

    tags_url = repo['tags_url'] + '?per_page=100'
    git_refs_url_tpl = repo['git_refs_url'].replace('{/sha}', '{sha}', 1)

    res = self.github_request(tags_url, self.github_name,
                              self.github_token, return_response=True)
    pages = self._get_link_page(res, 'last')
    for _ in range(pages):  # we prefer this over "while True"
        # note: in res.json() we already have the first results page
        try:
            tags = [tag for tag in res.json()
                    if version_prefix is None
                    or tag['name'].startswith(version_prefix)]
            source_url = self.get_matching_tag(tags, version, tags_url)
            if len(source_url) > 0:  # we found what we believe is
                return source_url    # the correct source_url

        except (TypeError, KeyError, AttributeError):
            # res.json() did not give us an iterable of things where
            # 'name' is a viable index, for instance an error message
            tags = []

        # without a version_prefix the entries of tags were never
        # indexed above, so check them before using tag['name']
        new_prefixes = self.tag_cache.filter_and_cache(
            repo['full_name'], version,  # cache key
            [self.version_regex.split(tag['name'], 1)[0]
             for tag in tags
             if isinstance(tag, dict)
             and isinstance(tag.get('name'), str)
             and self.version_regex.search(tag['name']) is not None])

        for prefix in new_prefixes:
            url = git_refs_url_tpl.format(sha=f'/tags/{prefix}')
            w_prefix = self.github_request(url, self.github_name,
                                           self.github_token)
            if isinstance(w_prefix, dict):  # exact match (or error message)
                w_prefix = [w_prefix]
            elif not isinstance(w_prefix, list):
                w_prefix = []
            # error payloads carry neither 'ref' nor 'url'; skip them
            # instead of failing with a KeyError
            refs = [ref for ref in w_prefix
                    if isinstance(ref, dict) and 'ref' in ref and 'url' in ref]

            # ORDER BY tag-name-length DESC
            refs = list(reversed(sorted(refs, key=lambda ref: len(ref['ref']))))

            transformed_for_get_matching_tags = [
                {'name': ref['ref'].replace('refs/tags/', '', 1),
                 'zipball_url': ref['url'].replace(
                     '/git/refs/tags/', '/zipball/refs/tags/', 1),
                 } for ref in refs]
            source_url = self.get_matching_tag(
                transformed_for_get_matching_tags, version, tags_url)
            if len(source_url) > 0:  # we found what we believe is
                return source_url    # the correct source_url
        try:
            url = res.links['next']['url']
            res = self.github_request(url, self.github_name,
                                      self.github_token, return_response=True)
        except KeyError:  # no more result pages
            break
    print_yellow(" No matching tag for version " + version + " found")
    return ""
159
340
160
341
def to_semver_string (self , version : str ) -> str :
161
342
"""Bring all version information to a format we can compare."""
@@ -193,8 +374,7 @@ def find_github_url(self, component: Component, use_language: bool = True) -> st
193
374
name_match = [r for r in repositories .get ("items" ) if component_name in r .get ("name" , "" )]
194
375
if len (name_match ):
195
376
for match in name_match :
196
- tag_info = self .github_request (match ["tags_url" ], self .github_name , self .github_token )
197
- source_url = self .get_matching_tag (tag_info , component .version or "" , match ["html_url" ])
377
+ source_url = self .get_matching_source_url (component .version , match ["tags_url" ])
198
378
if len (name_match ) == 1 :
199
379
return source_url
200
380
elif source_url :
@@ -261,10 +441,7 @@ def find_golang_url(self, component: Component) -> str:
261
441
262
442
if repository_name .startswith ("https://github.com/" ):
263
443
repository_name = repository_name [len ("https://github.com/" ):]
264
- tag_info = self .get_github_info (repository_name , self .github_name , self .github_token )
265
- tag_info_checked = self .check_for_github_error (tag_info )
266
- source_url = self .get_matching_tag (tag_info_checked , component_version ,
267
- repository_name , version_prefix or "" )
444
+ source_url = self .get_matching_source_url (component_version , repository_name , version_prefix )
268
445
269
446
# component["RepositoryUrl"] = repository_name
270
447
return source_url
@@ -284,26 +461,15 @@ def get_github_source_url(self, github_url: str, version: str) -> str:
284
461
285
462
if self .verbose :
286
463
print_text (" repo_name:" , repo_name )
287
-
288
- tag_info = self .get_github_info (repo_name , self .github_name , self .github_token )
289
- tag_info_checked = self .check_for_github_error (tag_info )
290
- return self .get_matching_tag (tag_info_checked , version , github_url )
464
+ return self .get_matching_source_url (version , repo_name )
291
465
292
466
def check_for_github_error(self, tag_info: get_github_info_type) -> List[Dict[str, Any]]:
    """Removed; kept only so that old callers fail with a clear message.

    This method was introduced to check the output of
    get_github_info() for errors.  It was removed because
    get_github_info() was removed; use get_matching_source_url()
    instead.

    Raises:
        NotImplementedError: always.
    """
    # the message names the method that actually replaced this one
    raise NotImplementedError(
        "Removed with introduction of get_matching_source_url!")
307
473
308
474
def get_matching_tag (self , tag_info : List [Dict [str , Any ]], version : str , github_url : str ,
309
475
version_prefix : str = "" ) -> str :
@@ -369,7 +535,7 @@ def get_source_url_from_release(self, release_id: str) -> str:
369
535
if release_details :
370
536
source_url = release_details .get ("sourceCodeDownloadurl" , "" )
371
537
if self .verbose :
372
- print ("getting source url from get from sw360 for release_id " + release_id )
538
+ print (" getting source url from get from sw360 for release_id " + release_id )
373
539
if source_url != "" :
374
540
return source_url
375
541
break
@@ -468,7 +634,8 @@ def find_source_url_recursive_by_sw360(self, component: Component) -> str:
468
634
469
635
@staticmethod
470
636
def find_source_url_by_language (component : Component ) -> str :
471
- capycli .dependencies .javascript .GetJavascriptDependencies ().try_find_component_metadata (component , "" )
637
+ if hasattr (capycli , 'dependencies' ):
638
+ capycli .dependencies .javascript .GetJavascriptDependencies ().try_find_component_metadata (component , "" )
472
639
url = CycloneDxSupport .get_ext_ref_source_url (component )
473
640
if isinstance (url , XsUri ):
474
641
return url ._uri
0 commit comments