44from __future__ import annotations
55
66import argparse
7+ import asyncio
78import json
89import os
910import re
1011import shutil
1112import sys
12- from collections import defaultdict
1313from dataclasses import KW_ONLY , dataclass , field
1414from importlib .resources import files
1515from pathlib import Path
1616from textwrap import dedent
17- from typing import TYPE_CHECKING , cast
17+ from typing import TYPE_CHECKING , cast , override
1818
1919import httpx
2020import jsonschema
2121import yaml
22+ from httpx_limiter import ( # type: ignore[attr-defined]
23+ AbstractRateLimiterRepository ,
24+ AsyncMultiRateLimitedTransport ,
25+ Rate ,
26+ )
27+ from httpx_limiter .aiolimiter import AiolimiterAsyncLimiter # type: ignore[attr-defined]
2228from httpx_retries import Retry , RetryTransport
2329from PIL import Image
2430
2531from ._logging import log , setup_logging
2632
2733if TYPE_CHECKING :
28- from collections .abc import Iterable , Mapping , Sequence
34+ from collections .abc import Awaitable , Generator , Iterable , Mapping , Sequence
2935 from importlib .resources .abc import Traversable
3036
3137 from .schema import ScverseEcosystemPackages # pyright: ignore[reportMissingModuleSource]
@@ -40,16 +46,6 @@ class ValidationError(Exception):
4046 pass
4147
4248
43- class ErrorList (list [Exception ]):
44- """List of error messages. Ignores None objects, and logs an error when one gets added."""
45-
46- def append (self , obj : Exception | None ) -> None :
47- if obj is None :
48- return
49- log .error (f"Validation error: { obj } " )
50- return super ().append (obj )
51-
52-
5349RE_RTD = re .compile (
5450 r"https?://(?P<domain>.*\.(?:readthedocs\.io|rtfd\.io|readthedocs-hosted\.com))/(?P<version>en/[^/]+)(?P<path>.*)"
5551)
@@ -59,15 +55,15 @@ def append(self, obj: Exception | None) -> None:
5955class HTTPValidator [E = str ]:
6056 """Validate HTTP URLs."""
6157
62- client : httpx .Client
58+ client : httpx .AsyncClient
6359 _ : KW_ONLY
6460 validated : set [E ] = field (default_factory = set )
6561
6662
6763class LinkChecker (HTTPValidator ):
6864 """Track known links and validate URLs."""
6965
70- def __call__ (self , url : str , context : str ) -> None | ValidationError :
66+ async def __call__ (self , url : str , context : str ) -> None | ValidationError :
7167 """Check if URL is duplicate, validate it exists, and register it.
7268
7369 Parameters
@@ -88,7 +84,7 @@ def __call__(self, url: str, context: str) -> None | ValidationError:
8884 return ValidationError (msg )
8985
9086 try :
91- response = self .client .head (url )
87+ response = await self .client .head (url )
9288 except Exception as e :
9389 msg = f"URL { url } is not reachable: { e } "
9490 return ValidationError (msg )
@@ -107,7 +103,7 @@ class GitHubUserValidator(HTTPValidator):
107103
108104 github_token : str | None = None
109105
110- def __call__ (self , usernames : Sequence [str ], context : str ) -> None | ValidationError :
106+ async def __call__ (self , usernames : Sequence [str ], context : str ) -> None | ValidationError :
111107 """Validate that a GitHub username exists.
112108
113109 Parameters
@@ -128,7 +124,7 @@ def __call__(self, usernames: Sequence[str], context: str) -> None | ValidationE
128124 q = "\n " .join (f"user{ i } : user(login: { json .dumps (name )} ) {{ login }}" for i , name in enumerate (unvalidated ))
129125
130126 try :
131- response = self .client .post (
127+ response = await self .client .post (
132128 "https://api.github.com/graphql" , headers = headers , json = {"query" : f"query {{ { q } }}" }
133129 )
134130 except Exception as e :
@@ -154,7 +150,7 @@ def __call__(self, usernames: Sequence[str], context: str) -> None | ValidationE
154150class PyPIValidator (HTTPValidator ):
155151 """Validate PyPI package names against the PyPI API."""
156152
157- def __call__ (self , package_name : str , context : str ) -> None | ValidationError :
153+ async def __call__ (self , package_name : str , context : str ) -> None | ValidationError :
158154 """Validate that a PyPI package exists.
159155
160156 Parameters
@@ -168,7 +164,7 @@ def __call__(self, package_name: str, context: str) -> None | ValidationError:
168164 return None
169165
170166 try :
171- response = self .client .head (f"https://pypi.org/pypi/{ package_name } /json" )
167+ response = await self .client .head (f"https://pypi.org/pypi/{ package_name } /json" )
172168 except Exception as e :
173169 msg = f"{ context } : Failed to validate PyPI package { package_name !r} : { e } "
174170 return ValidationError (msg )
@@ -189,7 +185,7 @@ def __call__(self, package_name: str, context: str) -> None | ValidationError:
189185class CondaValidator (HTTPValidator ):
190186 """Validate Conda package identifiers using the Anaconda API."""
191187
192- def __call__ (self , package_spec : str , context : str ) -> None | ValidationError :
188+ async def __call__ (self , package_spec : str , context : str ) -> None | ValidationError :
193189 """Validate that a Conda package exists.
194190
195191 Parameters
@@ -211,7 +207,7 @@ def __call__(self, package_spec: str, context: str) -> None | ValidationError:
211207
212208 # Check package exists on the channel
213209 try :
214- response = self .client .head (f"https://api.anaconda.org/package/{ channel } /{ package_name } " )
210+ response = await self .client .head (f"https://api.anaconda.org/package/{ channel } /{ package_name } " )
215211 except Exception as e :
216212 msg = f"{ context } : Failed to validate Conda package '{ package_spec } ': { e } "
217213 return ValidationError (msg )
@@ -232,7 +228,7 @@ def __call__(self, package_spec: str, context: str) -> None | ValidationError:
232228class CRANValidator (HTTPValidator ):
233229 """Validate CRAN package names using the CRAN API."""
234230
235- def __call__ (self , package_name : str , context : str ) -> None | ValidationError :
231+ async def __call__ (self , package_name : str , context : str ) -> None | ValidationError :
236232 """Validate that a CRAN package exists.
237233
238234 Parameters
@@ -247,7 +243,7 @@ def __call__(self, package_name: str, context: str) -> None | ValidationError:
247243
248244 # CRAN packages can be checked via the packages database
249245 try :
250- response = self .client .head (f"https://crandb.r-pkg.org/{ package_name } " )
246+ response = await self .client .head (f"https://crandb.r-pkg.org/{ package_name } " )
251247 except Exception as e :
252248 msg = f"{ context } : Failed to validate CRAN package '{ package_name } ': { e } "
253249 return ValidationError (msg )
@@ -268,7 +264,7 @@ def __call__(self, package_name: str, context: str) -> None | ValidationError:
268264class BioconductorValidator (HTTPValidator ):
269265 """Validate Bioconductor package names using the Bioconductor API."""
270266
271- def __call__ (self , package_name : str , context : str ) -> None | ValidationError :
267+ async def __call__ (self , package_name : str , context : str ) -> None | ValidationError :
272268 """Validate that a Bioconductor package exists.
273269
274270 Parameters
@@ -283,7 +279,7 @@ def __call__(self, package_name: str, context: str) -> None | ValidationError:
283279
284280 # Bioconductor packages can be checked via their web API
285281 try :
286- response = self .client .head (f"https://bioconductor.org/packages/{ package_name } /" )
282+ response = await self .client .head (f"https://bioconductor.org/packages/{ package_name } /" )
287283 except Exception as e :
288284 msg = f"{ context } : Failed to validate Bioconductor package '{ package_name } ': { e } "
289285 return ValidationError (msg )
@@ -300,11 +296,11 @@ def __call__(self, package_name: str, context: str) -> None | ValidationError:
300296 return None
301297
302298
303- def check_image (img_path : Path ) -> None | ValidationError :
299+ def check_image (img_path : Path ) -> None :
304300 """Validates that the image exists and that it is either a SVG or fits into the 512x512 bounding box."""
305301 if not img_path .exists ():
306302 msg = f"Image does not exist: { img_path } "
307- return ValidationError (msg )
303+ raise ValidationError (msg )
308304 if img_path .suffix == ".svg" :
309305 return None
310306 with Image .open (img_path ) as img :
@@ -317,77 +313,120 @@ def check_image(img_path: Path) -> None | ValidationError:
317313 Actual dimensions (width, height): ({ width } , ({ height } ))."
318314 """
319315 )
320- return ValidationError (msg )
316+ raise ValidationError (msg )
321317 return None
322318
323319
324- def validate_packages ( # noqa: C901
325- schema_file : Traversable , registry_dir : Path , github_token : str | None = None
326- ) -> tuple [Mapping [str , Sequence [Exception ]], Sequence [ScverseEcosystemPackages ]]:
327- """Find all package `meta.yaml` files in the registry dir and yield package records."""
328- schema = json .loads (schema_file .read_bytes ())
class DomainBasedRateLimiterRepository(AbstractRateLimiterRepository):
    """Rate limiter repository keyed by the host of the outgoing request.

    Every host is given a limiter built from the same rate; keying by host
    keeps the budgets of different services separate from each other.
    """

    @override
    def get_identifier(self, request: httpx.Request) -> str:
        """Return the host of the request URL, used as the limiter key."""
        host = request.url.host
        return host

    @override
    def create(self, request: httpx.Request) -> AiolimiterAsyncLimiter:
        """Build the limiter used for the host of `request`.

        The rate has a magnitude of 25 over `Rate`'s default duration
        (presumably one second -- confirm against httpx-limiter).
        """
        rate = Rate.create(magnitude=25)
        return AiolimiterAsyncLimiter.create(rate)
333330
334- # using different link checkers,
335- # because each of them may point to the same URL and this wouldn't qualify as duplicate
336- check_home = LinkChecker (retry_client )
337- check_docs = LinkChecker (retry_client )
338- check_tutorial = LinkChecker (retry_client )
339331
340- check_gh_users = GitHubUserValidator (retry_client , github_token )
341- check_pypi = PyPIValidator (retry_client )
342- check_conda = CondaValidator (retry_client )
343- check_cran = CRANValidator (retry_client )
344- check_bioc = BioconductorValidator (retry_client )
332+ @dataclass
333+ class Checker :
334+ schema_file : Traversable
335+ registry_dir : Path
336+ _ : KW_ONLY
337+ github_token : str | None = None
345338
346- errors : defaultdict [ str , ErrorList ] = defaultdict ( ErrorList )
347- package_metadata : list [ ScverseEcosystemPackages ] = []
339+ def __post_init__ ( self ) -> None :
340+ self . schema = json . loads ( self . schema_file . read_bytes ())
348341
349- for tmp_meta_file in sorted (registry_dir .rglob ("meta.yaml" ), key = lambda x : x .parent .name ):
350- pkg_id = tmp_meta_file .parent .name
351- pkg_errors = errors [pkg_id ]
352- log .info (f"Validating { pkg_id } " )
353- with tmp_meta_file .open () as f :
342+ # Create HTTP client with retry configuration using httpx_retries transport
343+ transport : httpx .AsyncBaseTransport = AsyncMultiRateLimitedTransport .create (
344+ repository = DomainBasedRateLimiterRepository ()
345+ )
346+ transport = RetryTransport (transport , Retry (total = 3 , backoff_factor = 2 ))
347+ self .client = httpx .AsyncClient (follow_redirects = True , timeout = 30.0 , transport = transport )
348+
349+ # using different link checkers,
350+ # because each of them may point to the same URL and this wouldn't qualify as duplicate
351+ self .check_home = LinkChecker (self .client )
352+ self .check_docs = LinkChecker (self .client )
353+ self .check_tutorial = LinkChecker (self .client )
354+
355+ self .check_gh_users = GitHubUserValidator (self .client , self .github_token )
356+ self .check_pypi = PyPIValidator (self .client )
357+ self .check_conda = CondaValidator (self .client )
358+ self .check_cran = CRANValidator (self .client )
359+ self .check_bioc = BioconductorValidator (self .client )
360+
361+ async def validate_packages (self ) -> tuple [Mapping [str , Sequence [Exception ]], Sequence [ScverseEcosystemPackages ]]:
362+ """Find all package `meta.yaml` files in the registry dir and yield package records."""
363+
364+ errors : dict [str , list [ValidationError | jsonschema .ValidationError ]] = {}
365+ package_metadata : list [ScverseEcosystemPackages ] = []
366+
367+ async with self .client :
368+ async for check in asyncio .as_completed (
369+ self .check_package (meta_path )
370+ for meta_path in sorted (self .registry_dir .rglob ("meta.yaml" ), key = lambda x : x .parent .name )
371+ ):
372+ pkg_id , tmp_meta , pkg_errors = await check
373+ errors [pkg_id ] = pkg_errors
374+ package_metadata .append (tmp_meta )
375+
376+ return errors , package_metadata
377+
378+ async def check_package (
379+ self , meta_file : Path
380+ ) -> tuple [str , ScverseEcosystemPackages , list [ValidationError | jsonschema .ValidationError ]]:
381+ pkg_id = meta_file .parent .name
382+ with meta_file .open () as f :
354383 tmp_meta = cast ("ScverseEcosystemPackages" , yaml .load (f , yaml .SafeLoader ))
355384
385+ pkg_errors : list [ValidationError | jsonschema .ValidationError ] = []
356386 try :
357- jsonschema .validate (tmp_meta , schema )
387+ jsonschema .validate (tmp_meta , self . schema )
358388 except jsonschema .ValidationError as e :
359389 pkg_errors .append (e )
360390
391+ # Check logo (if available) and make path relative to root of registry
392+ if "logo" in tmp_meta :
393+ img_path = self .registry_dir / pkg_id / tmp_meta ["logo" ]
394+ try :
395+ check_image (img_path )
396+ except ValidationError as e :
397+ pkg_errors .append (e )
398+ tmp_meta ["logo" ] = str (img_path )
399+
400+ log .info (f"Validating { pkg_id } " )
401+ async for check in asyncio .as_completed (self .http_checks (pkg_id , tmp_meta )):
402+ try :
403+ await check
404+ except ValidationError as e :
405+ pkg_errors .append (e )
406+
407+ return pkg_id , tmp_meta , pkg_errors
408+
409+ def http_checks (self , pkg_id : str , tmp_meta : ScverseEcosystemPackages ) -> Generator [Awaitable [Exception | None ]]:
361410 # Check and register all links
362- pkg_errors . append ( check_home (tmp_meta ["project_home" ], pkg_id ) )
363- pkg_errors . append ( check_docs (tmp_meta ["documentation_home" ], pkg_id ) )
411+ yield self . check_home (tmp_meta ["project_home" ], pkg_id )
412+ yield self . check_docs (tmp_meta ["documentation_home" ], pkg_id )
364413 if url := tmp_meta .get ("tutorials_home" ):
365- pkg_errors . append ( check_tutorial (url , pkg_id ) )
414+ yield self . check_tutorial (url , pkg_id )
366415
367416 # Validate GitHub usernames in contact field
368417 if usernames := tmp_meta .get ("contact" ):
369- pkg_errors . append ( check_gh_users (usernames , pkg_id ) )
418+ yield self . check_gh_users (usernames , pkg_id )
370419
371420 # Validate install packages
372421 if install_info := tmp_meta .get ("install" ):
373422 if pypi_name := install_info .get ("pypi" ):
374- pkg_errors . append ( check_pypi (pypi_name , pkg_id ) )
423+ yield self . check_pypi (pypi_name , pkg_id )
375424 if conda_name := install_info .get ("conda" ):
376- pkg_errors . append ( check_conda (conda_name , pkg_id ) )
425+ yield self . check_conda (conda_name , pkg_id )
377426 if cran_name := install_info .get ("cran" ):
378- pkg_errors . append ( check_cran (cran_name , pkg_id ) )
427+ yield self . check_cran (cran_name , pkg_id )
379428 if bioconductor_name := install_info .get ("bioconductor" ):
380- pkg_errors .append (check_bioc (bioconductor_name , pkg_id ))
381-
382- # Check logo (if available) and make path relative to root of registry
383- if "logo" in tmp_meta :
384- img_path = registry_dir / pkg_id / tmp_meta ["logo" ]
385- pkg_errors .append (check_image (img_path ))
386- tmp_meta ["logo" ] = str (img_path )
387-
388- package_metadata .append (tmp_meta )
389-
390- return errors , package_metadata
429+ yield self .check_bioc (bioconductor_name , pkg_id )
391430
392431
393432def make_output (
@@ -463,7 +502,8 @@ def main(args: Sequence[str] | None = None) -> None:
463502 parsed_args .outdir .mkdir (parents = True )
464503
465504 log .info ("Starting validation" )
466- errors , packages = validate_packages (schema_file , parsed_args .registry_dir , github_token )
505+ checker = Checker (schema_file , parsed_args .registry_dir , github_token = github_token )
506+ errors , packages = asyncio .run (checker .validate_packages ())
467507
468508 if any (errors .values ()):
469509 log .error ("Validation error occured in at least one package. Exiting." )
0 commit comments