Skip to content

Commit d6c1eca

Browse files
committed
perf: fast checking
1 parent 30c71fc commit d6c1eca

3 files changed

Lines changed: 116 additions & 74 deletions

File tree

.pre-commit-config.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ repos:
7272
pass_filenames: false
7373
additional_dependencies:
7474
- httpx
75+
- httpx-limiter
7576
- httpx-retries
7677
- types-jsonschema
7778
- pillow

scripts/pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ classifiers = [
2020
dynamic = [ "version" ]
2121
dependencies = [
2222
"httpx",
23+
"httpx-limiter[aiolimiter]",
2324
"httpx-retries",
2425
"jsonschema>=4.25.1",
2526
"pillow>=12",

scripts/src/ecosystem_scripts/validate_registry.py

Lines changed: 114 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -4,28 +4,34 @@
44
from __future__ import annotations
55

66
import argparse
7+
import asyncio
78
import json
89
import os
910
import re
1011
import shutil
1112
import sys
12-
from collections import defaultdict
1313
from dataclasses import KW_ONLY, dataclass, field
1414
from importlib.resources import files
1515
from pathlib import Path
1616
from textwrap import dedent
17-
from typing import TYPE_CHECKING, cast
17+
from typing import TYPE_CHECKING, cast, override
1818

1919
import httpx
2020
import jsonschema
2121
import yaml
22+
from httpx_limiter import ( # type: ignore[attr-defined]
23+
AbstractRateLimiterRepository,
24+
AsyncMultiRateLimitedTransport,
25+
Rate,
26+
)
27+
from httpx_limiter.aiolimiter import AiolimiterAsyncLimiter # type: ignore[attr-defined]
2228
from httpx_retries import Retry, RetryTransport
2329
from PIL import Image
2430

2531
from ._logging import log, setup_logging
2632

2733
if TYPE_CHECKING:
28-
from collections.abc import Iterable, Mapping, Sequence
34+
from collections.abc import Awaitable, Generator, Iterable, Mapping, Sequence
2935
from importlib.resources.abc import Traversable
3036

3137
from .schema import ScverseEcosystemPackages # pyright: ignore[reportMissingModuleSource]
@@ -40,16 +46,6 @@ class ValidationError(Exception):
4046
pass
4147

4248

43-
class ErrorList(list[Exception]):
44-
"""List of error messages. Ignores None objects, and logs an error when one gets added."""
45-
46-
def append(self, obj: Exception | None) -> None:
47-
if obj is None:
48-
return
49-
log.error(f"Validation error: {obj}")
50-
return super().append(obj)
51-
52-
5349
RE_RTD = re.compile(
5450
r"https?://(?P<domain>.*\.(?:readthedocs\.io|rtfd\.io|readthedocs-hosted\.com))/(?P<version>en/[^/]+)(?P<path>.*)"
5551
)
@@ -59,15 +55,15 @@ def append(self, obj: Exception | None) -> None:
5955
class HTTPValidator[E = str]:
6056
"""Validate HTTP URLs."""
6157

62-
client: httpx.Client
58+
client: httpx.AsyncClient
6359
_: KW_ONLY
6460
validated: set[E] = field(default_factory=set)
6561

6662

6763
class LinkChecker(HTTPValidator):
6864
"""Track known links and validate URLs."""
6965

70-
def __call__(self, url: str, context: str) -> None | ValidationError:
66+
async def __call__(self, url: str, context: str) -> None | ValidationError:
7167
"""Check if URL is duplicate, validate it exists, and register it.
7268
7369
Parameters
@@ -88,7 +84,7 @@ def __call__(self, url: str, context: str) -> None | ValidationError:
8884
return ValidationError(msg)
8985

9086
try:
91-
response = self.client.head(url)
87+
response = await self.client.head(url)
9288
except Exception as e:
9389
msg = f"URL {url} is not reachable: {e}"
9490
return ValidationError(msg)
@@ -107,7 +103,7 @@ class GitHubUserValidator(HTTPValidator):
107103

108104
github_token: str | None = None
109105

110-
def __call__(self, usernames: Sequence[str], context: str) -> None | ValidationError:
106+
async def __call__(self, usernames: Sequence[str], context: str) -> None | ValidationError:
111107
"""Validate that the given GitHub usernames exist.
112108
113109
Parameters
@@ -128,7 +124,7 @@ def __call__(self, usernames: Sequence[str], context: str) -> None | ValidationE
128124
q = "\n".join(f"user{i}: user(login: {json.dumps(name)}) {{ login }}" for i, name in enumerate(unvalidated))
129125

130126
try:
131-
response = self.client.post(
127+
response = await self.client.post(
132128
"https://api.github.com/graphql", headers=headers, json={"query": f"query {{ {q} }}"}
133129
)
134130
except Exception as e:
@@ -154,7 +150,7 @@ def __call__(self, usernames: Sequence[str], context: str) -> None | ValidationE
154150
class PyPIValidator(HTTPValidator):
155151
"""Validate PyPI package names against the PyPI API."""
156152

157-
def __call__(self, package_name: str, context: str) -> None | ValidationError:
153+
async def __call__(self, package_name: str, context: str) -> None | ValidationError:
158154
"""Validate that a PyPI package exists.
159155
160156
Parameters
@@ -168,7 +164,7 @@ def __call__(self, package_name: str, context: str) -> None | ValidationError:
168164
return None
169165

170166
try:
171-
response = self.client.head(f"https://pypi.org/pypi/{package_name}/json")
167+
response = await self.client.head(f"https://pypi.org/pypi/{package_name}/json")
172168
except Exception as e:
173169
msg = f"{context}: Failed to validate PyPI package {package_name!r}: {e}"
174170
return ValidationError(msg)
@@ -189,7 +185,7 @@ def __call__(self, package_name: str, context: str) -> None | ValidationError:
189185
class CondaValidator(HTTPValidator):
190186
"""Validate Conda package identifiers using the Anaconda API."""
191187

192-
def __call__(self, package_spec: str, context: str) -> None | ValidationError:
188+
async def __call__(self, package_spec: str, context: str) -> None | ValidationError:
193189
"""Validate that a Conda package exists.
194190
195191
Parameters
@@ -211,7 +207,7 @@ def __call__(self, package_spec: str, context: str) -> None | ValidationError:
211207

212208
# Check package exists on the channel
213209
try:
214-
response = self.client.head(f"https://api.anaconda.org/package/{channel}/{package_name}")
210+
response = await self.client.head(f"https://api.anaconda.org/package/{channel}/{package_name}")
215211
except Exception as e:
216212
msg = f"{context}: Failed to validate Conda package '{package_spec}': {e}"
217213
return ValidationError(msg)
@@ -232,7 +228,7 @@ def __call__(self, package_spec: str, context: str) -> None | ValidationError:
232228
class CRANValidator(HTTPValidator):
233229
"""Validate CRAN package names using the CRAN API."""
234230

235-
def __call__(self, package_name: str, context: str) -> None | ValidationError:
231+
async def __call__(self, package_name: str, context: str) -> None | ValidationError:
236232
"""Validate that a CRAN package exists.
237233
238234
Parameters
@@ -247,7 +243,7 @@ def __call__(self, package_name: str, context: str) -> None | ValidationError:
247243

248244
# CRAN packages can be checked via the packages database
249245
try:
250-
response = self.client.head(f"https://crandb.r-pkg.org/{package_name}")
246+
response = await self.client.head(f"https://crandb.r-pkg.org/{package_name}")
251247
except Exception as e:
252248
msg = f"{context}: Failed to validate CRAN package '{package_name}': {e}"
253249
return ValidationError(msg)
@@ -268,7 +264,7 @@ def __call__(self, package_name: str, context: str) -> None | ValidationError:
268264
class BioconductorValidator(HTTPValidator):
269265
"""Validate Bioconductor package names using the Bioconductor API."""
270266

271-
def __call__(self, package_name: str, context: str) -> None | ValidationError:
267+
async def __call__(self, package_name: str, context: str) -> None | ValidationError:
272268
"""Validate that a Bioconductor package exists.
273269
274270
Parameters
@@ -283,7 +279,7 @@ def __call__(self, package_name: str, context: str) -> None | ValidationError:
283279

284280
# Bioconductor packages can be checked via their web API
285281
try:
286-
response = self.client.head(f"https://bioconductor.org/packages/{package_name}/")
282+
response = await self.client.head(f"https://bioconductor.org/packages/{package_name}/")
287283
except Exception as e:
288284
msg = f"{context}: Failed to validate Bioconductor package '{package_name}': {e}"
289285
return ValidationError(msg)
@@ -300,11 +296,11 @@ def __call__(self, package_name: str, context: str) -> None | ValidationError:
300296
return None
301297

302298

303-
def check_image(img_path: Path) -> None | ValidationError:
299+
def check_image(img_path: Path) -> None:
304300
"""Validate that the image exists and is either an SVG or fits into the 512x512 bounding box; raise `ValidationError` otherwise."""
305301
if not img_path.exists():
306302
msg = f"Image does not exist: {img_path}"
307-
return ValidationError(msg)
303+
raise ValidationError(msg)
308304
if img_path.suffix == ".svg":
309305
return None
310306
with Image.open(img_path) as img:
@@ -317,77 +313,120 @@ def check_image(img_path: Path) -> None | ValidationError:
317313
Actual dimensions (width, height): ({width}, ({height}))."
318314
"""
319315
)
320-
return ValidationError(msg)
316+
raise ValidationError(msg)
321317
return None
322318

323319

324-
def validate_packages( # noqa: C901
325-
schema_file: Traversable, registry_dir: Path, github_token: str | None = None
326-
) -> tuple[Mapping[str, Sequence[Exception]], Sequence[ScverseEcosystemPackages]]:
327-
"""Find all package `meta.yaml` files in the registry dir and yield package records."""
328-
schema = json.loads(schema_file.read_bytes())
320+
class DomainBasedRateLimiterRepository(AbstractRateLimiterRepository):
321+
"""Rate-limit requests per host: limiters are keyed by the request's host and all created with the same rate."""
322+
323+
@override
324+
def get_identifier(self, request: httpx.Request) -> str:
325+
return request.url.host
329326

330-
# Create HTTP client with retry configuration using httpx_retries transport
331-
retry_transport = RetryTransport(retry=Retry(total=3, backoff_factor=2))
332-
retry_client = httpx.Client(follow_redirects=True, timeout=30.0, transport=retry_transport)
327+
@override
328+
def create(self, request: httpx.Request) -> AiolimiterAsyncLimiter:
329+
return AiolimiterAsyncLimiter.create(Rate.create(magnitude=25))
333330

334-
# using different link checkers,
335-
# because each of them may point to the same URL and this wouldn't qualify as duplicate
336-
check_home = LinkChecker(retry_client)
337-
check_docs = LinkChecker(retry_client)
338-
check_tutorial = LinkChecker(retry_client)
339331

340-
check_gh_users = GitHubUserValidator(retry_client, github_token)
341-
check_pypi = PyPIValidator(retry_client)
342-
check_conda = CondaValidator(retry_client)
343-
check_cran = CRANValidator(retry_client)
344-
check_bioc = BioconductorValidator(retry_client)
332+
@dataclass
333+
class Checker:
334+
schema_file: Traversable
335+
registry_dir: Path
336+
_: KW_ONLY
337+
github_token: str | None = None
345338

346-
errors: defaultdict[str, ErrorList] = defaultdict(ErrorList)
347-
package_metadata: list[ScverseEcosystemPackages] = []
339+
def __post_init__(self) -> None:
340+
self.schema = json.loads(self.schema_file.read_bytes())
348341

349-
for tmp_meta_file in sorted(registry_dir.rglob("meta.yaml"), key=lambda x: x.parent.name):
350-
pkg_id = tmp_meta_file.parent.name
351-
pkg_errors = errors[pkg_id]
352-
log.info(f"Validating {pkg_id}")
353-
with tmp_meta_file.open() as f:
342+
# Create HTTP client with per-host rate limiting (httpx_limiter) wrapped in a retry transport (httpx_retries)
343+
transport: httpx.AsyncBaseTransport = AsyncMultiRateLimitedTransport.create(
344+
repository=DomainBasedRateLimiterRepository()
345+
)
346+
transport = RetryTransport(transport, Retry(total=3, backoff_factor=2))
347+
self.client = httpx.AsyncClient(follow_redirects=True, timeout=30.0, transport=transport)
348+
349+
# using different link checkers,
350+
# because each of them may point to the same URL and this wouldn't qualify as duplicate
351+
self.check_home = LinkChecker(self.client)
352+
self.check_docs = LinkChecker(self.client)
353+
self.check_tutorial = LinkChecker(self.client)
354+
355+
self.check_gh_users = GitHubUserValidator(self.client, self.github_token)
356+
self.check_pypi = PyPIValidator(self.client)
357+
self.check_conda = CondaValidator(self.client)
358+
self.check_cran = CRANValidator(self.client)
359+
self.check_bioc = BioconductorValidator(self.client)
360+
361+
async def validate_packages(self) -> tuple[Mapping[str, Sequence[Exception]], Sequence[ScverseEcosystemPackages]]:
362+
"""Find all package `meta.yaml` files in the registry dir, check them concurrently, and return the per-package errors and the package records."""
363+
364+
errors: dict[str, list[ValidationError | jsonschema.ValidationError]] = {}
365+
package_metadata: list[ScverseEcosystemPackages] = []
366+
367+
async with self.client:
368+
async for check in asyncio.as_completed(
369+
self.check_package(meta_path)
370+
for meta_path in sorted(self.registry_dir.rglob("meta.yaml"), key=lambda x: x.parent.name)
371+
):
372+
pkg_id, tmp_meta, pkg_errors = await check
373+
errors[pkg_id] = pkg_errors
374+
package_metadata.append(tmp_meta)
375+
376+
return errors, package_metadata
377+
378+
async def check_package(
379+
self, meta_file: Path
380+
) -> tuple[str, ScverseEcosystemPackages, list[ValidationError | jsonschema.ValidationError]]:
381+
pkg_id = meta_file.parent.name
382+
with meta_file.open() as f:
354383
tmp_meta = cast("ScverseEcosystemPackages", yaml.load(f, yaml.SafeLoader))
355384

385+
pkg_errors: list[ValidationError | jsonschema.ValidationError] = []
356386
try:
357-
jsonschema.validate(tmp_meta, schema)
387+
jsonschema.validate(tmp_meta, self.schema)
358388
except jsonschema.ValidationError as e:
359389
pkg_errors.append(e)
360390

391+
# Check logo (if available) and make path relative to root of registry
392+
if "logo" in tmp_meta:
393+
img_path = self.registry_dir / pkg_id / tmp_meta["logo"]
394+
try:
395+
check_image(img_path)
396+
except ValidationError as e:
397+
pkg_errors.append(e)
398+
tmp_meta["logo"] = str(img_path)
399+
400+
log.info(f"Validating {pkg_id}")
401+
async for check in asyncio.as_completed(self.http_checks(pkg_id, tmp_meta)):
402+
try:
403+
await check
404+
except ValidationError as e:
405+
pkg_errors.append(e)
406+
407+
return pkg_id, tmp_meta, pkg_errors
408+
409+
def http_checks(self, pkg_id: str, tmp_meta: ScverseEcosystemPackages) -> Generator[Awaitable[Exception | None]]:
361410
# Check and register all links
362-
pkg_errors.append(check_home(tmp_meta["project_home"], pkg_id))
363-
pkg_errors.append(check_docs(tmp_meta["documentation_home"], pkg_id))
411+
yield self.check_home(tmp_meta["project_home"], pkg_id)
412+
yield self.check_docs(tmp_meta["documentation_home"], pkg_id)
364413
if url := tmp_meta.get("tutorials_home"):
365-
pkg_errors.append(check_tutorial(url, pkg_id))
414+
yield self.check_tutorial(url, pkg_id)
366415

367416
# Validate GitHub usernames in contact field
368417
if usernames := tmp_meta.get("contact"):
369-
pkg_errors.append(check_gh_users(usernames, pkg_id))
418+
yield self.check_gh_users(usernames, pkg_id)
370419

371420
# Validate install packages
372421
if install_info := tmp_meta.get("install"):
373422
if pypi_name := install_info.get("pypi"):
374-
pkg_errors.append(check_pypi(pypi_name, pkg_id))
423+
yield self.check_pypi(pypi_name, pkg_id)
375424
if conda_name := install_info.get("conda"):
376-
pkg_errors.append(check_conda(conda_name, pkg_id))
425+
yield self.check_conda(conda_name, pkg_id)
377426
if cran_name := install_info.get("cran"):
378-
pkg_errors.append(check_cran(cran_name, pkg_id))
427+
yield self.check_cran(cran_name, pkg_id)
379428
if bioconductor_name := install_info.get("bioconductor"):
380-
pkg_errors.append(check_bioc(bioconductor_name, pkg_id))
381-
382-
# Check logo (if available) and make path relative to root of registry
383-
if "logo" in tmp_meta:
384-
img_path = registry_dir / pkg_id / tmp_meta["logo"]
385-
pkg_errors.append(check_image(img_path))
386-
tmp_meta["logo"] = str(img_path)
387-
388-
package_metadata.append(tmp_meta)
389-
390-
return errors, package_metadata
429+
yield self.check_bioc(bioconductor_name, pkg_id)
391430

392431

393432
def make_output(
@@ -463,7 +502,8 @@ def main(args: Sequence[str] | None = None) -> None:
463502
parsed_args.outdir.mkdir(parents=True)
464503

465504
log.info("Starting validation")
466-
errors, packages = validate_packages(schema_file, parsed_args.registry_dir, github_token)
505+
checker = Checker(schema_file, parsed_args.registry_dir, github_token=github_token)
506+
errors, packages = asyncio.run(checker.validate_packages())
467507

468508
if any(errors.values()):
469509
log.error("Validation error occured in at least one package. Exiting.")

0 commit comments

Comments
 (0)