Skip to content

Commit fe777ce

Browse files
grst and flying-sheep authored
Retry link checking with backoff (#321)
Co-authored-by: Philipp A. <flying-sheep@web.de>
1 parent 9d58706 commit fe777ce

4 files changed

Lines changed: 68 additions & 24 deletions

File tree

.pre-commit-config.yaml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -72,6 +72,7 @@ repos:
7272
pass_filenames: false
7373
additional_dependencies:
7474
- httpx
75+
- httpx-retries
7576
- types-jsonschema
7677
- pillow
7778
- pygithub

scripts/pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@ classifiers = [
2222
dynamic = [ "version" ]
2323
dependencies = [
2424
"httpx",
25+
"httpx-retries",
2526
"jsonschema>=4.25.1",
2627
"pillow>=12",
2728
"pygithub",

scripts/src/ecosystem_scripts/validate_registry.py

Lines changed: 52 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@
1717
import httpx
1818
import jsonschema
1919
import yaml
20+
from httpx_retries import Retry, RetryTransport
2021
from PIL import Image
2122

2223
from ._logging import log, setup_logging
@@ -50,8 +51,9 @@ def append(self, obj: Exception | None) -> None:
5051
class LinkChecker:
5152
"""Track known links and validate URLs."""
5253

53-
def __init__(self) -> None:
54+
def __init__(self, client: httpx.Client) -> None:
5455
self.known_links: set[str] = set()
56+
self.client = client
5557

5658
def check_and_register(self, url: str, context: str) -> None | ValidationError:
5759
"""Check if URL is duplicate, validate it exists, and register it.
@@ -67,7 +69,12 @@ def check_and_register(self, url: str, context: str) -> None | ValidationError:
6769
msg = f"{context}: Duplicate link: {url}"
6870
return ValidationError(msg)
6971

70-
response = httpx.head(url, follow_redirects=True)
72+
try:
73+
response = self.client.head(url)
74+
except Exception as e:
75+
msg = f"URL {url} is not reachable: {e}"
76+
return ValidationError(msg)
77+
7178
if response.status_code != httpx.codes.OK:
7279
msg = f"URL {url} is not reachable (error {response.status_code}). "
7380
return ValidationError(msg)
@@ -79,7 +86,8 @@ def check_and_register(self, url: str, context: str) -> None | ValidationError:
7986
class GitHubUserValidator:
8087
"""Validate GitHub usernames using the GitHub API."""
8188

82-
def __init__(self, github_token: str | None = None) -> None:
89+
def __init__(self, client: httpx.Client, github_token: str | None = None) -> None:
90+
self.client = client
8391
self.github_token = github_token
8492
self.validated_users: set[str] = set()
8593

@@ -102,7 +110,14 @@ def validate_usernames(self, usernames: Sequence[str], context: str) -> None | V
102110
headers["Authorization"] = f"token {self.github_token}"
103111

104112
q = "\n".join(f"user{i}: user(login: {json.dumps(name)}) {{ login }}" for i, name in enumerate(unvalidated))
105-
response = httpx.post("https://api.github.com/graphql", headers=headers, json={"query": f"query {{ {q} }}"})
113+
114+
try:
115+
response = self.client.post(
116+
"https://api.github.com/graphql", headers=headers, json={"query": f"query {{ {q} }}"}
117+
)
118+
except Exception as e:
119+
msg = f"{context}: Failed to validate GitHub users {unvalidated!r}: {e}"
120+
return ValidationError(msg)
106121

107122
if response.status_code != httpx.codes.OK:
108123
msg = f"{context}: Failed to validate GitHub users {unvalidated!r} (error {response.status_code})"
@@ -122,7 +137,8 @@ def validate_usernames(self, usernames: Sequence[str], context: str) -> None | V
122137
class PyPIValidator:
123138
"""Validate PyPI package names against the PyPI API."""
124139

125-
def __init__(self) -> None:
140+
def __init__(self, client: httpx.Client) -> None:
141+
self.client = client
126142
self.validated_packages: set[str] = set()
127143

128144
def validate_package(self, package_name: str, context: str) -> None | ValidationError:
@@ -138,7 +154,11 @@ def validate_package(self, package_name: str, context: str) -> None | Validation
138154
if package_name in self.validated_packages:
139155
return None
140156

141-
response = httpx.head(f"https://pypi.org/pypi/{package_name}/json", follow_redirects=True)
157+
try:
158+
response = self.client.head(f"https://pypi.org/pypi/{package_name}/json")
159+
except Exception as e:
160+
msg = f"{context}: Failed to validate PyPI package {package_name!r}: {e}"
161+
return ValidationError(msg)
142162

143163
if response.status_code == httpx.codes.NOT_FOUND:
144164
msg = f"{context}: PyPI package {package_name!r} does not exist"
@@ -155,7 +175,8 @@ def validate_package(self, package_name: str, context: str) -> None | Validation
155175
class CondaValidator:
156176
"""Validate Conda package identifiers using the Anaconda API."""
157177

158-
def __init__(self) -> None:
178+
def __init__(self, client: httpx.Client) -> None:
179+
self.client = client
159180
self.validated_packages: set[str] = set()
160181

161182
def validate_package(self, package_spec: str, context: str) -> None | ValidationError:
@@ -179,10 +200,11 @@ def validate_package(self, package_spec: str, context: str) -> None | Validation
179200
channel, package_name = package_spec.split("::", 1)
180201

181202
# Check package exists on the channel
182-
response = httpx.head(
183-
f"https://api.anaconda.org/package/{channel}/{package_name}",
184-
follow_redirects=True,
185-
)
203+
try:
204+
response = self.client.head(f"https://api.anaconda.org/package/{channel}/{package_name}")
205+
except Exception as e:
206+
msg = f"{context}: Failed to validate Conda package '{package_spec}': {e}"
207+
return ValidationError(msg)
186208

187209
if response.status_code == httpx.codes.NOT_FOUND:
188210
msg = f"{context}: Conda package '{package_spec}' does not exist"
@@ -199,7 +221,8 @@ def validate_package(self, package_spec: str, context: str) -> None | Validation
199221
class CRANValidator:
200222
"""Validate CRAN package names using the CRAN API."""
201223

202-
def __init__(self) -> None:
224+
def __init__(self, client: httpx.Client) -> None:
225+
self.client = client
203226
self.validated_packages: set[str] = set()
204227

205228
def validate_package(self, package_name: str, context: str) -> None | ValidationError:
@@ -216,10 +239,11 @@ def validate_package(self, package_name: str, context: str) -> None | Validation
216239
return None
217240

218241
# CRAN packages can be checked via the packages database
219-
response = httpx.head(
220-
f"https://crandb.r-pkg.org/{package_name}",
221-
follow_redirects=True,
222-
)
242+
try:
243+
response = self.client.head(f"https://crandb.r-pkg.org/{package_name}")
244+
except Exception as e:
245+
msg = f"{context}: Failed to validate CRAN package '{package_name}': {e}"
246+
return ValidationError(msg)
223247

224248
if response.status_code == httpx.codes.NOT_FOUND:
225249
msg = f"{context}: CRAN package '{package_name}' does not exist"
@@ -260,16 +284,20 @@ def validate_packages(
260284
"""Find all package `meta.yaml` files in the registry dir and yield package records."""
261285
schema = json.loads(schema_file.read_bytes())
262286

287+
# Create HTTP client with retry configuration using httpx_retries transport
288+
retry_transport = RetryTransport(retry=Retry(total=3, backoff_factor=2))
289+
retry_client = httpx.Client(follow_redirects=True, timeout=30.0, transport=retry_transport)
290+
263291
# using different link checkers,
264292
# because each of them may point to the same URL and this wouldn't qualify as duplicate
265-
link_checker_home = LinkChecker()
266-
link_checker_docs = LinkChecker()
267-
link_checker_tutorials = LinkChecker()
268-
269-
github_validator = GitHubUserValidator(github_token)
270-
pypi_validator = PyPIValidator()
271-
conda_validator = CondaValidator()
272-
cran_validator = CRANValidator()
293+
link_checker_home = LinkChecker(retry_client)
294+
link_checker_docs = LinkChecker(retry_client)
295+
link_checker_tutorials = LinkChecker(retry_client)
296+
297+
github_validator = GitHubUserValidator(retry_client, github_token)
298+
pypi_validator = PyPIValidator(retry_client)
299+
conda_validator = CondaValidator(retry_client)
300+
cran_validator = CRANValidator(retry_client)
273301

274302
errors: defaultdict[str, ErrorList] = defaultdict(ErrorList)
275303
package_metadata: list[ScverseEcosystemPackages] = []

scripts/uv.lock

Lines changed: 14 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments (0)