Skip to content

Commit bca152c

Browse files
add pipeline and retry logic for rate limiting
1 parent c8916d2 commit bca152c

9 files changed

Lines changed: 380 additions & 7 deletions

File tree

.github/workflows/ci.yml

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
2+
# Continuous integration: install the package with its dev extras and run the
# test suite on every supported Python version.
name: CI

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]

jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.9", "3.10", "3.11", "3.12"]

    steps:
    # v3 of checkout and v4 of setup-python run on a deprecated Node runtime;
    # use the current major versions instead.
    - uses: actions/checkout@v4

    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}

    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        # Quote the extras spec so the shell never treats [dev] as a glob.
        pip install -e ".[dev]"

    - name: Run tests
      run: |
        pytest -v tests/

.kiro/specs/real-api-tests/requirements.md

Whitespace-only changes.

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ with R34Client() as client:
9292
| `timeout` | float | 30.0 | Request timeout in seconds |
9393
| `posts_per_page` | int | 42 | Posts per page for pagination |
9494
| `headers` | dict | Browser-like headers | Custom HTTP headers |
95+
| `max_retries` | int | 5 | Max retries for rate limits (429) and transport errors |
9596

9697
## License
9798

pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,12 +26,14 @@ classifiers = [
2626
dependencies = [
2727
"httpx>=0.24.0",
2828
"selectolax>=0.3.0",
29+
"tenacity>=8.2.0",
2930
]
3031

3132
[project.optional-dependencies]
3233
dev = [
3334
"pytest>=7.0",
3435
"pytest-asyncio>=0.21",
36+
"respx>=0.20.0",
3537
]
3638

3739
[project.urls]

rule34scraper/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""RHI API Wrapper - A simple wrapper for rule34 image board."""
22

33
from .models import Post, Tag, PostComment, PostDetails, UserProfile
4-
from .client import R34Client, AsyncR34Client
4+
from .client import R34Client, AsyncR34Client, RateLimitError
55
from .parser import (
66
PostParser,
77
SidebarParser,
@@ -18,6 +18,7 @@
1818
"UserProfile",
1919
"R34Client",
2020
"AsyncR34Client",
21+
"RateLimitError",
2122
"PostParser",
2223
"SidebarParser",
2324
"PostDetailsParser",

rule34scraper/client.py

Lines changed: 64 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,35 @@
11
"""HTTP client for RHI API."""
22

3+
import logging
34
from pathlib import Path
45
from typing import Dict, List, Optional, Tuple, Union
56

67
import httpx
8+
from tenacity import (
9+
retry,
10+
stop_after_attempt,
11+
wait_exponential,
12+
retry_if_exception_type,
13+
before_sleep_log,
14+
)
715

816
from .models import Post, Tag, PostDetails, UserProfile
917
from .parser import PostParser, SidebarParser, PostDetailsParser, UserProfileParser
1018

19+
logger = logging.getLogger(__name__)
20+
21+
22+
class RateLimitError(Exception):
    """Signal that the server answered with HTTP 429 (Too Many Requests)."""
25+
26+
27+
def _check_rate_limit(response: httpx.Response) -> httpx.Response:
    """Pass a response through unless it signals rate limiting.

    Args:
        response: The response to inspect.

    Returns:
        The same response object when its status code is not 429.

    Raises:
        RateLimitError: If the status code is 429.
    """
    if response.status_code != 429:
        return response
    raise RateLimitError(f"Rate limited: {response.status_code}")
32+
1133
DEFAULT_BASE_URL = "https://rule34.xxx/index.php"
1234
DEFAULT_POSTS_PER_PAGE = 42
1335

@@ -27,6 +49,7 @@ def __init__(
2749
timeout: float = 30.0,
2850
posts_per_page: int = DEFAULT_POSTS_PER_PAGE,
2951
headers: Optional[Dict[str, str]] = None,
52+
max_retries: int = 5,
3053
):
3154
"""Initialize the client.
3255
@@ -35,12 +58,14 @@ def __init__(
3558
timeout: Request timeout in seconds.
3659
posts_per_page: Number of posts per page (for pagination offset).
3760
headers: Custom headers to use (merges with defaults).
61+
max_retries: Maximum number of retry attempts for rate-limited requests.
3862
"""
3963
self.base_url = base_url.rstrip("/")
4064
self.posts_per_page = posts_per_page
4165
self._timeout = timeout
4266
self._headers = {**DEFAULT_HEADERS, **(headers or {})}
4367
self._client: Optional[httpx.Client] = None
68+
self._max_retries = max_retries
4469

4570
@property
4671
def client(self) -> httpx.Client:
@@ -49,6 +74,21 @@ def client(self) -> httpx.Client:
4974
self._client = httpx.Client(timeout=self._timeout, headers=self._headers)
5075
return self._client
5176

77+
def _get(self, url: str, params: Optional[Dict] = None) -> httpx.Response:
    """Execute a GET request, retrying on rate limits and transport errors.

    The request is retried when the server answers 429 (surfaced as
    ``RateLimitError`` by ``_check_rate_limit``) or when httpx raises a
    ``TransportError``. Waits between attempts grow exponentially, bounded
    to the 1-60 second range, and each upcoming wait is logged at WARNING
    level.

    Args:
        url: URL to request.
        params: Optional query parameters forwarded to the httpx client.

    Returns:
        The first response whose status is not 429. Other error statuses
        are returned unchanged; callers decide whether to call
        ``raise_for_status()``.

    Raises:
        RateLimitError: If the last permitted attempt still returned 429.
        httpx.TransportError: If the last permitted attempt failed at the
            transport level.
    """
    # stop_after_attempt counts *total* attempts, including the first one,
    # so allowing ``max_retries`` retries needs ``max_retries + 1`` attempts.
    total_attempts = self._max_retries + 1

    @retry(
        stop=stop_after_attempt(total_attempts),
        wait=wait_exponential(multiplier=1, min=1.0, max=60.0),
        retry=retry_if_exception_type((RateLimitError, httpx.TransportError)),
        before_sleep=before_sleep_log(logger, logging.WARNING),
        reraise=True,  # surface the original exception, not tenacity's RetryError
    )
    def _request() -> httpx.Response:
        response = self.client.get(url, params=params)
        return _check_rate_limit(response)

    return _request()
91+
5292
def close(self) -> None:
5393
"""Close the HTTP client."""
5494
if self._client:
@@ -74,7 +114,7 @@ def get_posts(self, tags: str = "", page: int = 1) -> Tuple[List[Post], List[Tag
74114
offset = (page - 1) * self.posts_per_page
75115
params = {"page": "post", "s": "list", "tags": tags, "pid": offset}
76116

77-
response = self.client.get(self.base_url, params=params)
117+
response = self._get(self.base_url, params=params)
78118
response.raise_for_status()
79119

80120
html = response.text
@@ -96,14 +136,14 @@ def get_sidebar_tags(self, tags: str = "") -> List[Tag]:
96136
def get_post_details(self, post_id: int) -> Optional[PostDetails]:
97137
"""Fetch detailed info for a specific post."""
98138
params = {"page": "post", "s": "view", "id": post_id}
99-
response = self.client.get(self.base_url, params=params)
139+
response = self._get(self.base_url, params=params)
100140
response.raise_for_status()
101141
return PostDetailsParser.parse_html(response.text)
102142

103143
def get_user_profile(self, username: str) -> Optional[UserProfile]:
104144
"""Fetch user profile by username."""
105145
params = {"page": "account", "s": "profile", "uname": username}
106-
response = self.client.get(self.base_url, params=params)
146+
response = self._get(self.base_url, params=params)
107147
response.raise_for_status()
108148
return UserProfileParser.parse_html(response.text, self.base_url)
109149

@@ -170,6 +210,7 @@ def __init__(
170210
timeout: float = 30.0,
171211
posts_per_page: int = DEFAULT_POSTS_PER_PAGE,
172212
headers: Optional[Dict[str, str]] = None,
213+
max_retries: int = 5,
173214
):
174215
"""Initialize the async client.
175216
@@ -178,12 +219,14 @@ def __init__(
178219
timeout: Request timeout in seconds.
179220
posts_per_page: Number of posts per page.
180221
headers: Custom headers to use.
222+
max_retries: Maximum number of retry attempts for rate-limited requests.
181223
"""
182224
self.base_url = base_url.rstrip("/")
183225
self.posts_per_page = posts_per_page
184226
self._timeout = timeout
185227
self._headers = {**DEFAULT_HEADERS, **(headers or {})}
186228
self._client: Optional[httpx.AsyncClient] = None
229+
self._max_retries = max_retries
187230

188231
@property
189232
def client(self) -> httpx.AsyncClient:
@@ -192,6 +235,21 @@ def client(self) -> httpx.AsyncClient:
192235
self._client = httpx.AsyncClient(timeout=self._timeout, headers=self._headers)
193236
return self._client
194237

238+
async def _get(self, url: str, params: Optional[Dict] = None) -> httpx.Response:
    """Execute a GET request, retrying on rate limits and transport errors.

    The request is retried when the server answers 429 (surfaced as
    ``RateLimitError`` by ``_check_rate_limit``) or when httpx raises a
    ``TransportError``. Waits between attempts grow exponentially, bounded
    to the 1-60 second range, and each upcoming wait is logged at WARNING
    level.

    Args:
        url: URL to request.
        params: Optional query parameters forwarded to the httpx client.

    Returns:
        The first response whose status is not 429. Other error statuses
        are returned unchanged; callers decide whether to call
        ``raise_for_status()``.

    Raises:
        RateLimitError: If the last permitted attempt still returned 429.
        httpx.TransportError: If the last permitted attempt failed at the
            transport level.
    """
    # stop_after_attempt counts *total* attempts, including the first one,
    # so allowing ``max_retries`` retries needs ``max_retries + 1`` attempts.
    total_attempts = self._max_retries + 1

    @retry(
        stop=stop_after_attempt(total_attempts),
        wait=wait_exponential(multiplier=1, min=1.0, max=60.0),
        retry=retry_if_exception_type((RateLimitError, httpx.TransportError)),
        before_sleep=before_sleep_log(logger, logging.WARNING),
        reraise=True,  # surface the original exception, not tenacity's RetryError
    )
    async def _request() -> httpx.Response:
        response = await self.client.get(url, params=params)
        return _check_rate_limit(response)

    return await _request()
252+
195253
async def close(self) -> None:
196254
"""Close the HTTP client."""
197255
if self._client:
@@ -209,7 +267,7 @@ async def get_posts(self, tags: str = "", page: int = 1) -> Tuple[List[Post], Li
209267
offset = (page - 1) * self.posts_per_page
210268
params = {"page": "post", "s": "list", "tags": tags, "pid": offset}
211269

212-
response = await self.client.get(self.base_url, params=params)
270+
response = await self._get(self.base_url, params=params)
213271
response.raise_for_status()
214272

215273
html = response.text
@@ -226,14 +284,14 @@ async def search(self, tags: str, page: int = 1) -> List[Post]:
226284
async def get_post_details(self, post_id: int) -> Optional[PostDetails]:
227285
"""Fetch detailed info for a specific post."""
228286
params = {"page": "post", "s": "view", "id": post_id}
229-
response = await self.client.get(self.base_url, params=params)
287+
response = await self._get(self.base_url, params=params)
230288
response.raise_for_status()
231289
return PostDetailsParser.parse_html(response.text)
232290

233291
async def get_user_profile(self, username: str) -> Optional[UserProfile]:
234292
"""Fetch user profile by username."""
235293
params = {"page": "account", "s": "profile", "uname": username}
236-
response = await self.client.get(self.base_url, params=params)
294+
response = await self._get(self.base_url, params=params)
237295
response.raise_for_status()
238296
return UserProfileParser.parse_html(response.text, self.base_url)
239297

test_output.txt

3.53 KB
Binary file not shown.

tests/test_client.py

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
"""Integration tests against the live Rule34 API."""
2+
3+
import pytest
4+
from rule34scraper.client import AsyncR34Client, R34Client
5+
6+
7+
class TestSyncClient:
    """Exercise the blocking ``R34Client`` end to end."""

    def test_get_posts_returns_results(self):
        """A listing request yields both posts and tags."""
        with R34Client() as api:
            listing, sidebar = api.get_posts(tags="solo", page=1)

            assert len(listing) > 0
            assert len(sidebar) > 0

    def test_post_has_required_fields(self):
        """The first listed post carries every mandatory field."""
        allowed_ratings = ("safe", "questionable", "explicit")
        with R34Client() as api:
            listing, _ = api.get_posts(tags="solo", page=1)
            first = listing[0]

            assert first.id > 0
            assert first.preview_url.startswith("http")
            assert len(first.tags) > 0
            assert first.rating in allowed_ratings
            assert first.detail_url.startswith("http")

    def test_search_returns_posts(self):
        """Searching by tag produces at least one post."""
        with R34Client() as api:
            hits = api.search(tags="1girl", page=1)

            assert len(hits) > 0

    def test_get_post_details(self):
        """Details fetched for a listed post describe that same post."""
        allowed_ratings = ("safe", "questionable", "explicit")
        with R34Client() as api:
            listing, _ = api.get_posts(tags="solo", page=1)
            wanted_id = listing[0].id

            info = api.get_post_details(wanted_id)

            assert info is not None
            assert info.id == wanted_id
            assert info.image_url.startswith("http")
            assert info.width > 0
            assert info.height > 0
            assert info.rating in allowed_ratings

    def test_get_sidebar_tags(self):
        """The first sidebar tag exposes a name, a count and a type."""
        with R34Client() as api:
            sidebar = api.get_sidebar_tags(tags="solo")

            assert len(sidebar) > 0
            first = sidebar[0]
            assert first.name
            assert first.count >= 0
            assert first.type

    def test_pagination_returns_different_posts(self):
        """Pages one and two do not contain the same set of post ids."""
        with R34Client() as api:
            first_page, _ = api.get_posts(tags="solo", page=1)
            second_page, _ = api.get_posts(tags="solo", page=2)

            first_ids = {post.id for post in first_page}
            second_ids = {post.id for post in second_page}

            assert first_ids != second_ids
73+
74+
75+
@pytest.mark.asyncio
class TestAsyncClient:
    """Exercise the coroutine-based ``AsyncR34Client`` end to end."""

    async def test_get_posts_returns_results(self):
        """A listing request yields both posts and tags."""
        async with AsyncR34Client() as api:
            listing, sidebar = await api.get_posts(tags="solo", page=1)

            assert len(listing) > 0
            assert len(sidebar) > 0

    async def test_post_has_required_fields(self):
        """The first listed post carries every mandatory field."""
        allowed_ratings = ("safe", "questionable", "explicit")
        async with AsyncR34Client() as api:
            listing, _ = await api.get_posts(tags="solo", page=1)
            first = listing[0]

            assert first.id > 0
            assert first.preview_url.startswith("http")
            assert len(first.tags) > 0
            assert first.rating in allowed_ratings
            assert first.detail_url.startswith("http")

    async def test_search_returns_posts(self):
        """Searching by tag produces at least one post."""
        async with AsyncR34Client() as api:
            hits = await api.search(tags="1girl", page=1)

            assert len(hits) > 0

    async def test_get_post_details(self):
        """Details fetched for a listed post describe that same post."""
        allowed_ratings = ("safe", "questionable", "explicit")
        async with AsyncR34Client() as api:
            listing, _ = await api.get_posts(tags="solo", page=1)
            wanted_id = listing[0].id

            info = await api.get_post_details(wanted_id)

            assert info is not None
            assert info.id == wanted_id
            assert info.image_url.startswith("http")
            assert info.width > 0
            assert info.height > 0
            assert info.rating in allowed_ratings

    async def test_get_sidebar_tags(self):
        """The first tag returned with a listing has a name, count and type."""
        async with AsyncR34Client() as api:
            _listing, sidebar = await api.get_posts(tags="solo", page=1)

            assert len(sidebar) > 0
            first = sidebar[0]
            assert first.name
            assert first.count >= 0
            assert first.type

    async def test_pagination_returns_different_posts(self):
        """Pages one and two do not contain the same set of post ids."""
        async with AsyncR34Client() as api:
            first_page, _ = await api.get_posts(tags="solo", page=1)
            second_page, _ = await api.get_posts(tags="solo", page=2)

            first_ids = {post.id for post in first_page}
            second_ids = {post.id for post in second_page}

            assert first_ids != second_ids

0 commit comments

Comments
 (0)