Skip to content

Commit 5a6c333

Browse files
committed
feat: users
1 parent 2221233 commit 5a6c333

7 files changed

Lines changed: 122 additions & 93 deletions

File tree

LICENSE

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
MIT License
22

3-
Copyright (c) 2024
3+
Copyright (c) 2026
44

55
Permission is hereby granted, free of charge, to any person obtaining a copy
66
of this software and associated documentation files (the "Software"), to deal

README.md

Lines changed: 57 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# rule34scraper
22

3-
A fast Python API wrapper for booru-style image boards using selectolax (Lexbor engine).
3+
A high-performance Python API wrapper for **Rule34.xxx** (and other booru-style image boards). Built for speed and reliability, it uses `selectolax` (Lexbor engine) for lightning-fast parsing and `httpx` for both synchronous and asynchronous requests.
44

55
## Installation
66

@@ -10,90 +10,96 @@ pip install rule34scraper
1010

1111
## Usage
1212

13-
### Basic Usage
13+
### Basic Searching
14+
15+
`get_posts` returns a tuple: a list of `Post` objects from a search result page, plus the accompanying tags.
1416

1517
```python
1618
from rule34scraper import R34Client
1719

1820
with R34Client() as client:
19-
# Search posts by tags
20-
posts, tags = client.get_posts(tags="landscape", page=1)
21+
# Search posts by tags (landscape, highres)
22+
posts, tags = client.get_posts(tags="landscape highres", page=1)
2123

2224
for post in posts:
23-
print(f"ID: {post.id}, Score: {post.score}, Rating: {post.rating}")
24-
25-
# Get post details
26-
details = client.get_post_details(posts[0].id)
27-
print(f"Image: {details.image_url}")
28-
print(f"Size: {details.width}x{details.height}")
29-
30-
# Download image
31-
client.download_post(details, directory="downloads/")
25+
print(f"ID: {post.id} | Score: {post.score} | Rating: {post.rating}")
3226
```
3327

34-
### Custom Base URL
28+
### Detailed Metadata (and Creator info)
3529

36-
```python
37-
from rule34scraper import R34Client
30+
Search results provide basic info. For full metadata (including the **creator** name, high-res URLs, and comments), use `get_post_details`.
3831

39-
# Use a different booru site
40-
client = R34Client(
41-
base_url="https://example.com/index.php",
42-
posts_per_page=42,
43-
timeout=60.0,
44-
)
45-
46-
# Custom headers
47-
client = R34Client(
48-
base_url="https://example.com/index.php",
49-
headers={"Cookie": "session=abc123"},
50-
)
32+
```python
33+
with R34Client() as client:
34+
posts, _ = client.get_posts(tags="fantasy")
35+
36+
# Get deep details for a specific post
37+
details = client.get_post_details(posts[0].id)
38+
39+
print(f"Post #{details.id} | Creator: {details.creator.name}")
40+
print(f"Full Image: {details.image_url} ({details.width}x{details.height})")
41+
print(f"Tags: {', '.join([t.name for t in details.tags[:5]])}")
5142
```
5243

53-
### Async Client
44+
### Async Usage
45+
46+
Ideal for high-throughput applications.
5447

5548
```python
5649
import asyncio
5750
from rule34scraper import AsyncR34Client
5851

5952
async def main():
6053
async with AsyncR34Client() as client:
61-
posts, tags = await client.get_posts(tags="portrait", page=1)
62-
details = await client.get_post_details(posts[0].id)
63-
print(f"Image: {details.image_url}")
54+
posts, _ = await client.get_posts(tags="portrait")
55+
if posts:
56+
details = await client.get_post_details(posts[0].id)
57+
print(f"Async Detail Result: {details.id}")
6458

6559
asyncio.run(main())
6660
```
6761

68-
### User Profiles
62+
### User Profiles & Favorites
6963

7064
```python
7165
with R34Client() as client:
7266
profile = client.get_user_profile("username")
73-
print(f"User: {profile.username} (ID: {profile.id})")
74-
print(f"Level: {profile.level}")
75-
print(f"Posts: {profile.post_count}")
67+
print(f"User: {profile.username} | Level: {profile.level}")
7668
print(f"Favorites: {profile.favorite_count}")
69+
70+
# Access recent uploads or favorite posts
71+
for fav in profile.recent_favorites[:5]:
72+
print(f"Favorite Post: {fav.detail_url}")
7773
```
7874

79-
## Models
75+
### Downloading Media
76+
77+
```python
78+
with R34Client() as client:
79+
details = client.get_post_details(123456)
80+
# Automatically handles file naming and subdirectory creation
81+
filepath = client.download_post(details, directory="my_collection")
82+
print(f"Saved to: {filepath}")
83+
```
8084

81-
- `Post` - Thumbnail entry from search results
82-
- `PostDetails` - Full post metadata (image URL, dimensions, tags, comments)
83-
- `Tag` - Tag with name, count, and type
84-
- `PostComment` - User comment on a post
85-
- `UserProfile` - User profile with stats and recent posts
85+
## Configuration
8686

87-
## Configuration Options
87+
| Option | Type | Default | Description |
88+
|--------|------|---------|-------------|
89+
| `base_url` | `str` | `https://rule34.xxx` | Base domain for requests. |
90+
| `timeout` | `float` | `30.0` | Request timeout in seconds. |
91+
| `posts_per_page` | `int` | `42` | Default count for pagination. |
92+
| `max_retries` | `int` | `5` | Retry attempts on rate limiting. |
93+
| `headers` | `dict` | *Browser-like* | Custom User-Agent or Cookies. |
94+
95+
## Models
8896

89-
| Parameter | Type | Default | Description |
90-
|-----------|------|---------|-------------|
91-
| `base_url` | str | `https://rule34.xxx/index.php` | Base URL for the API |
92-
| `timeout` | float | 30.0 | Request timeout in seconds |
93-
| `posts_per_page` | int | 42 | Posts per page for pagination |
94-
| `headers` | dict | Browser-like headers | Custom HTTP headers |
95-
| `max_retries` | int | 5 | Max retries for rate limits (429) |
97+
- **Post**: Basic entry from listings (id, preview_url, tags, score, rating).
98+
- **PostDetails**: Full data from the post page (creator, image_url, sample_url, dimensions, comments).
99+
- **Tag**: Tag entry with `name`, `count`, and `type` (e.g., character, artist).
100+
- **UserProfile**: User stats, join date, and lists of recent uploads/favorites.
101+
- **PostComment**: Comment on a post with user, text, and timestamp.
96102

97103
## License
98104

99-
MIT
105+
MIT - See the [LICENSE](LICENSE) file for details.

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
44

55
[project]
66
name = "rule34scraper"
7-
version = "1.0.2"
7+
version = "1.0.3"
88
description = "A fast Python API wrapper for rule34 image board"
99
readme = "README.md"
1010
license = "MIT"
@@ -27,6 +27,7 @@ dependencies = [
2727
"httpx>=0.24.0",
2828
"selectolax>=0.3.0",
2929
"tenacity>=8.2.0",
30+
"aiofiles>=23.2.0",
3031
]
3132

3233
[project.optional-dependencies]

rule34scraper/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,4 +25,4 @@
2525
"CommentParser",
2626
"UserProfileParser",
2727
]
28-
__version__ = "0.1.0"
28+
__version__ = "1.0.3"

rule34scraper/client.py

Lines changed: 18 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,10 @@
55
from typing import Dict, List, Optional, Tuple, Union
66

77
import httpx
8+
import aiofiles
89
from tenacity import (
9-
retry,
10+
Retrying,
11+
AsyncRetrying,
1012
stop_after_attempt,
1113
wait_exponential,
1214
retry_if_exception_type,
@@ -76,18 +78,17 @@ def client(self) -> httpx.Client:
7678

7779
def _get(self, url: str, params: Dict = None) -> httpx.Response:
7880
"""Execute GET request with retry logic for rate limits."""
79-
@retry(
81+
for attempt in Retrying(
8082
stop=stop_after_attempt(self._max_retries),
8183
wait=wait_exponential(multiplier=1, min=1.0, max=60.0),
8284
retry=retry_if_exception_type((RateLimitError, httpx.TransportError)),
8385
before_sleep=before_sleep_log(logger, logging.WARNING),
8486
reraise=True,
85-
)
86-
def _request() -> httpx.Response:
87-
response = self.client.get(url, params=params)
88-
return _check_rate_limit(response)
87+
):
88+
with attempt:
89+
response = self.client.get(url, params=params)
90+
return _check_rate_limit(response)
8991

90-
return _request()
9192

9293
def close(self) -> None:
9394
"""Close the HTTP client."""
@@ -237,18 +238,19 @@ def client(self) -> httpx.AsyncClient:
237238

238239
async def _get(self, url: str, params: Dict = None) -> httpx.Response:
239240
"""Execute GET request with retry logic for rate limits."""
240-
@retry(
241+
async for attempt in AsyncRetrying(
241242
stop=stop_after_attempt(self._max_retries),
242243
wait=wait_exponential(multiplier=1, min=1.0, max=60.0),
243244
retry=retry_if_exception_type((RateLimitError, httpx.TransportError)),
244245
before_sleep=before_sleep_log(logger, logging.WARNING),
245246
reraise=True,
246-
)
247-
async def _request() -> httpx.Response:
248-
response = await self.client.get(url, params=params)
249-
return _check_rate_limit(response)
250-
251-
return await _request()
247+
):
248+
with attempt:
249+
response = await self.client.get(url, params=params)
250+
return _check_rate_limit(response)
251+
252+
# Unreachable in practice: every attempt either returns a response or, once retries are exhausted, tenacity raises (reraise=True re-raises the last error)
253+
raise RuntimeError("Unreachable code reached")
252254

253255
async def close(self) -> None:
254256
"""Close the HTTP client."""
@@ -314,9 +316,9 @@ async def download(
314316
async with httpx.AsyncClient(headers=download_headers, follow_redirects=True, timeout=60.0) as client:
315317
async with client.stream("GET", url) as response:
316318
response.raise_for_status()
317-
with open(path, "wb") as f:
319+
async with aiofiles.open(path, "wb") as f:
318320
async for chunk in response.aiter_bytes(chunk_size=chunk_size):
319-
f.write(chunk)
321+
await f.write(chunk)
320322

321323
return path
322324

rule34scraper/models.py

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,6 @@
22
from typing import List, Optional
33

44

5-
@dataclass
6-
class Post:
7-
id: int
8-
preview_url: str
9-
tags: List[str]
10-
score: int
11-
rating: str
12-
detail_url: str
13-
is_video: bool = False
14-
15-
165
@dataclass
176
class Tag:
187
name: str
@@ -29,6 +18,24 @@ class PostComment:
2918
timestamp: str
3019

3120

21+
22+
@dataclass
23+
class User:
24+
name: str
25+
id: Optional[int] = None
26+
27+
28+
@dataclass
29+
class Post:
30+
id: int
31+
preview_url: str
32+
tags: List[str]
33+
score: int
34+
rating: str
35+
detail_url: str
36+
is_video: bool = False
37+
38+
3239
@dataclass
3340
class PostDetails:
3441
id: int
@@ -38,7 +45,7 @@ class PostDetails:
3845
height: int
3946
rating: str
4047
score: int
41-
uploader: str
48+
creator: User
4249
posted_at: str
4350
source_url: Optional[str]
4451
tags: List[Tag]

rule34scraper/parser.py

Lines changed: 24 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,14 @@
22
from typing import List, Optional, Tuple
33
from selectolax.lexbor import LexborHTMLParser, LexborNode
44

5-
from .models import Post, Tag, PostComment, PostDetails, UserProfile
5+
from .models import Post, Tag, PostComment, PostDetails, UserProfile, User
66

77
DEFAULT_BASE_URL = "https://rule34.xxx"
88

99

1010
class PostParser:
11-
SCORE_PATTERN = re.compile(r"score:(\d+)")
12-
RATING_PATTERN = re.compile(r"rating:(\w+)")
11+
SCORE_PATTERN = re.compile(r"score:(\d+)", re.IGNORECASE)
12+
RATING_PATTERN = re.compile(r"rating:(\w+)", re.IGNORECASE)
1313

1414
@classmethod
1515
def parse_html(cls, html: str, base_url: str = DEFAULT_BASE_URL) -> List[Post]:
@@ -40,11 +40,14 @@ def _parse_thumb(cls, thumb: LexborNode, base_url: str = DEFAULT_BASE_URL) -> Op
4040
preview_url = img_attrs.get("src", "")
4141

4242
alt_text = img_attrs.get("alt", "")
43-
tags = alt_text.split() if alt_text else []
43+
title_text = img_attrs.get("title", "")
44+
# Combine both for metadata extraction as R34 is inconsistent
45+
metadata_source = f"{alt_text} {title_text}"
4446

45-
title = img_attrs.get("title", "")
46-
score = cls._extract_score(title)
47-
rating = cls._extract_rating(title)
47+
tags = alt_text.split() if alt_text else []
48+
49+
score = cls._extract_score(metadata_source)
50+
rating = cls._extract_rating(metadata_source)
4851

4952
href = attrs.get("href", "")
5053
if href.startswith("http"):
@@ -123,7 +126,8 @@ def _parse_tag_item(cls, li: LexborNode) -> Optional[Tag]:
123126

124127
class PostDetailsParser:
125128
ID_PATTERN = re.compile(r"Id:\s*(\d+)")
126-
POSTED_PATTERN = re.compile(r"Posted:\s*(.*?)\s*by")
129+
POSTED_PATTERN = re.compile(r"Posted:\s*(.*?)(?:\s*by|$)", re.IGNORECASE)
130+
UPLOADER_PATTERN = re.compile(r"by\s+(.*)$", re.IGNORECASE)
127131
IMAGE_JS_PATTERN = re.compile(r"image\s*=\s*(\{[^}]+\})")
128132
WIDTH_PATTERN = re.compile(r"['\"]?width['\"]?\s*:\s*(\d+)")
129133
HEIGHT_PATTERN = re.compile(r"['\"]?height['\"]?\s*:\s*(\d+)")
@@ -146,7 +150,7 @@ def parse_html(cls, html: str) -> Optional[PostDetails]:
146150
post_id = 0
147151
rating = "unknown"
148152
score = 0
149-
uploader = ""
153+
uploader_name = "unknown"
150154
posted_at = ""
151155
source_url = None
152156

@@ -173,9 +177,18 @@ def parse_html(cls, html: str) -> Optional[PostDetails]:
173177
elif "Posted:" in text:
174178
match = cls.POSTED_PATTERN.search(text)
175179
posted_at = match.group(1).strip() if match else ""
180+
181+
# Try to get uploader from link first
176182
uploader_link = li.css_first("a")
177183
if uploader_link:
178-
uploader = uploader_link.text(strip=True)
184+
uploader_name = uploader_link.text(strip=True)
185+
else:
186+
# Fallback to text parsing (for Anonymous)
187+
up_match = cls.UPLOADER_PATTERN.search(text)
188+
if up_match:
189+
uploader_name = up_match.group(1).strip()
190+
elif "by " in text:
191+
uploader_name = text.split("by ")[-1].strip()
179192

180193
elif "Source:" in text:
181194
source_link = li.css_first("a")
@@ -193,7 +206,7 @@ def parse_html(cls, html: str) -> Optional[PostDetails]:
193206
height=height,
194207
rating=rating,
195208
score=score,
196-
uploader=uploader,
209+
creator=User(name=uploader_name),
197210
posted_at=posted_at,
198211
source_url=source_url,
199212
tags=tags,

0 commit comments

Comments
 (0)