Skip to content

Commit fe56c62

Browse files
authored
Merge-build: v2.10.1-beta
2 parents c10c7b9 + fed2f7e commit fe56c62

77 files changed

Lines changed: 5361 additions & 2311 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.github/FUNDING.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,3 @@
11
github: jasoneri
22
ko_fi: jsoneri
3-
custom: ["https://paypal.me/jsoneri"]
3+
custom: ["https://paypal.me/jsoneri","https://app.unifans.io/c/jsoneri"]

.github/workflows/hitomi-db.yml

Lines changed: 11 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,8 @@ jobs:
2828

2929
- name: Install uv
3030
uses: astral-sh/setup-uv@v7
31-
31+
- name: Install Qt runtime libs for smoke test
32+
uses: tlambert03/setup-qt-libs@v1
3233
- name: Install dependencies
3334
run: uv sync
3435

@@ -37,7 +38,7 @@ jobs:
3738
continue-on-error: true
3839
run: |
3940
uv run python -m utils.website.hitomi.scape_dataset \
40-
--db-path assets/hitomi.db \
41+
--db-path __temp/hitomi.db \
4142
--workers 2 \
4243
--max-retries 5 \
4344
--timeout 15
@@ -47,20 +48,20 @@ jobs:
4748
if: steps.scrape.outcome == 'failure'
4849
run: |
4950
uv run python -m utils.website.hitomi.scape_dataset \
50-
--db-path assets/hitomi.db \
51+
--db-path __temp/hitomi.db \
5152
--workers 1 \
5253
--max-retries 8 \
5354
--timeout 20 \
5455
--only-failed
5556
5657
- name: Quality gate
57-
run: uv run python .github/scripts/hitomi_db_gate.py assets/hitomi.db
58+
run: uv run python .github/scripts/hitomi_db_gate.py __temp/hitomi.db
5859

5960
- name: Generate manifest
6061
run: |
6162
uv run python .github/scripts/generate_hitomi_manifest.py \
62-
assets/hitomi.db \
63-
--output assets/hitomi-manifest.json
63+
__temp/hitomi.db \
64+
--output __temp/hitomi-manifest.json
6465
6566
- name: Upload to release
6667
env:
@@ -74,13 +75,13 @@ jobs:
7475
gh release delete-asset "$TAG" hitomi.db --yes 2>/dev/null || true
7576
gh release delete-asset "$TAG" hitomi-manifest.json --yes 2>/dev/null || true
7677
gh release upload "$TAG" \
77-
assets/hitomi.db \
78-
assets/hitomi-manifest.json
78+
__temp/hitomi.db \
79+
__temp/hitomi-manifest.json
7980
else
8081
gh release create "$TAG" \
8182
--title "Preset Assets" \
8283
--notes "Auto-managed preset assets (hitomi.db, etc.)" \
8384
--latest=false \
84-
assets/hitomi.db \
85-
assets/hitomi-manifest.json
85+
__temp/hitomi.db \
86+
__temp/hitomi-manifest.json
8687
fi

ComicSpider/spiders/basecomicspider.py

Lines changed: 6 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -234,19 +234,18 @@ def _process_episode(self, ep: Episode):
234234
callback = self.parse_fin_page if self._enable_episode_dispatch else self.parse_section
235235
yield scrapy.Request(
236236
url=final_url,
237-
callback=callback,
238-
headers={**self.ua, 'Referer': self.request_referer(final_url)},
239-
meta=meta,
240-
dont_filter=True,
237+
callback=callback, headers={**self.ua, 'Referer': self.request_referer(final_url)},
238+
meta=meta, dont_filter=True,
241239
)
242240

243241
def _process_book(self, book: BookInfo):
244242
url = book.url if book.url and book.url.startswith("http") else self.book_id_url % book.id
245243
final_url = self.transfer_url(url)
246244
yield scrapy.Request(
247-
url=final_url, callback=self.parse_section,
248-
headers={**self.ua, 'Referer': self.request_referer(final_url)},
249-
meta={'book': book}, dont_filter=True)
245+
url=final_url,
246+
callback=self.parse_section, headers={**self.ua, 'Referer': self.request_referer(final_url)},
247+
meta={'book': book}, dont_filter=True,
248+
)
250249

251250
def start_requests(self):
252251
self.preready()

ComicSpider/spiders/ehentai.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -24,8 +24,8 @@ class EHentaiSpider(BaseComicSpider3):
2424
domain = domain
2525
search_url_head = f'https://{domain}/?f_search='
2626
mappings = {
27-
res.EHentai.MAPPINGS_INDEX: f'https://{domain}',
28-
res.EHentai.MAPPINGS_POPULAR: f'https://{domain}/popular'
27+
res.SPIDER.Completer.index: f'https://{domain}',
28+
res.SPIDER.Completer.popular: f'https://{domain}/popular'
2929
}
3030
frame_book_format = ['title', 'book_pages', 'preview_url'] # , 'book_idx']
3131
turn_page_info = (r"page=\d+",)

ComicSpider/spiders/jestful.py

Lines changed: 7 additions & 55 deletions
Original file line number | Diff line number | Diff line change
@@ -7,38 +7,13 @@
77

88
class JestfulSpider(BaseComicSpider):
99
name = "jestful"
10-
ua = JestfulUtils.ua
1110
image_ua = JestfulUtils.image_ua
12-
domain = JestfulUtils.domain
1311
custom_settings = {
1412
"DOWNLOADER_MIDDLEWARES": {
15-
"ComicSpider.middlewares.UAMiddleware": 5,
1613
"ComicSpider.middlewares.RefererMiddleware": 10,
1714
"ComicSpider.middlewares.FakeMiddleware": 30,
1815
}
1916
}
20-
_enable_episode_dispatch = True
21-
22-
def frame_section(self, response):
23-
reqer = self.spider_site_runtime.reqer
24-
parser = self.spider_site_runtime.parser
25-
book = response.meta.get("book")
26-
if book is None:
27-
raise ValueError("jestful frame_section requires response.meta['book']")
28-
owner_state = parser.parse_book_owner_state(response.text, owner_url=response.url)
29-
chapter_url = reqer.tokenized_url(
30-
reqer.listing_url(owner_state["loader_slug"]), domain=self.domain
31-
)
32-
chapter_resp = reqer.cli.get(
33-
chapter_url,
34-
headers=reqer.headers(referer=response.url),
35-
follow_redirects=True,
36-
timeout=12,
37-
)
38-
chapter_resp.raise_for_status()
39-
episodes = parser.parse_episodes_from_list_html(chapter_resp.text, book, domain=self.domain)
40-
frame_results = {ep.idx: ep for ep in episodes}
41-
return self.say.frame_section_print(frame_results)
4217

4318
def _build_episode_items(self, ep, page_urls, *, chapter_referer):
4419
book = ep.from_book
@@ -64,41 +39,18 @@ def _yield_episode_items(self, ep, page_urls, *, chapter_referer):
6439
yield scrapy.Request(
6540
url=f'https://fakefakefa.com/{item["image_urls"][0]}',
6641
callback=self.process_item,
67-
meta={'item': item},
42+
meta={'item': item, 'referer': chapter_referer},
6843
dont_filter=True,
6944
)
7045
self._emit_process("fin")
7146

7247
def _process_episode(self, ep):
73-
if getattr(ep, "page_urls", None):
74-
chapter_referer = getattr(ep, "chapter_referer", None) or ep.url
75-
yield from self._yield_episode_items(ep, list(ep.page_urls), chapter_referer=chapter_referer)
76-
return
77-
yield from super()._process_episode(ep)
78-
79-
def parse_fin_page(self, response):
80-
parser = self.spider_site_runtime.parser
81-
reqer = self.spider_site_runtime.reqer
82-
ep = response.meta["ep"]
83-
chapter_referer = response.url
84-
cid = parser.parse_chapter_image_cid(response.text, chapter_url=chapter_referer)
85-
iog_url = reqer.build_iog_url(cid, domain=self.domain)
86-
yield scrapy.Request(
87-
url=iog_url,
88-
callback=self.parse_iog_page,
89-
headers=reqer.build_iog_headers(referer=chapter_referer),
90-
meta={"ep": ep, "chapter_referer": chapter_referer},
91-
dont_filter=True,
92-
)
93-
94-
def parse_iog_page(self, response):
95-
parser = self.spider_site_runtime.parser
96-
ep = response.meta["ep"]
97-
chapter_referer = response.meta.get("chapter_referer") or ep.url
98-
page_urls = parser.parse_iog_image_urls(response.text, request_url=response.url)
99-
for item in self._build_episode_items(ep, page_urls, chapter_referer=chapter_referer):
100-
yield item
101-
self._emit_process("fin")
48+
page_urls = list(getattr(ep, "page_urls", None) or [])
49+
chapter_referer = getattr(ep, "chapter_referer", None) or ep.url
50+
if not page_urls or not chapter_referer:
51+
missing = "page_urls" if not page_urls else "chapter_referer"
52+
raise ValueError(f"jestful episode requires {missing}: {ep!r}")
53+
yield from self._yield_episode_items(ep, page_urls, chapter_referer=chapter_referer)
10254

10355
def image_request_meta(self, *, url, item):
10456
referer = getattr(self, "_chapter_referers", {}).get(item.get("uuid_md5"))

ComicSpider/spiders/kaobei.py

Lines changed: 4 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -1,12 +1,9 @@
11
# -*- coding: utf-8 -*-
2-
import re
3-
42
import scrapy
53

6-
from utils.processed_class import Url
74
from utils.website import KaobeiUtils
85
from utils.website.schema import KbFrameBook as FrameBook
9-
from .basecomicspider import BaseComicSpider, ComicspiderItem, conf
6+
from .basecomicspider import BaseComicSpider, ComicspiderItem
107

118

129
class KaobeiSpider(BaseComicSpider):
@@ -28,26 +25,13 @@ class KaobeiSpider(BaseComicSpider):
2825
preset_book_frame = FrameBook(domain)
2926
turn_page_info = (r"offset=\d+", None, 30)
3027
section_limit = 300
31-
_enable_episode_dispatch = True
3228

3329
@classmethod
3430
def from_crawler(cls, crawler, *args, **kwargs):
3531
spider = super().from_crawler(crawler, *args, **kwargs)
3632
spider.spider_site_runtime.reqer.get_aes_key()
3733
return spider
3834

39-
def frame_section(self, response):
40-
book = response.meta.get("book")
41-
episodes = self.spider_site_runtime.parser.parse_episodes(
42-
response.json()['results'], book, url=response.url,
43-
show_dhb=conf.kbShowDhb,
44-
)
45-
frame_results = {ep.idx: ep for ep in episodes}
46-
self.say.frame_section_print(frame_results)
47-
48-
def mk_page_tasks(self, **kw):
49-
return [kw['url']]
50-
5135
def _build_episode_items(self, ep, page_urls):
5236
book = ep.from_book
5337
uid, u_md5 = ep.id_and_md5()
@@ -75,19 +59,9 @@ def _yield_episode_items(self, ep, page_urls):
7559
self._emit_process('fin')
7660

7761
def _process_episode(self, ep):
78-
if getattr(ep, 'page_urls', None):
79-
yield from self._yield_episode_items(ep, list(ep.page_urls))
80-
return
81-
yield from super()._process_episode(ep)
82-
83-
def parse_fin_page(self, response):
84-
ep = response.meta['ep']
85-
imageData = self.spider_site_runtime.parser.parse_page_urls_from_html(
86-
response.text, url=response.url,
87-
)
88-
for item in self._build_episode_items(ep, [url_item['url'] for url_item in imageData]):
89-
yield item
90-
self._emit_process('fin')
62+
if not getattr(ep, 'page_urls', None):
63+
raise ValueError(f"kaobei episode requires page_urls: {ep!r}")
64+
yield from self._yield_episode_items(ep, list(ep.page_urls))
9165

9266
def process_item(self, response):
9367
item = response.meta['item']

ComicSpider/spiders/mangabz.py

Lines changed: 24 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -1,50 +1,46 @@
11
# -*- coding: utf-8 -*-
22
import re
33

4-
from utils.website import MangabzUtils
5-
from utils.website.schema import MbBody as Body, MbSearchBody as SearchBody, mb_curr_time_format as curr_time_format
6-
from .basecomicspider import FormReqBaseComicSpider, ComicspiderItem
4+
import scrapy
75

8-
domain = MangabzUtils.domain
6+
from .basecomicspider import BaseComicSpider, ComicspiderItem
97

108

11-
class MangabzSpider(FormReqBaseComicSpider):
9+
class MangabzSpider(BaseComicSpider):
1210
name = 'mangabz'
13-
ua = MangabzUtils.ua
14-
num_of_row = 50
15-
domain = domain
1611
custom_settings = {
17-
"DOWNLOADER_MIDDLEWARES": {'ComicSpider.middlewares.MangabzUAMiddleware': 5,
18-
'ComicSpider.middlewares.ComicDlAllProxyMiddleware': 6},
12+
"DOWNLOADER_MIDDLEWARES": {'ComicSpider.middlewares.ComicDlAllProxyMiddleware': 6,
13+
'ComicSpider.middlewares.FakeMiddleware': 30},
1914
"ITEM_PIPELINES": {'ComicSpider.pipelines.MangabzComicPipeline': 50}
2015
}
21-
search_url_head = f"https://{domain}/pager.ashx"
22-
mappings = {"更新": ["manga-list-0-0-2", "2"],
23-
"人气": ["manga-list", "10"],
24-
}
25-
body = Body()
26-
_enable_episode_dispatch = True
2716

28-
def frame_section(self, response):
29-
book = response.meta.get("book")
30-
episodes = self.spider_site_runtime.parser.parse_episodes(response, book, domain)
31-
frame_results = {ep.idx: ep for ep in episodes}
32-
return self.say.frame_section_print(frame_results)
33-
34-
def parse_fin_page(self, response):
35-
ep = response.meta['ep']
17+
def _build_episode_items(self, ep, page_urls):
3618
book = ep.from_book
3719
uid, u_md5 = ep.id_and_md5()
38-
img_list = self.spider_site_runtime.parser.parse_page_urls_from_html(response.text)
3920
group_infos = {'title':book.name,'section':ep.name,'uuid':uid,'uuid_md5':u_md5}
40-
ep.pages = len(img_list)
21+
ep.pages = len(page_urls)
4122
self.set_task(ep)
42-
for img_url in img_list:
23+
for idx, img_url in enumerate(page_urls, start=1):
4324
item = ComicspiderItem()
4425
item.update(**group_infos)
45-
page = int(re.search(r'/(\d+)_\d+\.', img_url).group(1))
26+
matched = re.search(r'/(\d+)_\d+\.', img_url)
27+
page = int(matched.group(1)) if matched else idx
4628
item['page'] = page
4729
item['image_urls'] = [img_url]
30+
if self.job_context:
31+
self.job_context.total += 1
4832
self.total += 1
4933
yield item
34+
35+
def _process_episode(self, ep):
36+
if not getattr(ep, 'page_urls', None):
37+
raise ValueError(f"mangabz episode requires page_urls: {ep!r}")
38+
for item in self._build_episode_items(ep, list(ep.page_urls)):
39+
yield scrapy.Request(
40+
url=f'https://fakefakefa.com/{item["image_urls"][0]}', callback=self.process_item,
41+
meta={'item': item}, dont_filter=True,
42+
)
5043
self._emit_process('fin')
44+
45+
def process_item(self, response):
46+
yield response.meta['item']

ComicSpider/spiders/manhuagui.py

Lines changed: 45 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,45 @@
1+
# -*- coding: utf-8 -*-
2+
import scrapy
3+
4+
from utils.website import ManhuaguiUtils
5+
from .basecomicspider import BaseComicSpider, ComicspiderItem
6+
7+
8+
class ManhuaguiSpider(BaseComicSpider):
9+
name = "manhuagui"
10+
image_ua = ManhuaguiUtils.image_ua
11+
custom_settings = {
12+
"DOWNLOADER_MIDDLEWARES": {
13+
"ComicSpider.middlewares.ComicDlAllProxyMiddleware": 6,
14+
"ComicSpider.middlewares.FakeMiddleware": 30,
15+
}
16+
}
17+
18+
def _build_episode_items(self, ep, page_urls):
19+
book = ep.from_book
20+
uid, u_md5 = ep.id_and_md5()
21+
group_infos = {"title": book.name, "section": ep.name, "uuid": uid, "uuid_md5": u_md5}
22+
ep.pages = len(page_urls)
23+
self.set_task(ep)
24+
for page, image_url in enumerate(page_urls, start=1):
25+
item = ComicspiderItem()
26+
item.update(**group_infos)
27+
item["page"] = page
28+
item["image_urls"] = [image_url]
29+
if self.job_context:
30+
self.job_context.total += 1
31+
self.total += 1
32+
yield item
33+
34+
def _process_episode(self, ep):
35+
if not getattr(ep, "page_urls", None):
36+
raise ValueError(f"manhuagui episode requires page_urls: {ep!r}")
37+
for item in self._build_episode_items(ep, list(ep.page_urls)):
38+
yield scrapy.Request(
39+
url=f'https://fakefakefa.com/{item["image_urls"][0]}', callback=self.process_item,
40+
meta={"item": item}, dont_filter=True,
41+
)
42+
self._emit_process("fin")
43+
44+
def process_item(self, response):
45+
yield response.meta["item"]

0 commit comments

Comments
 (0)