2 changes: 1 addition & 1 deletion .github/FUNDING.yml
@@ -1,3 +1,3 @@
 github: jasoneri
 ko_fi: jsoneri
-custom: ["https://paypal.me/jsoneri"]
+custom: ["https://paypal.me/jsoneri","https://app.unifans.io/c/jsoneri"]
21 changes: 11 additions & 10 deletions .github/workflows/hitomi-db.yml
@@ -28,7 +28,8 @@ jobs:
 
       - name: Install uv
         uses: astral-sh/setup-uv@v7
-
+      - name: Install Qt runtime libs for smoke test
+        uses: tlambert03/setup-qt-libs@v1
       - name: Install dependencies
         run: uv sync
 
@@ -37,7 +38,7 @@ jobs:
         continue-on-error: true
         run: |
           uv run python -m utils.website.hitomi.scape_dataset \
-            --db-path assets/hitomi.db \
+            --db-path __temp/hitomi.db \
             --workers 2 \
             --max-retries 5 \
             --timeout 15
@@ -47,20 +48,20 @@ jobs:
         if: steps.scrape.outcome == 'failure'
         run: |
           uv run python -m utils.website.hitomi.scape_dataset \
-            --db-path assets/hitomi.db \
+            --db-path __temp/hitomi.db \
             --workers 1 \
             --max-retries 8 \
             --timeout 20 \
             --only-failed
 
       - name: Quality gate
-        run: uv run python .github/scripts/hitomi_db_gate.py assets/hitomi.db
+        run: uv run python .github/scripts/hitomi_db_gate.py __temp/hitomi.db
 
       - name: Generate manifest
         run: |
           uv run python .github/scripts/generate_hitomi_manifest.py \
-            assets/hitomi.db \
-            --output assets/hitomi-manifest.json
+            __temp/hitomi.db \
+            --output __temp/hitomi-manifest.json
 
       - name: Upload to release
         env:
@@ -74,13 +75,13 @@ jobs:
             gh release delete-asset "$TAG" hitomi.db --yes 2>/dev/null || true
             gh release delete-asset "$TAG" hitomi-manifest.json --yes 2>/dev/null || true
             gh release upload "$TAG" \
-              assets/hitomi.db \
-              assets/hitomi-manifest.json
+              __temp/hitomi.db \
+              __temp/hitomi-manifest.json
           else
             gh release create "$TAG" \
               --title "Preset Assets" \
               --notes "Auto-managed preset assets (hitomi.db, etc.)" \
               --latest=false \
-              assets/hitomi.db \
-              assets/hitomi-manifest.json
+              __temp/hitomi.db \
+              __temp/hitomi-manifest.json
           fi
13 changes: 6 additions & 7 deletions ComicSpider/spiders/basecomicspider.py
@@ -234,19 +234,18 @@ def _process_episode(self, ep: Episode):
         callback = self.parse_fin_page if self._enable_episode_dispatch else self.parse_section
         yield scrapy.Request(
             url=final_url,
-            callback=callback,
-            headers={**self.ua, 'Referer': self.request_referer(final_url)},
-            meta=meta,
-            dont_filter=True,
+            callback=callback, headers={**self.ua, 'Referer': self.request_referer(final_url)},
+            meta=meta, dont_filter=True,
         )
 
     def _process_book(self, book: BookInfo):
         url = book.url if book.url and book.url.startswith("http") else self.book_id_url % book.id
         final_url = self.transfer_url(url)
         yield scrapy.Request(
-            url=final_url, callback=self.parse_section,
-            headers={**self.ua, 'Referer': self.request_referer(final_url)},
-            meta={'book': book}, dont_filter=True)
+            url=final_url,
+            callback=self.parse_section, headers={**self.ua, 'Referer': self.request_referer(final_url)},
+            meta={'book': book}, dont_filter=True,
+        )
 
     def start_requests(self):
         self.preready()
4 changes: 2 additions & 2 deletions ComicSpider/spiders/ehentai.py
@@ -24,8 +24,8 @@ class EHentaiSpider(BaseComicSpider3):
     domain = domain
     search_url_head = f'https://{domain}/?f_search='
     mappings = {
-        res.EHentai.MAPPINGS_INDEX: f'https://{domain}',
-        res.EHentai.MAPPINGS_POPULAR: f'https://{domain}/popular'
+        res.SPIDER.Completer.index: f'https://{domain}',
+        res.SPIDER.Completer.popular: f'https://{domain}/popular'
     }
     frame_book_format = ['title', 'book_pages', 'preview_url']  # , 'book_idx']
     turn_page_info = (r"page=\d+",)
62 changes: 7 additions & 55 deletions ComicSpider/spiders/jestful.py
@@ -7,38 +7,13 @@
 
 class JestfulSpider(BaseComicSpider):
     name = "jestful"
-    ua = JestfulUtils.ua
     image_ua = JestfulUtils.image_ua
     domain = JestfulUtils.domain
     custom_settings = {
         "DOWNLOADER_MIDDLEWARES": {
-            "ComicSpider.middlewares.UAMiddleware": 5,
-            "ComicSpider.middlewares.RefererMiddleware": 10,
             "ComicSpider.middlewares.FakeMiddleware": 30,
         }
     }
-    _enable_episode_dispatch = True
 
-    def frame_section(self, response):
-        reqer = self.spider_site_runtime.reqer
-        parser = self.spider_site_runtime.parser
-        book = response.meta.get("book")
-        if book is None:
-            raise ValueError("jestful frame_section requires response.meta['book']")
-        owner_state = parser.parse_book_owner_state(response.text, owner_url=response.url)
-        chapter_url = reqer.tokenized_url(
-            reqer.listing_url(owner_state["loader_slug"]), domain=self.domain
-        )
-        chapter_resp = reqer.cli.get(
-            chapter_url,
-            headers=reqer.headers(referer=response.url),
-            follow_redirects=True,
-            timeout=12,
-        )
-        chapter_resp.raise_for_status()
-        episodes = parser.parse_episodes_from_list_html(chapter_resp.text, book, domain=self.domain)
-        frame_results = {ep.idx: ep for ep in episodes}
-        return self.say.frame_section_print(frame_results)
-
     def _build_episode_items(self, ep, page_urls, *, chapter_referer):
         book = ep.from_book
@@ -64,41 +39,18 @@ def _yield_episode_items(self, ep, page_urls, *, chapter_referer):
             yield scrapy.Request(
                 url=f'https://fakefakefa.com/{item["image_urls"][0]}',
                 callback=self.process_item,
-                meta={'item': item},
+                meta={'item': item, 'referer': chapter_referer},
                 dont_filter=True,
             )
         self._emit_process("fin")
 
     def _process_episode(self, ep):
-        if getattr(ep, "page_urls", None):
-            chapter_referer = getattr(ep, "chapter_referer", None) or ep.url
-            yield from self._yield_episode_items(ep, list(ep.page_urls), chapter_referer=chapter_referer)
-            return
-        yield from super()._process_episode(ep)
-
-    def parse_fin_page(self, response):
-        parser = self.spider_site_runtime.parser
-        reqer = self.spider_site_runtime.reqer
-        ep = response.meta["ep"]
-        chapter_referer = response.url
-        cid = parser.parse_chapter_image_cid(response.text, chapter_url=chapter_referer)
-        iog_url = reqer.build_iog_url(cid, domain=self.domain)
-        yield scrapy.Request(
-            url=iog_url,
-            callback=self.parse_iog_page,
-            headers=reqer.build_iog_headers(referer=chapter_referer),
-            meta={"ep": ep, "chapter_referer": chapter_referer},
-            dont_filter=True,
-        )
-
-    def parse_iog_page(self, response):
-        parser = self.spider_site_runtime.parser
-        ep = response.meta["ep"]
-        chapter_referer = response.meta.get("chapter_referer") or ep.url
-        page_urls = parser.parse_iog_image_urls(response.text, request_url=response.url)
-        for item in self._build_episode_items(ep, page_urls, chapter_referer=chapter_referer):
-            yield item
-        self._emit_process("fin")
+        page_urls = list(getattr(ep, "page_urls", None) or [])
+        chapter_referer = getattr(ep, "chapter_referer", None) or ep.url
+        if not page_urls or not chapter_referer:
+            missing = "page_urls" if not page_urls else "chapter_referer"
+            raise ValueError(f"jestful episode requires {missing}: {ep!r}")
+        yield from self._yield_episode_items(ep, page_urls, chapter_referer=chapter_referer)
 
     def image_request_meta(self, *, url, item):
         referer = getattr(self, "_chapter_referers", {}).get(item.get("uuid_md5"))
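With `parse_fin_page` and `parse_iog_page` gone, jestful's `_process_episode` fails fast unless the parser has already attached `page_urls` and `chapter_referer` to each episode. A minimal sketch of that contract, assuming a stub episode object — `EpisodeStub` and its field layout are illustrative, not the project's actual `Episode` schema:

```python
# Illustrative only: EpisodeStub stands in for the project's Episode class,
# whose real definition is not shown in this diff.
from dataclasses import dataclass, field


@dataclass
class EpisodeStub:
    idx: int
    name: str
    url: str
    page_urls: list = field(default_factory=list)  # prefetched by the parser
    chapter_referer: str = ""                      # optional; falls back to url


ep = EpisodeStub(idx=1, name="ch-001", url="https://example.com/chapter/1",
                 page_urls=["img/0001.webp", "img/0002.webp"])
# Mirrors the guard in the refactored _process_episode:
page_urls = list(getattr(ep, "page_urls", None) or [])
chapter_referer = getattr(ep, "chapter_referer", None) or ep.url
assert page_urls and chapter_referer == ep.url
```

Raising immediately on a missing field surfaces parser regressions at dispatch time instead of silently falling back to a second crawl of the chapter page.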
34 changes: 4 additions & 30 deletions ComicSpider/spiders/kaobei.py
@@ -1,12 +1,9 @@
 # -*- coding: utf-8 -*-
 import re
 
-import scrapy
-
-from utils.processed_class import Url
 from utils.website import KaobeiUtils
 from utils.website.schema import KbFrameBook as FrameBook
-from .basecomicspider import BaseComicSpider, ComicspiderItem, conf
+from .basecomicspider import BaseComicSpider, ComicspiderItem
 
 
 class KaobeiSpider(BaseComicSpider):
@@ -28,26 +25,13 @@ class KaobeiSpider(BaseComicSpider):
     preset_book_frame = FrameBook(domain)
     turn_page_info = (r"offset=\d+", None, 30)
     section_limit = 300
-    _enable_episode_dispatch = True
 
     @classmethod
     def from_crawler(cls, crawler, *args, **kwargs):
         spider = super().from_crawler(crawler, *args, **kwargs)
         spider.spider_site_runtime.reqer.get_aes_key()
         return spider
 
-    def frame_section(self, response):
-        book = response.meta.get("book")
-        episodes = self.spider_site_runtime.parser.parse_episodes(
-            response.json()['results'], book, url=response.url,
-            show_dhb=conf.kbShowDhb,
-        )
-        frame_results = {ep.idx: ep for ep in episodes}
-        self.say.frame_section_print(frame_results)
-
-    def mk_page_tasks(self, **kw):
-        return [kw['url']]
-
     def _build_episode_items(self, ep, page_urls):
         book = ep.from_book
         uid, u_md5 = ep.id_and_md5()
@@ -75,19 +59,9 @@ def _yield_episode_items(self, ep, page_urls):
         self._emit_process('fin')
 
     def _process_episode(self, ep):
-        if getattr(ep, 'page_urls', None):
-            yield from self._yield_episode_items(ep, list(ep.page_urls))
-            return
-        yield from super()._process_episode(ep)
-
-    def parse_fin_page(self, response):
-        ep = response.meta['ep']
-        imageData = self.spider_site_runtime.parser.parse_page_urls_from_html(
-            response.text, url=response.url,
-        )
-        for item in self._build_episode_items(ep, [url_item['url'] for url_item in imageData]):
-            yield item
-        self._emit_process('fin')
+        if not getattr(ep, 'page_urls', None):
+            raise ValueError(f"kaobei episode requires page_urls: {ep!r}")
+        yield from self._yield_episode_items(ep, list(ep.page_urls))
 
     def process_item(self, response):
         item = response.meta['item']
52 changes: 24 additions & 28 deletions ComicSpider/spiders/mangabz.py
@@ -1,50 +1,46 @@
 # -*- coding: utf-8 -*-
 import re
 
 from utils.website import MangabzUtils
-from utils.website.schema import MbBody as Body, MbSearchBody as SearchBody, mb_curr_time_format as curr_time_format
-from .basecomicspider import FormReqBaseComicSpider, ComicspiderItem
+import scrapy
 
-domain = MangabzUtils.domain
+from .basecomicspider import BaseComicSpider, ComicspiderItem
 
-
-class MangabzSpider(FormReqBaseComicSpider):
+class MangabzSpider(BaseComicSpider):
     name = 'mangabz'
-    ua = MangabzUtils.ua
-    num_of_row = 50
-    domain = domain
     custom_settings = {
-        "DOWNLOADER_MIDDLEWARES": {'ComicSpider.middlewares.MangabzUAMiddleware': 5,
-                                   'ComicSpider.middlewares.ComicDlAllProxyMiddleware': 6},
+        "DOWNLOADER_MIDDLEWARES": {'ComicSpider.middlewares.ComicDlAllProxyMiddleware': 6,
+                                   'ComicSpider.middlewares.FakeMiddleware': 30},
         "ITEM_PIPELINES": {'ComicSpider.pipelines.MangabzComicPipeline': 50}
     }
-    search_url_head = f"https://{domain}/pager.ashx"
-    mappings = {"更新": ["manga-list-0-0-2", "2"],
-                "人气": ["manga-list", "10"],
-                }
-    body = Body()
-    _enable_episode_dispatch = True
 
-    def frame_section(self, response):
-        book = response.meta.get("book")
-        episodes = self.spider_site_runtime.parser.parse_episodes(response, book, domain)
-        frame_results = {ep.idx: ep for ep in episodes}
-        return self.say.frame_section_print(frame_results)
-
-    def parse_fin_page(self, response):
-        ep = response.meta['ep']
+    def _build_episode_items(self, ep, page_urls):
         book = ep.from_book
         uid, u_md5 = ep.id_and_md5()
-        img_list = self.spider_site_runtime.parser.parse_page_urls_from_html(response.text)
         group_infos = {'title':book.name,'section':ep.name,'uuid':uid,'uuid_md5':u_md5}
-        ep.pages = len(img_list)
+        ep.pages = len(page_urls)
         self.set_task(ep)
-        for img_url in img_list:
+        for idx, img_url in enumerate(page_urls, start=1):
             item = ComicspiderItem()
             item.update(**group_infos)
-            page = int(re.search(r'/(\d+)_\d+\.', img_url).group(1))
+            matched = re.search(r'/(\d+)_\d+\.', img_url)
+            page = int(matched.group(1)) if matched else idx
             item['page'] = page
             item['image_urls'] = [img_url]
+            if self.job_context:
+                self.job_context.total += 1
             self.total += 1
             yield item
 
+    def _process_episode(self, ep):
+        if not getattr(ep, 'page_urls', None):
+            raise ValueError(f"mangabz episode requires page_urls: {ep!r}")
+        for item in self._build_episode_items(ep, list(ep.page_urls)):
+            yield scrapy.Request(
+                url=f'https://fakefakefa.com/{item["image_urls"][0]}', callback=self.process_item,
+                meta={'item': item}, dont_filter=True,
+            )
+        self._emit_process('fin')
+
+    def process_item(self, response):
+        yield response.meta['item']
45 changes: 45 additions & 0 deletions ComicSpider/spiders/manhuagui.py
@@ -0,0 +1,45 @@
+# -*- coding: utf-8 -*-
+import scrapy
+
+from utils.website import ManhuaguiUtils
+from .basecomicspider import BaseComicSpider, ComicspiderItem
+
+
+class ManhuaguiSpider(BaseComicSpider):
+    name = "manhuagui"
+    image_ua = ManhuaguiUtils.image_ua
+    custom_settings = {
+        "DOWNLOADER_MIDDLEWARES": {
+            "ComicSpider.middlewares.ComicDlAllProxyMiddleware": 6,
+            "ComicSpider.middlewares.FakeMiddleware": 30,
+        }
+    }
+
+    def _build_episode_items(self, ep, page_urls):
+        book = ep.from_book
+        uid, u_md5 = ep.id_and_md5()
+        group_infos = {"title": book.name, "section": ep.name, "uuid": uid, "uuid_md5": u_md5}
+        ep.pages = len(page_urls)
+        self.set_task(ep)
+        for page, image_url in enumerate(page_urls, start=1):
+            item = ComicspiderItem()
+            item.update(**group_infos)
+            item["page"] = page
+            item["image_urls"] = [image_url]
+            if self.job_context:
+                self.job_context.total += 1
+            self.total += 1
+            yield item
+
+    def _process_episode(self, ep):
+        if not getattr(ep, "page_urls", None):
+            raise ValueError(f"manhuagui episode requires page_urls: {ep!r}")
+        for item in self._build_episode_items(ep, list(ep.page_urls)):
+            yield scrapy.Request(
+                url=f'https://fakefakefa.com/{item["image_urls"][0]}', callback=self.process_item,
+                meta={"item": item}, dont_filter=True,
+            )
+        self._emit_process("fin")
+
+    def process_item(self, response):
+        yield response.meta["item"]
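jestful, mangabz, and manhuagui all wrap each prepared item in a request to the `https://fakefakefa.com/` placeholder and register `ComicSpider.middlewares.FakeMiddleware`, so those requests presumably never reach the network. A sketch of one plausible implementation of that contract — the class body below is an assumption, not the repository's actual middleware; it relies on Scrapy's documented rule that a `Response` returned from `process_request` short-circuits the download:

```python
# Sketch only: the real ComicSpider.middlewares.FakeMiddleware may differ.
from scrapy.http import HtmlResponse


class FakeMiddlewareSketch:
    def process_request(self, request, spider):
        if "fakefakefa.com" in request.url:
            # Skip the network entirely: returning a Response here makes
            # Scrapy invoke the request's callback (process_item) right away,
            # with meta['item'] still attached.
            return HtmlResponse(url=request.url, request=request, body=b"")
        return None  # all other requests download normally
```

The real image URL still rides along in `item['image_urls']`, so the item pipeline can presumably perform the actual download with the site-specific `image_ua`.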