v2.1.10: 适配JM大陆直连域名的HTML正则，优化保存图片的代码，更新文档 (#89)

hect0x7 · web-flow · commit 68ba3b844210 · 2023-08-05T23:40:43.000+08:00
diff --git a/README.md b/README.md
@@ -38,17 +38,18 @@ jmcomic.download_album('422866')  # 传入要下载的album的id，即可下载
 ## 项目特点
 
 - **绕过Cloudflare的反爬虫**
-- 支持使用**Github Action**下载漫画，不会编程都能用（[教程：使用Github Actions下载禁漫本子](./assets/docs/教程：使用Github%20Actions下载禁漫本子.md)）
-- 可配置性强
+- 支持使用**Github Actions**下载漫画，不会编程都能用（[教程：使用Github Actions下载禁漫本子](./assets/docs/教程：使用Github%20Actions下载禁漫本子.md)）
+- **可配置性强**
   - 不配置也能使用，十分方便
   - 配置可以从**配置文件**生成，支持多种文件格式，无需写Python代码
-  - 配置点有：`是否使用磁盘缓存` `图片类型转换` `下载路径` `请求元信息（headers,cookies,代理）等 `
-- 可扩展性强
+  - 配置点有：`是否使用磁盘缓存` `图片类型转换` `下载路径` `请求元信息（headers,cookies,代理）`等 
+- **可扩展性强**
   - 支持自定义本子/章节/图片下载前后的回调函数
   - 支持自定义debug日志的开关/格式
   - 支持自定义Option/Client/实体类
+  - ...
 - 支持重试和域名切换机制
-- 多线程下载（可细化到一图一线程，效率极高）
+- **多线程下载**（可细化到一图一线程，效率极高）
 - 跟进了JM最新的图片分割算法（2023-02-08）
 
 ## 使用小说明
diff --git a/assets/docs/images/5.png b/assets/docs/images/5.png
diff --git a/assets/docs/教程：使用Github Actions下载禁漫本子.md b/assets/docs/教程：使用Github Actions下载禁漫本子.md
@@ -3,22 +3,38 @@
 一共需要三步：
 
 1. fork一份我的代码仓库。
-2. 填写你需要下载的本子id，提交commit。
+2. 填写你需要下载的本子id。
 3. 等待Github Actions下载完成，下载成品zip文件。
 
 下面截图解析这三步的详细过程。
 
 ## 1. fork一份我的代码仓库
 
-访问这个网址: [https://github.com/hect0x7/JMComic-Crawler-Python/fork].
+访问下面这个网址：
+
+`https://github.com/hect0x7/JMComic-Crawler-Python/fork`
 
 直接拉到页面最底部，如下所示：
 
 ![1](./images/1.png)
 
-## 2. 填写你需要下载的本子id，提交commit
+## 2. 填写你需要下载的本子id
+
+### 2.1. 方式一（最新、推荐）
+
+访问下面这个网址：
+
+`https://github.com/hect0x7/JMComic-Crawler-Python/actions/workflows/download_dispatch.yml`
+
+按下图步骤进行操作：
+
+![5](./images/5.png)
+
+
+
+### 2.2. 方式二
 
-访问下面这个网址
+访问下面这个网址：
 
 `https://github.com/你的用户名/JMComic-Crawler-Python/edit/workflow/usage/workflow_download.py`
 
diff --git a/src/jmcomic/__init__.py b/src/jmcomic/__init__.py
@@ -2,6 +2,6 @@
 # 被依赖方 <--- 使用方
 # config <--- entity <--- toolkit <--- client <--- option <--- downloader
 
-__version__ = '2.1.9'
+__version__ = '2.1.10'
 
 from .api import *
diff --git a/src/jmcomic/jm_client_impl.py b/src/jmcomic/jm_client_impl.py
@@ -133,7 +133,7 @@ def get_jmcomic_domain_all(self, postman=None):
     # noinspection PyUnusedLocal
     def fallback(self, request, url, domain_index, retry_count, **kwargs):
         msg = f"请求重试全部失败: [{url}], {self.domain_list}"
-        jm_debug('fallback', msg)
+        jm_debug('req.fallback', msg)
         raise AssertionError(msg)
 
 
diff --git a/src/jmcomic/jm_client_interface.py b/src/jmcomic/jm_client_interface.py
@@ -48,14 +48,14 @@ def transfer_to(self,
         img_url = img_url or self.url
 
         if decode_image is False:
-            # 不解密图片，直接返回
+            # 不解密图片，直接保存文件
             JmImageSupport.save_resp_img(
                 self,
                 path,
                 need_convert=suffix_not_equal(img_url, path),
             )
         else:
-            # 解密图片，需要 photo_id、scramble_id
+            # 解密图片并保存文件
             JmImageSupport.decode_and_save(
                 JmImageSupport.get_num_by_url(scramble_id, img_url),
                 JmImageSupport.open_Image(self.content),
@@ -241,6 +241,7 @@ def download_image(self,
     def save_image_resp(self, decode_image, img_save_path, img_url, resp, scramble_id):
         # gif图无需加解密，需要最先判断
         if self.img_is_not_need_to_decode(img_url, resp):
+            # 相当于调用save_directly，但使用save_resp_img可以统一调用入口
             JmImageSupport.save_resp_img(resp, img_save_path, False)
         else:
             resp.transfer_to(img_save_path, scramble_id, decode_image, img_url)
diff --git a/src/jmcomic/jm_config.py b/src/jmcomic/jm_config.py
@@ -24,16 +24,14 @@ class JmModuleConfig:
     JM_PUB_URL = f'{PROT}jmcomic2.bet'
     JM_CDN_IMAGE_URL_TEMPLATE = PROT + 'cdn-msp.{domain}/media/photos/{photo_id}/{index:05}{suffix}'  # index 从1开始
     JM_IMAGE_SUFFIX = ['.jpg', '.webp', '.png', '.gif']
-    # 缓存字段
-    DOMAIN = None
-    DOMAIN_LIST = None
 
-    # 访问JM可能会遇到的异常网页
+    # JM的异常网页内容
     JM_ERROR_RESPONSE_TEXT = {
         "Could not connect to mysql! Please check your database settings!": "禁漫服务器内部报错",
         "Restricted Access!": "禁漫拒绝你所在ip地区的访问，你可以选择: 换域名/换代理",
     }
 
+    # JM的异常网页code
     JM_ERROR_STATUS_CODE = {
         403: 'ip地区禁止访问/爬虫被识别',
         520: '520: Web server is returning an unknown error (禁漫服务器内部报错)',
@@ -48,18 +46,22 @@ class JmModuleConfig:
     # API的相关配置
     MAGIC_18COMICAPPCONTENT = '18comicAPPContent'
 
-    # 下载时的一些默认值
+    # 下载时的一些默认值配置
     default_author = 'default-author'
-    default_photo_title = 'default-photo-title'
-    default_photo_id = 'default-photo-id'
 
-    # debug
-    enable_jm_debug = True
-    debug_executor = default_jm_debug
-    postman_constructor = default_postman_constructor
+    # 模块级别的可重写配置
+    DOMAIN = None
+    DOMAIN_LIST = None
     DOWNLOADER_CLASS = None
     OPTION_CLASS = None
 
+    # 执行debug的函数
+    debug_executor = default_jm_debug
+    # postman构造函数
+    postman_constructor = default_postman_constructor
+    # debug开关标记
+    enable_jm_debug = True
+
     @classmethod
     def downloader_class(cls):
         if cls.DOWNLOADER_CLASS is not None:
diff --git a/src/jmcomic/jm_option.py b/src/jmcomic/jm_option.py
@@ -252,7 +252,7 @@ def new_jm_client(self, **kwargs) -> JmcomicClient:
         # domain_list
         domain_list = self.client.domain
         if len(domain_list) == 0:
-            domain_list = JmModuleConfig.get_jmcomic_domain_all(postman)[:-1]
+            domain_list = [JmcomicText.parse_to_jm_domain(JmModuleConfig.get_jmcomic_url(postman))]
 
         # client
         client = self.jm_client_impl_mapping[self.client.impl](
diff --git a/src/jmcomic/jm_toolkit.py b/src/jmcomic/jm_toolkit.py
@@ -12,7 +12,7 @@ class JmcomicText:
     pattern_html_photo_title = compile('<title>(.*?)\|.*</title>')
     # pattern_html_photo_data_original_list = compile('data-original="(.*?)" id="album_photo_.+?"')
     pattern_html_photo_data_original_domain = compile('src="https://(.*?)/media/albums/blank')
-    pattern_html_photo_data_original_0 = compile('data-original="(.*?)" id="album_photo')
+    pattern_html_photo_data_original_0 = compile('data-original="(.*?)"[ \n]*?id="album_photo')
     pattern_html_photo_keywords = compile('<meta name="keywords"[\s\S]*?content="(.*?)"')
     pattern_html_photo_series_id = compile('var series_id = (\d+);')
     pattern_html_photo_sort = compile('var sort = (\d+);')
@@ -207,41 +207,33 @@ class JmImageSupport:
     @classmethod
     def save_resp_img(cls, resp: Any, filepath: str, need_convert=True):
         """
-        保存图片的响应数据
-        @param resp: Response对象
-        @param filepath: 响应数据保存的绝对路径
-        @param need_convert: True 使用PIL打开图片再保存; False 直接保存resp.content;
-        如果需要改变图片的格式，比如 .jpg → .png，则需要neet_convert=True。
-        如果不需要改变文件的格式，使用need_convert=False可以跳过PIL解析图片，效率更高。
+        接收HTTP响应对象，将其保存到图片文件.
+        如果需要改变图片的文件格式，比如 .jpg → .png，则需要指定参数 need_convert=True.
+        如果不需要改变图片的文件格式，使用 need_convert=False，可以跳过PIL解析图片，效率更高.
+
+        @param resp: HTTP响应对象
+        @param filepath: 图片文件路径
+        @param need_convert: 是否转换图片格式; True 使用PIL打开图片再保存, False 直接保存响应内容
         """
-        if need_convert is True:
-            cls.save_image(cls.open_Image(resp.content), filepath)
+        if need_convert is False:
+            cls.save_directly(resp, filepath)
         else:
-            save_resp_content(resp, filepath)
+            cls.save_image(cls.open_Image(resp.content), filepath)
 
     @classmethod
-    def save_resp_decoded_img(cls,
-                              resp: Any,
-                              image: JmImageDetail,
-                              filepath: str
-                              ) -> None:
-        cls.decode_and_save(
-            cls.get_num_by_detail(image),
-            cls.open_Image(resp.content),
-            filepath
-        )
+    def save_image(cls, image: Image, filepath: str):
+        """
+        保存图片
+
+        @param image: PIL.Image对象
+        @param filepath: 保存文件路径
+        """
+        image.save(filepath)
 
     @classmethod
-    def decode_disk_img(cls,
-                        image: JmImageDetail,
-                        img_filepath: str,
-                        decoded_save_path: str
-                        ) -> None:
-        cls.decode_and_save(
-            cls.get_num_by_detail(image),
-            cls.open_Image(img_filepath),
-            decoded_save_path
-        )
+    def save_directly(cls, resp, filepath):
+        from common import save_resp_content
+        save_resp_content(resp, filepath)
 
     @classmethod
     def decode_and_save(cls,
@@ -292,10 +284,6 @@ def open_Image(cls, fp: Union[str, bytes]):
         fp = fp if isinstance(fp, str) else BytesIO(fp)
         return Image.open(fp)
 
-    @classmethod
-    def save_image(cls, image: Image, filepath: str):
-        image.save(filepath)
-
     @classmethod
     def get_num(cls, scramble_id, aid, filename: str) -> int:
         """
diff --git a/usage/getting_started.py b/usage/getting_started.py
@@ -12,33 +12,6 @@
 # 如果没有配置，则会使用 JmOption.default()，下载的路径是[当前工作文件夹/本子章节名称/图片].
 
 
-"""
---------------------
-    批量下载介绍
---------------------
-"""
-# 如果你想要批量下载，可以使用 list/set/tuple/生成器 作为第一个参数。
-# 第二个参数依然是可选的JmOption对象
-jmcomic.download_album(['422866', '1', '2', '3'])  # list
-jmcomic.download_album({'422866', '1', '2', '3'})  # set
-jmcomic.download_album(('422866', '1', '2', '3'))  # tuple
-jmcomic.download_album(aid for aid in ('422866', '1', '2', '3'))  # 生成器
-
-
-"""
---------------------
-    获取域名介绍
---------------------
-"""
-# 方式1: 访问禁漫发布页
-url_ls = jmcomic.JmModuleConfig.get_jmcomic_url_all()
-print(url_ls)
-
-# 方式2（可能会报错，需要你自己配置梯子）
-url = jmcomic.JmModuleConfig.get_jmcomic_url()
-print(url)
-
-
 """
 --------------------
     配置文件介绍
@@ -67,3 +40,30 @@
         }
     }
 })
+
+
+"""
+--------------------
+    批量下载介绍
+--------------------
+"""
+# 如果你想要批量下载，可以使用 list/set/tuple/生成器 作为第一个参数。
+# 第二个参数依然是可选的JmOption对象
+jmcomic.download_album(['422866', '1', '2', '3'])  # list
+jmcomic.download_album({'422866', '1', '2', '3'})  # set
+jmcomic.download_album(('422866', '1', '2', '3'))  # tuple
+jmcomic.download_album(aid for aid in ('422866', '1', '2', '3'))  # 生成器
+
+
+"""
+--------------------
+    获取域名介绍
+--------------------
+"""
+# 方式1: 访问禁漫发布页
+url_ls = jmcomic.JmModuleConfig.get_jmcomic_url_all()
+print(url_ls)
+
+# 方式2（可能会报错，需要你自己配置梯子）
+url = jmcomic.JmModuleConfig.get_jmcomic_url()
+print(url)
diff --git a/usage/sample_usage.py b/usage/sample_usage.py
@@ -4,7 +4,6 @@
     f'你的配置文件路径，例如: D:/a/b/c/jmcomic/config.yml'
 )
 client = option.build_jm_client()
-client.enable_cache(debug=True)
 
 
 @timeit('下载本子集: ')
@@ -21,14 +20,17 @@ def download_jm_album():
 
 @timeit('获取实体类: ')
 def get_album_photo_detail():
-    # 启用缓存，会缓存id → album和photo的实体类
+    # 本子实体类
     album: JmAlbumDetail = client.get_album_detail('427413')
 
-    def show(photo):
-        photo: JmPhotoDetail = client.get_photo_detail(photo.photo_id, False)
-        for img in photo:
-            img: JmImageDetail
-            print(img.img_url)
+    def show(photo: JmPhotoDetail):
+        # 章节实体类
+        photo = client.get_photo_detail(photo.photo_id, False)
+
+        # 图片实体类
+        image: JmImageDetail
+        for image in photo:
+            print(image.img_url)
 
     multi_thread_launcher(
         iter_objs=album,
@@ -69,6 +71,7 @@ def main():
     search_jm_album()
     download_jm_album()
     get_album_photo_detail()
+    search_and_download()
 
 
 if __name__ == '__main__':