import sys
import re

from bs4 import BeautifulSoup
import requests
from langchain.chat_models import init_chat_model

# The "openai" provider resolves to ChatOpenAI, which expects OPENAI_API_KEY in the environment
model = init_chat_model("gpt-4o-mini", model_provider="openai")


def html_to_markdown(html):
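    """Convert a post page's HTML to rough Markdown.

    Assumes the content lives in a div with class 'post-content'
    (a Hexo-style theme layout) and inlines Markdown markers for code
    blocks, emphasis, headings, lists, and links before flattening to text.
    """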
    soup = BeautifulSoup(html, 'html.parser')
    soup = soup.find('div', class_='post-content')

    # Extract code blocks rendered as <figure> elements
    for figure in soup.find_all('figure'):
        try:
            lang = figure.get('class', ['', ''])[1]
        except IndexError:
            lang = ''
        figure.replace_with(f'```{lang}\n{figure.text}\n```')

    # Extract code blocks from <pre><code> elements
    for pre in soup.find_all('pre'):
        code = pre.find('code')
        if code:
            lang = code.get('class', [''])[0].replace('language-', '')  # get the code language
            pre.replace_with(f'```{lang}\n{code.text}\n```')

    # Convert bold, italics, headings, etc.
    for strong in soup.find_all('strong'):
        strong.replace_with(f'**{strong.text}**')
    for em in soup.find_all('em'):
        em.replace_with(f'*{em.text}*')
    for h in range(1, 7):
        for tag in soup.find_all(f'h{h}'):
            # Remove <em> or <i> italics inside headings
            for em in tag.find_all(['em', 'i']):
                em.unwrap()
            tag.replace_with(f'{"#" * h} {tag.text}')

    # Handle unordered lists
    for ul in soup.find_all('ul'):
        for li in ul.find_all('li'):
            li.replace_with(f'- {li.text}')
        ul.unwrap()  # remove the <ul> tag
    # Handle ordered lists
    for ol in soup.find_all('ol'):
        for i, li in enumerate(ol.find_all('li'), start=1):
            li.replace_with(f'{i}. {li.text}')
        ol.unwrap()  # remove the <ol> tag

    # Handle <a> links, stripping the web.archive.org prefix
    for a_tag in soup.find_all('a', href=True):
        cleaned_href = re.sub(r'^https?://web\.archive\.org/web/\d+/', '', a_tag['href'])
        a_tag.replace_with(f'[{a_tag.text}]({cleaned_href})')

    # Surround paragraphs with blank lines
    for p in soup.find_all('p'):
        p.insert_before("\n")
        p.insert_after("\n")

    return soup.get_text()


def fetch(url):
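    """Fetch a post page and return (title, slug, date, read, desc, tags, markdown_text).

    Metadata comes from theme-specific selectors on the page; the slug is
    generated from the title by the chat model.
    """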
    r = requests.get(url)
    r.encoding = r.apparent_encoding
    markdown_text = html_to_markdown(r.text)
    soup = BeautifulSoup(r.text, 'html.parser')
    date = soup.find('time', class_='time').text.strip()
    title = soup.find('h1', class_='title').text.strip()
    read = int(int(soup.find('span', class_='read').text.split(':')[1]) * 1.5)
    desc = soup.find('meta', property="og:description").attrs['content']
    tags = [i.text.strip() for i in soup.find_all('a', class_='article-tag-list-link')]
    slug = model.invoke(
        f"Please generate an English slug for the following title: {title}. "
        "Reply with only the slug; do not use pinyin.",
        temperature=0,
    ).content
    return title, slug, date, read, desc, tags, markdown_text


def main():
    for i in range(1, 133):
        ...


if __name__ == '__main__':
    url = sys.argv[1]
    print(fetch(url))
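# Usage (script name assumed): python fetch_post.py <post-url>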