Commit c6b5811

Update req

1 parent 0130b1e commit c6b5811
File tree

2 files changed: +145 -15 lines changed

convert.py

Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
import sys
import re

from bs4 import BeautifulSoup
import requests
from langchain.chat_models import init_chat_model

model = init_chat_model("gpt-4o-mini", model_provider="openai")


def html_to_markdown(html):
    soup = BeautifulSoup(html, 'html.parser')
    soup = soup.find('div', class_='post-content')

    # Extract code blocks wrapped in <figure> tags
    for figure in soup.find_all('figure'):
        try:
            lang = figure.get('class', ['', ''])[1]
        except IndexError:
            lang = ''
        figure.replace_with(f'```{lang}\n{figure.text}\n```')

    # Extract code blocks wrapped in <pre><code> tags
    for pre in soup.find_all('pre'):
        code = pre.find('code')
        if code:
            lang = code.get('class', [''])[0].replace('language-', '')  # get the code language
            pre.replace_with(f'```{lang}\n{code.text}\n```')

    # Convert bold, italics, headings, etc.
    for strong in soup.find_all('strong'):
        strong.replace_with(f'**{strong.text}**')
    for em in soup.find_all('em'):
        em.replace_with(f'*{em.text}*')
    for h in range(1, 7):
        for tag in soup.find_all(f'h{h}'):
            # Strip <em>/<i> italics inside headings
            for em in tag.find_all(['em', 'i']):
                em.unwrap()
            tag.replace_with(f'{"#" * h} {tag.text}')

    # Handle unordered lists
    for ul in soup.find_all('ul'):
        for li in ul.find_all('li'):
            li.replace_with(f'- {li.text}')
        ul.unwrap()  # remove the <ul> tag
    # Handle ordered lists
    for ol in soup.find_all('ol'):
        for i, li in enumerate(ol.find_all('li'), start=1):
            li.replace_with(f'{i}. {li.text}')
        ol.unwrap()  # remove the <ol> tag

    # Handle <a> links, stripping the web.archive.org snapshot prefix
    for a_tag in soup.find_all('a', href=True):
        cleaned_href = re.sub(r'^https?://web\.archive\.org/web/\d+/', '', a_tag['href'])
        a_tag.replace_with(f'[{a_tag.text}]({cleaned_href})')

    # Surround paragraphs with blank lines
    for p in soup.find_all('p'):
        p.insert_before("\n")
        p.insert_after("\n")

    return soup.get_text()


def fetch(url):
    r = requests.get(url)
    r.encoding = r.apparent_encoding  # guess the encoding from the body to avoid mojibake
    markdown_text = html_to_markdown(r.text)
    soup = BeautifulSoup(r.text, 'html.parser')
    date = soup.find('time', class_='time').text.strip()
    title = soup.find('h1', class_='title').text.strip()
    # Take the number after the colon in the read-count span and scale it by 1.5
    read = int(int(soup.find('span', class_='read').text.split(':')[1]) * 1.5)
    desc = soup.find('meta', property="og:description").attrs['content']
    tags = [i.text.strip() for i in soup.find_all('a', class_='article-tag-list-link')]
    slug = model.invoke(f"Please generate an English slug based on the following title: {title}, please only say the slug and do not use pinyin", temperature=0).content
    return title, slug, date, read, desc, tags, markdown_text


def main():
    # Stub: presumably meant to batch-convert posts 1..132; left unimplemented
    for i in range(1, 133):
        ...


if __name__ == '__main__':
    url = sys.argv[1]
    print(fetch(url))
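
For reference, a minimal sketch of how the tuple returned by fetch() might be consumed downstream. The output filename and YAML front-matter layout below are illustrative assumptions, not anything defined by this commit:

# Hypothetical usage: write one Markdown file per post, named after the slug.
title, slug, date, read, desc, tags, markdown_text = fetch(sys.argv[1])
with open(f'{slug}.md', 'w') as f:
    f.write(f'---\ntitle: {title}\ndate: {date}\ntags: {tags}\n---\n\n{markdown_text}')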

requirements.txt

Lines changed: 52 additions & 15 deletions
@@ -1,28 +1,65 @@
-click==8.0.4
-aiohttp==3.8.5
-WTForms==3.0.1
+aiofiles==24.1.0
+aiohappyeyeballs==2.6.1
+aiohttp==3.11.14
+aiomysql==0.0.22
 aioredis==1.3.1
-cython==0.29.28
-mistune==0.8.4
+aiosignal==1.3.2
+aiosmtplib==1.1.6
+aiosqlite==0.17.0
+annotated-types==0.7.0
+arq==0.22
+async-timeout==5.0.1
 asyncblink==0.3.2
-werkzeug==2.2.3
-Pygments==2.11.2
+attrs==25.3.0
+beautifulsoup4==4.13.3
+blinker==1.6.3
+click==8.0.4
 cssmin==0.2.0
-PyYAML==6.0
+Cython==0.29.28
+extraction==0.3
+frozenlist==1.5.0
 gunicorn==20.1.0
+hiredis==3.1.0
+html5lib==1.1
+httptools==0.6.4
+idna==3.10
+iso8601==0.1.16
+lxml==5.3.1
+Mako==1.3.9
+MarkupSafe==3.0.2
+mistune==0.8.4
+multidict==5.2.0
 pangu==4.0.6.1
+propcache==0.3.1
+pydantic==1.10.21
+pydantic_core==2.33.0
+Pygments==2.11.2
+PyJWT==2.1.0
+PyMySQL==0.9.3
+pypika-tortoise==0.1.6
+pytz==2025.2
+PyYAML==6.0.2
+raven==6.10.0
+raven-aiohttp==0.7.0
 redis==4.4.4
-aiosmtplib==1.1.6
-arq==0.22
-tortoise-orm==0.18.1
-lxml==4.9.1
-extraction==0.3
 sanic==21.12.2
 sanic-jwt==1.7.0
 Sanic-Mako==0.7.0
+-e git+https://github.com/dongweiming/sanic-oauth.git@3cc63a9ab0dad56b9bca9a70b4788dec0ca0dae6#egg=sanic_oauth
 sanic-routing==0.7.2
 sanic-sentry==0.1.7
 sanic-session==0.8.0
 Sanic-WTF==0.6.0
-aiomysql==0.0.22
--e git+https://github.com/dongweiming/sanic-oauth.git@master#egg=sanic-oauth
+setuptools==78.1.0
+six==1.17.0
+soupsieve==2.6
+tortoise-orm==0.18.1
+typing-inspection==0.4.0
+typing_extensions==4.13.0
+ujson==5.10.0
+uvloop==0.21.0
+webencodings==0.5.1
+websockets==10.0
+Werkzeug==0.16.1
+WTForms==3.0.1
+yarl==1.18.3
