convert.py
import sys
import re

import requests
from bs4 import BeautifulSoup
from langchain.chat_models import init_chat_model

model = init_chat_model("gpt-4o-mini", model_provider="openai")
def html_to_markdown(html):
    soup = BeautifulSoup(html, 'html.parser')
    # Narrow to the post body
    soup = soup.find('div', class_='post-content')
    # Extract code blocks rendered as <figure> elements
    for figure in soup.find_all('figure'):
        try:
            lang = figure.get('class', ['', ''])[1]
        except IndexError:
            lang = ''
        figure.replace_with(f'```{lang}\n{figure.text}\n```')
    # Extract code blocks rendered as <pre><code>
    for pre in soup.find_all('pre'):
        code = pre.find('code')
        if code:
            lang = code.get('class', [''])[0].replace('language-', '')  # get the code language
            pre.replace_with(f'```{lang}\n{code.text}\n```')
    # Convert bold, italics, headings, etc.
    for strong in soup.find_all('strong'):
        strong.replace_with(f'**{strong.text}**')
    for em in soup.find_all('em'):
        em.replace_with(f'*{em.text}*')
    for h in range(1, 7):
        for tag in soup.find_all(f'h{h}'):
            # Strip <em> or <i> italics inside headings
            for em in tag.find_all(['em', 'i']):
                em.unwrap()
            tag.replace_with(f'{"#" * h} {tag.text}')
    # Handle unordered lists
    for ul in soup.find_all('ul'):
        for li in ul.find_all('li'):
            li.replace_with(f'- {li.text}')
        ul.unwrap()  # drop the <ul> tag itself
    # Handle ordered lists
    for ol in soup.find_all('ol'):
        for i, li in enumerate(ol.find_all('li'), start=1):
            li.replace_with(f'{i}. {li.text}')
        ol.unwrap()  # drop the <ol> tag itself
    # Handle <a> links, stripping any web.archive.org snapshot prefix
    for a_tag in soup.find_all('a', href=True):
        cleaned_href = re.sub(r'^https?://web\.archive\.org/web/\d+/', '', a_tag['href'])
        a_tag.replace_with(f'[{a_tag.text}]({cleaned_href})')
    # Surround paragraphs with blank lines
    for p in soup.find_all('p'):
        p.insert_before("\n")
        p.insert_after("\n")
    return soup.get_text()
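# A minimal usage sketch of html_to_markdown (the sample HTML below is an
# assumption for illustration, not taken from the original script):
#
#   sample = '<div class="post-content"><h2>Intro</h2><p>Hello <strong>world</strong></p></div>'
#   print(html_to_markdown(sample))
#   # expected output, roughly: "## Intro\nHello **world**\n"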
def fetch(url):
    r = requests.get(url)
    r.encoding = r.apparent_encoding
    markdown_text = html_to_markdown(r.text)
    soup = BeautifulSoup(r.text, 'html.parser')
    date = soup.find('time', class_='time').text.strip()
    title = soup.find('h1', class_='title').text.strip()
    # Parse the count from a "label:count" span and scale it by 1.5
    read = int(int(soup.find('span', class_='read').text.split(':')[1]) * 1.5)
    desc = soup.find('meta', property="og:description").attrs['content']
    tags = [i.text.strip() for i in soup.find_all('a', class_='article-tag-list-link')]
    slug = model.invoke(
        f"Please generate an English slug based on the following title: {title}, "
        "please only say the slug and do not use pinyin",
        temperature=0,
    ).content
    return title, slug, date, read, desc, tags, markdown_text
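# A hypothetical call (the URL is a placeholder; the target page must expose
# the selectors used above, and an OpenAI API key must be configured for the
# slug-generation step):
#
#   title, slug, date, read, desc, tags, md = fetch('https://example.com/2024/01/01/post/')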
def main():
    for i in range(1, 133):
        ...


if __name__ == '__main__':
    url = sys.argv[1]
    print(fetch(url))
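# Run against a single post URL (placeholder address), e.g.:
#   python convert.py https://example.com/2024/01/01/post/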