import sys
import re

from bs4 import BeautifulSoup
import requests
from langchain.chat_models import init_chat_model

# The "openai" provider resolves to ChatOpenAI, which expects OPENAI_API_KEY in the environment
model = init_chat_model("gpt-4o-mini", model_provider="openai")


def html_to_markdown(html):
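    """Convert a post page's HTML to rough Markdown.

    Assumes the content lives in a div with class 'post-content'
    (a Hexo-style theme layout) and inlines Markdown markers for code
    blocks, emphasis, headings, lists, and links before flattening to text.
    """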
    soup = BeautifulSoup(html, 'html.parser')
    soup = soup.find('div', class_='post-content')

    # Extract code blocks rendered as <figure> elements
    for figure in soup.find_all('figure'):
        try:
            lang = figure.get('class', ['', ''])[1]
        except IndexError:
            lang = ''
        figure.replace_with(f'```{lang}\n{figure.text}\n```')

    # Extract code blocks from <pre><code> elements
    for pre in soup.find_all('pre'):
        code = pre.find('code')
        if code:
            lang = code.get('class', [''])[0].replace('language-', '')  # get the code language
            pre.replace_with(f'```{lang}\n{code.text}\n```')

    # Convert bold, italics, headings, etc.
    for strong in soup.find_all('strong'):
        strong.replace_with(f'**{strong.text}**')
    for em in soup.find_all('em'):
        em.replace_with(f'*{em.text}*')
    for h in range(1, 7):
        for tag in soup.find_all(f'h{h}'):
            # Remove <em> or <i> italics inside headings
            for em in tag.find_all(['em', 'i']):
                em.unwrap()
            tag.replace_with(f'{"#" * h} {tag.text}')

    # Handle unordered lists
    for ul in soup.find_all('ul'):
        for li in ul.find_all('li'):
            li.replace_with(f'- {li.text}')
        ul.unwrap()  # remove the <ul> tag
    # Handle ordered lists
    for ol in soup.find_all('ol'):
        for i, li in enumerate(ol.find_all('li'), start=1):
            li.replace_with(f'{i}. {li.text}')
        ol.unwrap()  # remove the <ol> tag

    # Handle <a> links, stripping the web.archive.org prefix
    for a_tag in soup.find_all('a', href=True):
        cleaned_href = re.sub(r'^https?://web\.archive\.org/web/\d+/', '', a_tag['href'])
        a_tag.replace_with(f'[{a_tag.text}]({cleaned_href})')

    # Surround paragraphs with blank lines
    for p in soup.find_all('p'):
        p.insert_before("\n")
        p.insert_after("\n")

    return soup.get_text()


def fetch(url):
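    """Fetch a post page and return (title, slug, date, read, desc, tags, markdown_text).

    Metadata comes from theme-specific selectors on the page; the slug is
    generated from the title by the chat model.
    """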
    r = requests.get(url)
    r.encoding = r.apparent_encoding
    markdown_text = html_to_markdown(r.text)
    soup = BeautifulSoup(r.text, 'html.parser')
    date = soup.find('time', class_='time').text.strip()
    title = soup.find('h1', class_='title').text.strip()
    read = int(int(soup.find('span', class_='read').text.split(':')[1]) * 1.5)
    desc = soup.find('meta', property="og:description").attrs['content']
    tags = [i.text.strip() for i in soup.find_all('a', class_='article-tag-list-link')]
    slug = model.invoke(
        f"Please generate an English slug for the following title: {title}. "
        "Reply with only the slug; do not use pinyin.",
        temperature=0,
    ).content
    return title, slug, date, read, desc, tags, markdown_text


def main():
    for i in range(1, 133):
        ...


if __name__ == '__main__':
    url = sys.argv[1]
    print(fetch(url))
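# Usage (script name assumed): python fetch_post.py <post-url>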