blog_fetcher.py
import re
import asyncio
import random
import urllib.request
import json

from playwright.async_api import Page
from config import Config


class BlogFetcher:
    """Fetches the list of article URLs published by a Juejin user."""

    def __init__(self, browser_manager):
        self.browser_manager = browser_manager

    async def get_user_articles(self, user_url: str) -> list:
        """Return article URLs for a user, preferring the API and falling back to page scraping."""
        user_id = self._extract_user_id(user_url)
        if not user_id:
            print(f"Could not extract a user ID from: {user_url}")
            return []
        print(f"Fetching article list for user ID: {user_id}")
        articles = await self._fetch_articles_from_api(user_id)
        if articles:
            print(f"Fetched {len(articles)} articles in total")
        else:
            print("API fetch failed, falling back to page scraping...")
            articles = await self._fetch_articles_from_page(user_url)
        return articles

    async def _fetch_articles_from_api(self, user_id: str) -> list:
        """Walk the cursor-paginated Juejin content API and collect the user's article URLs."""
        articles = []
        cursor = "0"
        try:
            # Cap the cursor walk at 10 pages.
            for page_num in range(10):
                url = f"https://api.juejin.cn/content_api/v1/article/list_by_user?user_id={user_id}&cursor={cursor}&category_id="
                req = urllib.request.Request(url)
                req.add_header('User-Agent', 'Mozilla/5.0')
                with urllib.request.urlopen(req, timeout=10) as response:
                    data = json.loads(response.read().decode('utf-8'))
                if data.get('data') and data['data'].get('article_list'):
                    article_list = data['data']['article_list']
                    if not article_list:
                        break
                    for article in article_list:
                        article_url = f"https://juejin.cn/post/{article.get('article_id', '')}"
                        if article_url not in articles:
                            articles.append(article_url)
                    cursor = data['data'].get('cursor', '')
                    if not cursor:
                        break
                    print(f"API returned {len(articles)} articles so far...")
                else:
                    break
        except Exception as e:
            print(f"API fetch failed: {str(e)}")
        return articles

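    # For reference, the minimal response shape the parser above relies on,
    # inferred from the field accesses in this file; the real API response
    # carries more fields than shown here:
    #
    #   {
    #     "data": {
    #       "article_list": [{"article_id": "..."}, ...],
    #       "cursor": "10"
    #     }
    #   }
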
    async def _fetch_articles_from_page(self, user_url: str) -> list:
        """Fallback: open the user's profile page in a browser and scrape article links."""
        page = None
        articles = []
        try:
            user_url = self._normalize_user_url_to_home(user_url)
            print(f"Fetching the user's articles from the page: {user_url}")
            page = await self.browser_manager.new_page()
            await page.goto(user_url, wait_until="networkidle", timeout=30000)
            await asyncio.sleep(5)
            title = await page.title()
            print(f"Page title: {title}")
            content = await page.content()
            print(f"Page content length: {len(content)}")
            # '暂无内容' / '没有' are the "no content yet" markers shown on empty profiles.
            if '暂无内容' in content or '没有' in content:
                print("The user's profile page has no content")
            print("\nLooking for article links...")
            post_links = await page.query_selector_all('[href*="/post/"]')
            print(f"Found {len(post_links)} article links")
            for link in post_links:
                try:
                    href = await link.get_attribute('href')
                    if href:
                        if href.startswith('/'):
                            href = 'https://juejin.cn' + href
                        if href not in articles:
                            articles.append(href)
                except Exception:
                    continue
            print(f"Scraped {len(articles)} articles from the page")
            if articles:
                print("First 5 links:", articles[:5])
        except Exception as e:
            print(f"Page fetch failed: {str(e)}")
            import traceback
            traceback.print_exc()
        finally:
            if page:
                await page.close()
        return articles

    def _normalize_user_url_to_home(self, user_url: str) -> str:
        """Normalize any accepted input (bare ID, partial path, full URL) to the user's home URL."""
        user_url = user_url.strip()
        if not user_url.startswith('http'):
            if 'juejin.cn' not in user_url:
                user_url = f"https://juejin.cn/user/{user_url}"
            else:
                user_url = f"https://juejin.cn/{user_url}"
        user_url = user_url.rstrip('/')
        if user_url.endswith('/posts'):
            user_url = user_url[:-len('/posts')]
        match = re.search(r'juejin\.cn/(?:user/)?(\w+)', user_url)
        if match:
            user_id = match.group(1)
            return f"https://juejin.cn/user/{user_id}"
        return user_url

    def _extract_user_id(self, user_url: str) -> str:
        """Extract the user ID from a URL or bare ID; return None if nothing matches."""
        user_url = user_url.strip()
        if user_url.isdigit():
            return user_url
        match = re.search(r'juejin\.cn/(?:user/)?(\w+)', user_url)
        if match:
            return match.group(1)
        return None

    def _normalize_user_url(self, user_url: str) -> str:
        """Like _normalize_user_url_to_home, but target the user's /posts listing."""
        user_url = user_url.strip()
        if not user_url.startswith('http'):
            if 'juejin.cn' not in user_url:
                user_url = f"https://juejin.cn/user/{user_url}/posts"
            else:
                user_url = f"https://juejin.cn/{user_url}/posts"
        if 'juejin.cn/user/' in user_url and '/posts' not in user_url:
            user_url = user_url.rstrip('/') + '/posts'
        match = re.search(r'juejin\.cn/(?:user/)?(\w+)', user_url)
        if match:
            user_id = match.group(1)
            return f"https://juejin.cn/user/{user_id}/posts"
        return user_url
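

# --- Minimal usage sketch (not part of the original class; assumptions noted) ---
# The browser manager is only needed for the page-scraping fallback; as long as
# the API path succeeds, passing browser_manager=None is enough to try the
# fetcher from the command line. The fallback expects an object exposing an
# async new_page() method, as the calls above imply.
if __name__ == "__main__":
    import sys

    async def _demo(user_url: str) -> None:
        fetcher = BlogFetcher(browser_manager=None)
        for url in await fetcher.get_user_articles(user_url):
            print(url)

    if len(sys.argv) > 1:
        asyncio.run(_demo(sys.argv[1]))
    else:
        print("usage: python blog_fetcher.py <Juejin user URL or user ID>")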