Skip to content

Commit 5a6af93

Browse files
committed
feat: update data
1 parent fb7f216 commit 5a6af93

File tree

3 files changed

+1825
-1243
lines changed

3 files changed

+1825
-1243
lines changed

spiders/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
### 使用
44

55
- `python -m ensurepip`
6-
- `pip install --user pipenv`
6+
- `pip install pipenv`
77
- `pipenv install`
88
- `pipenv run python -m today`
99

spiders/today.py

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -48,14 +48,24 @@ def parseLiItem(liItem):
4848

4949

5050
def getDayEvents(html):
51-
match = re.compile(
52-
"(<span class=\"mw-headline\" id=\"大事[记記纪紀]\">[\s\S]*?)<h2>").search(html)
51+
# Find the main 大事记 section
52+
main_section_pattern = re.compile(
53+
r'<div class="mw-heading mw-heading2"><h2 id=\"大事[记記纪紀]\">.*?</h2>.*?</div>([\s\S]*?)<div class="mw-heading mw-heading2">')
54+
main_match = main_section_pattern.search(html)
55+
5356
list = []
5457

55-
if match:
56-
bsObj = BeautifulSoup(match.group(1), "html.parser").findAll("li")
58+
if main_match:
59+
# Get all content between 大事记 and the next main heading
60+
content = main_match.group(1)
61+
62+
# Process all <li> elements within this section (across all subsections)
63+
bsObj = BeautifulSoup(content, "html.parser").findAll("li")
5764
for li in bsObj:
5865
list.extend(parseLiItem(li))
66+
else:
67+
print("未找到大事记部分")
68+
5969
print("共有 %s 条" % list.__len__())
6070

6171
return list
@@ -67,7 +77,8 @@ def main():
6777

6878
for date in dateList:
6979
print("正在获取 %s 的数据..." % date)
70-
r = requests.get("https://zh.wikipedia.org/zh-cn/%s" % date)
80+
r = requests.get("https://zh.wikipedia.org/wiki/%s" %
81+
date, headers={'Accept-Language': 'en-US,zh-CN;q=0.5'})
7182
events = getDayEvents(r.text)
7283
dd = datetime.datetime.strptime(date, "%m月%d日")
7384

@@ -76,7 +87,7 @@ def main():
7687
data[dd.month][dd.day] = events
7788

7889
with open("../src/data/today_in_history.json", "w", encoding="utf-8") as f:
79-
json.dump(data, f, ensure_ascii=False, indent=4)
90+
json.dump(data, f, ensure_ascii=False, indent=2)
8091

8192

8293
if __name__ == "__main__":

0 commit comments

Comments
 (0)