@@ -48,14 +48,24 @@ def parseLiItem(liItem):
4848
4949
def getDayEvents(html):
    """Collect the events listed under the page's 大事记 section.

    The section is located with a regular expression that captures
    everything between the level-2 heading whose id is 大事记 (in any of the
    accepted character variants) and the next level-2 heading. Every ``<li>``
    inside that span, including those nested in sub-sections, is handed to
    ``parseLiItem`` and the returned items are concatenated.

    The pattern requires a later level-2 heading to follow the section; if
    none is present the section is treated as not found.

    Args:
        html: Full HTML text of a date page.

    Returns:
        A list of the items produced by ``parseLiItem`` for each ``<li>``;
        empty when the section cannot be found.
    """
    section_pattern = re.compile(
        r'<div class="mw-heading mw-heading2"><h2 id=\"大事[记記纪紀]\">.*?</h2>.*?</div>([\s\S]*?)<div class="mw-heading mw-heading2">')
    section_match = section_pattern.search(html)

    # Named ``events`` rather than ``list`` so the builtin is not shadowed.
    events = []

    if section_match:
        # Group 1 is the markup between the 大事记 heading and the next
        # level-2 heading.
        section_html = section_match.group(1)

        for li in BeautifulSoup(section_html, "html.parser").findAll("li"):
            events.extend(parseLiItem(li))
    else:
        print("未找到大事记部分")

    print("共有 %s 条" % len(events))

    return events
@@ -67,7 +77,8 @@ def main():
6777
6878 for date in dateList :
6979 print ("正在获取 %s 的数据..." % date )
70- r = requests .get ("https://zh.wikipedia.org/zh-cn/%s" % date )
80+ r = requests .get ("https://zh.wikipedia.org/wiki/%s" %
81+ date , headers = {'Accept-Language' : 'en-US,zh-CN;q=0.5' })
7182 events = getDayEvents (r .text )
7283 dd = datetime .datetime .strptime (date , "%m月%d日" )
7384
@@ -76,7 +87,7 @@ def main():
7687 data [dd .month ][dd .day ] = events
7788
7889 with open ("../src/data/today_in_history.json" , "w" , encoding = "utf-8" ) as f :
79- json .dump (data , f , ensure_ascii = False , indent = 4 )
90+ json .dump (data , f , ensure_ascii = False , indent = 2 )
8091
8192
8293if __name__ == "__main__" :
0 commit comments