44import json
55
66client = OpenAI (api_key = "api_key" )
7- filename = ' 行政手續摘要.json'
7+ filename = " 行政手續摘要.json"
88
99# 從 urls.txt 讀取 base_url
1010main_urls = []
11- with open (' ./web_crawler_urls.txt' , 'r' , encoding = ' utf-8' ) as file :
11+ with open (" ./web_crawler_urls.txt" , "r" , encoding = " utf-8" ) as file :
1212 for line in file :
1313 url = line .strip ().strip ('"' )
14- if url .startswith (' http' ):
14+ if url .startswith (" http" ):
1515 main_urls .append (url )
1616
1717# 初始化 JSON 資料
1818existing_data = []
1919if os .path .exists (filename ):
20- with open (filename , 'r' , encoding = ' utf-8' ) as f :
20+ with open (filename , "r" , encoding = " utf-8" ) as f :
2121 try :
2222 existing_data = json .load (f )
2323 if not isinstance (existing_data , list ):
2828# 對每個 URL 進行爬取和處理
2929for base_url in main_urls :
3030 prefixed_url = f"https://r.jina.ai/{ base_url } "
31-
31+
3232 # 爬取網站內容
3333 try :
3434 response = requests .get (prefixed_url )
5454 請使用**繁體中文**回應
5555 請把所有資訊列點出來
5656 不要反斜線和空白
57- 可以附上連結在回應裡面"""
57+ 可以附上連結在回應裡面""" ,
5858 },
59- {"role" : "user" , "content" : web_data }
59+ {"role" : "user" , "content" : web_data },
6060 ],
61- model = "gpt-4o-mini"
61+ model = "gpt-4o-mini" ,
6262 )
6363
6464 # 解析回應並格式化 JSON
7272 continue
7373
7474# 將所有結果寫入檔案
75- with open (filename , 'w' , encoding = ' utf-8' ) as f :
76- json .dump (existing_data , f , ensure_ascii = False , indent = 4 )
75+ with open (filename , "w" , encoding = " utf-8" ) as f :
76+ json .dump (existing_data , f , ensure_ascii = False , indent = 4 )
0 commit comments