Skip to content

Commit b6529d8

Browse files
committed
refactor: black reformat code
1 parent d71ff58 commit b6529d8

5 files changed

Lines changed: 45 additions & 34 deletions

File tree

crawler/MRE_root/admin_procedure/gen_title_gpt.py

Lines changed: 10 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -4,20 +4,20 @@
44
import json
55

66
client = OpenAI(api_key="api_key")
7-
filename = '行政手續摘要.json'
7+
filename = "行政手續摘要.json"
88

99
# 從 urls.txt 讀取 base_url
1010
main_urls = []
11-
with open('./web_crawler_urls.txt', 'r', encoding='utf-8') as file:
11+
with open("./web_crawler_urls.txt", "r", encoding="utf-8") as file:
1212
for line in file:
1313
url = line.strip().strip('"')
14-
if url.startswith('http'):
14+
if url.startswith("http"):
1515
main_urls.append(url)
1616

1717
# 初始化 JSON 資料
1818
existing_data = []
1919
if os.path.exists(filename):
20-
with open(filename, 'r', encoding='utf-8') as f:
20+
with open(filename, "r", encoding="utf-8") as f:
2121
try:
2222
existing_data = json.load(f)
2323
if not isinstance(existing_data, list):
@@ -28,7 +28,7 @@
2828
# 對每個 URL 進行爬取和處理
2929
for base_url in main_urls:
3030
prefixed_url = f"https://r.jina.ai/{base_url}"
31-
31+
3232
# 爬取網站內容
3333
try:
3434
response = requests.get(prefixed_url)
@@ -54,11 +54,11 @@
5454
請使用**繁體中文**回應
5555
請把所有資訊列點出來
5656
不要反斜線和空白
57-
可以附上連結在回應裡面"""
57+
可以附上連結在回應裡面""",
5858
},
59-
{"role": "user", "content": web_data}
59+
{"role": "user", "content": web_data},
6060
],
61-
model="gpt-4o-mini"
61+
model="gpt-4o-mini",
6262
)
6363

6464
# 解析回應並格式化 JSON
@@ -72,5 +72,5 @@
7272
continue
7373

7474
# 將所有結果寫入檔案
75-
with open(filename, 'w', encoding='utf-8') as f:
76-
json.dump(existing_data, f, ensure_ascii=False, indent=4)
75+
with open(filename, "w", encoding="utf-8") as f:
76+
json.dump(existing_data, f, ensure_ascii=False, indent=4)

crawler/MRE_root/admin_procedure/main.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,15 @@
77

88
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
99

10+
1011
# 將主要網頁的子網頁存入 urls.txt
1112
def get_link(main_url):
1213
links = fetch_suburls(main_url)
1314
with open("./sub_urls/urls.txt", "w") as f:
1415
for link in links:
1516
f.write(f"{link}\n")
1617

18+
1719
def jina():
1820
client = OpenAI()
1921

@@ -34,24 +36,25 @@ def jina():
3436
print(f"Text file saved as: ./txt/{filename}.txt")
3537
else:
3638
print(f"Skipping URL: {url}", "classified as False")
37-
39+
3840
time.sleep(3)
3941
# Clear the content of urls.txt after processing all URLs
4042
open("./sub_urls/urls.txt", "w").close()
4143

44+
4245
if __name__ == "__main__":
4346
# 讀取 txt 檔案並提取 URL 列表
4447
main_urls = []
45-
with open('web_crawler_urls.txt', 'r') as file:
48+
with open("web_crawler_urls.txt", "r") as file:
4649
for line in file:
4750
# 去除首尾空白,只保留純粹的 URL
4851
url = line.strip()
4952
# 檢查是否為有效的 URL(避免處理最後一行或其他無效行)
50-
if url.startswith('http'):
53+
if url.startswith("http"):
5154
main_urls.append(url)
5255

5356
print(main_urls)
5457

5558
for main_url in main_urls:
5659
get_link(main_url)
57-
jina()
60+
jina()

crawler/MRE_root/admin_procedure/utils/content_gen.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,25 +10,26 @@ def generate_content(client, web_data):
1010
請使用**繁體中文**回應
1111
請把所有資訊列點出來
1212
不要反斜線和空白
13-
**連結請務必保留並且一定要附上,包含下載連結**"""
13+
**連結請務必保留並且一定要附上,包含下載連結**""",
1414
},
15-
{"role": "user", "content": web_data}
15+
{"role": "user", "content": web_data},
1616
],
17-
model="gpt-4o-mini"
17+
model="gpt-4o-mini",
1818
)
1919
return chat_completion.choices[0].message.content.strip()
2020

21+
2122
def question_classifaier(client, web_data):
2223
chat_completion = client.chat.completions.create(
2324
messages=[
2425
{
2526
"role": "system",
2627
"content": """請幫助分辨以下內容是否屬於校園的行政手續問題:
2728
如註冊、申請宿舍、學籍變更等。
28-
**只要輸出True或False即可。**"""
29+
**只要輸出True或False即可。**""",
2930
},
30-
{"role": "user", "content": web_data}
31+
{"role": "user", "content": web_data},
3132
],
32-
model="gpt-4o-mini"
33+
model="gpt-4o-mini",
3334
)
34-
return chat_completion.choices[0].message.content.strip()
35+
return chat_completion.choices[0].message.content.strip()

crawler/MRE_root/admin_procedure/utils/file_ops.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,18 +3,22 @@
33
import re
44
import os
55

6+
67
def create_safe_filename(base_url):
78
# Create a safe filename from the given base URL
89
parsed_url = urllib.parse.urlparse(base_url)
9-
filename = re.sub(r'[^\w\-\_\.]', '', parsed_url.netloc + parsed_url.path) # 支援網址檔名
10-
if not filename.endswith('.xml'):
11-
filename += '.xml'
10+
filename = re.sub(
11+
r"[^\w\-\_\.]", "", parsed_url.netloc + parsed_url.path
12+
) # 支援網址檔名
13+
if not filename.endswith(".xml"):
14+
filename += ".xml"
1215
return filename
1316

17+
1418
def save_xml(content, filename):
1519
# Save the content as an XML file
16-
root = ET.Element('root')
20+
root = ET.Element("root")
1721
root.text = content
1822
tree = ET.ElementTree(root)
19-
os.makedirs('./xml', exist_ok=True)
20-
tree.write(f'./xml/{filename}', encoding='UTF-8', xml_declaration=True)
23+
os.makedirs("./xml", exist_ok=True)
24+
tree.write(f"./xml/{filename}", encoding="UTF-8", xml_declaration=True)
Lines changed: 10 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1,17 +1,19 @@
11
import requests
22
from bs4 import BeautifulSoup
33

4+
45
def fetch_url_content(url):
56
# Fetch content from the given URL
67
try:
78
response = requests.get(url)
89
response.raise_for_status()
9-
response.encoding = 'UTF-8'
10+
response.encoding = "UTF-8"
1011
return response.text
1112
except requests.RequestException as e:
1213
print(f"Error fetching the URL: {e}")
1314
return None
1415

16+
1517
def fetch_suburls(url):
1618
# Fetch sub-URLs from the given URL
1719
response = requests.get(url)
@@ -21,20 +23,21 @@ def fetch_suburls(url):
2123
print(f"Successfully fetched.")
2224

2325
# 解析網頁內容
24-
soup = BeautifulSoup(response.text, 'html.parser')
26+
soup = BeautifulSoup(response.text, "html.parser")
2527

2628
# 提取所有連結
2729
links = []
28-
for a in soup.find_all('a', href=True):
29-
link = a['href']
30-
if link.startswith('http'):
30+
for a in soup.find_all("a", href=True):
31+
link = a["href"]
32+
if link.startswith("http"):
3133
links.append(link)
32-
elif link.startswith('/'):
34+
elif link.startswith("/"):
3335
links.append(f"{url.rstrip('/')}{link}")
3436
else:
3537
links.append(f"{url.rstrip('/')}/{link}")
3638
return links
3739

40+
3841
if __name__ == "__main__":
3942
main_url = "https://www.google.com/"
40-
print(fetch_suburls(main_url))
43+
print(fetch_suburls(main_url))

0 commit comments

Comments
 (0)