refactor: reformat code with Black

yenslife · yenslife · commit b6529d8dbc65 · 2025-04-18T20:11:55.000+08:00
diff --git a/crawler/MRE_root/admin_procedure/gen_title_gpt.py b/crawler/MRE_root/admin_procedure/gen_title_gpt.py
@@ -4,20 +4,20 @@
 import json
 
 client = OpenAI(api_key="api_key")
-filename = '行政手續摘要.json'
+filename = "行政手續摘要.json"
 
 # 從 urls.txt 讀取 base_url
 main_urls = []
-with open('./web_crawler_urls.txt', 'r', encoding='utf-8') as file:
+with open("./web_crawler_urls.txt", "r", encoding="utf-8") as file:
     for line in file:
         url = line.strip().strip('"')
-        if url.startswith('http'):
+        if url.startswith("http"):
             main_urls.append(url)
 
 # 初始化 JSON 資料
 existing_data = []
 if os.path.exists(filename):
-    with open(filename, 'r', encoding='utf-8') as f:
+    with open(filename, "r", encoding="utf-8") as f:
         try:
             existing_data = json.load(f)
             if not isinstance(existing_data, list):
@@ -28,7 +28,7 @@
 # 對每個 URL 進行爬取和處理
 for base_url in main_urls:
     prefixed_url = f"https://r.jina.ai/{base_url}"
-    
+
     # 爬取網站內容
     try:
         response = requests.get(prefixed_url)
@@ -54,11 +54,11 @@
                 請使用**繁體中文**回應
                 請把所有資訊列點出來
                 不要反斜線和空白
-                可以附上連結在回應裡面"""
+                可以附上連結在回應裡面""",
             },
-            {"role": "user", "content": web_data}
+            {"role": "user", "content": web_data},
         ],
-        model="gpt-4o-mini"
+        model="gpt-4o-mini",
     )
 
     # 解析回應並格式化 JSON
@@ -72,5 +72,5 @@
         continue
 
 # 將所有結果寫入檔案
-with open(filename, 'w', encoding='utf-8') as f:
-    json.dump(existing_data, f, ensure_ascii=False, indent=4)
+with open(filename, "w", encoding="utf-8") as f:
+    json.dump(existing_data, f, ensure_ascii=False, indent=4)
diff --git a/crawler/MRE_root/admin_procedure/main.py b/crawler/MRE_root/admin_procedure/main.py
@@ -7,13 +7,15 @@
 
 os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
 
+
 # 將主要網頁的子網頁存入 urls.txt
 def get_link(main_url):
     links = fetch_suburls(main_url)
     with open("./sub_urls/urls.txt", "w") as f:
         for link in links:
             f.write(f"{link}\n")
 
+
 def jina():
     client = OpenAI()
 
@@ -34,24 +36,25 @@ def jina():
             print(f"Text file saved as: ./txt/{filename}.txt")
         else:
             print(f"Skipping URL: {url}", "classified as False")
-        
+
         time.sleep(3)
     # Clear the content of urls.txt after processing all URLs
     open("./sub_urls/urls.txt", "w").close()
 
+
 if __name__ == "__main__":
     # 讀取 txt 檔案並提取 URL 列表
     main_urls = []
-    with open('web_crawler_urls.txt', 'r') as file:
+    with open("web_crawler_urls.txt", "r") as file:
         for line in file:
             # 去除首尾空白，只保留純粹的 URL
             url = line.strip()
             # 檢查是否為有效的 URL（避免處理最後一行或其他無效行）
-            if url.startswith('http'):
+            if url.startswith("http"):
                 main_urls.append(url)
 
     print(main_urls)
 
     for main_url in main_urls:
         get_link(main_url)
-        jina()
+        jina()
diff --git a/crawler/MRE_root/admin_procedure/utils/content_gen.py b/crawler/MRE_root/admin_procedure/utils/content_gen.py
@@ -10,25 +10,26 @@ def generate_content(client, web_data):
                 請使用**繁體中文**回應
                 請把所有資訊列點出來
                 不要反斜線和空白
-                **連結請務必保留並且一定要附上，包含下載連結**"""
+                **連結請務必保留並且一定要附上，包含下載連結**""",
             },
-            {"role": "user", "content": web_data}
+            {"role": "user", "content": web_data},
         ],
-        model="gpt-4o-mini"
+        model="gpt-4o-mini",
     )
     return chat_completion.choices[0].message.content.strip()
 
+
 def question_classifaier(client, web_data):
     chat_completion = client.chat.completions.create(
         messages=[
             {
                 "role": "system",
                 "content": """請幫助分辨以下內容是否屬於校園的行政手續問題:
                 如註冊、申請宿舍、學籍變更等。
-                **只要輸出True或False即可。**"""
+                **只要輸出True或False即可。**""",
             },
-            {"role": "user", "content": web_data}
+            {"role": "user", "content": web_data},
         ],
-        model="gpt-4o-mini"
+        model="gpt-4o-mini",
     )
-    return chat_completion.choices[0].message.content.strip()
+    return chat_completion.choices[0].message.content.strip()
diff --git a/crawler/MRE_root/admin_procedure/utils/file_ops.py b/crawler/MRE_root/admin_procedure/utils/file_ops.py
@@ -3,18 +3,22 @@
 import re
 import os
 
+
 def create_safe_filename(base_url):
     # Create a safe filename from the given base URL
     parsed_url = urllib.parse.urlparse(base_url)
-    filename = re.sub(r'[^\w\-\_\.]', '', parsed_url.netloc + parsed_url.path)  # 支援網址檔名
-    if not filename.endswith('.xml'):
-        filename += '.xml'
+    filename = re.sub(
+        r"[^\w\-\_\.]", "", parsed_url.netloc + parsed_url.path
+    )  # 支援網址檔名
+    if not filename.endswith(".xml"):
+        filename += ".xml"
     return filename
 
+
 def save_xml(content, filename):
     # Save the content as an XML file
-    root = ET.Element('root')
+    root = ET.Element("root")
     root.text = content
     tree = ET.ElementTree(root)
-    os.makedirs('./xml', exist_ok=True)
-    tree.write(f'./xml/{filename}', encoding='UTF-8', xml_declaration=True)
+    os.makedirs("./xml", exist_ok=True)
+    tree.write(f"./xml/{filename}", encoding="UTF-8", xml_declaration=True)
diff --git a/crawler/MRE_root/admin_procedure/utils/web_ops.py b/crawler/MRE_root/admin_procedure/utils/web_ops.py
@@ -1,17 +1,19 @@
 import requests
 from bs4 import BeautifulSoup
 
+
 def fetch_url_content(url):
     # Fetch content from the given URL
     try:
         response = requests.get(url)
         response.raise_for_status()
-        response.encoding = 'UTF-8'
+        response.encoding = "UTF-8"
         return response.text
     except requests.RequestException as e:
         print(f"Error fetching the URL: {e}")
         return None
 
+
 def fetch_suburls(url):
     # Fetch sub-URLs from the given URL
     response = requests.get(url)
@@ -21,20 +23,21 @@ def fetch_suburls(url):
         print(f"Successfully fetched.")
 
     # 解析網頁內容
-    soup = BeautifulSoup(response.text, 'html.parser')
+    soup = BeautifulSoup(response.text, "html.parser")
 
     # 提取所有連結
     links = []
-    for a in soup.find_all('a', href=True):
-        link = a['href']
-        if link.startswith('http'):
+    for a in soup.find_all("a", href=True):
+        link = a["href"]
+        if link.startswith("http"):
             links.append(link)
-        elif link.startswith('/'):
+        elif link.startswith("/"):
             links.append(f"{url.rstrip('/')}{link}")
         else:
             links.append(f"{url.rstrip('/')}/{link}")
     return links
 
+
 if __name__ == "__main__":
     main_url = "https://www.google.com/"
-    print(fetch_suburls(main_url))
+    print(fetch_suburls(main_url))