Skip to content

Commit e07e705

Browse files
committed
add crawler_nckuhub and crawler_for_course
1 parent caa78f9 commit e07e705

2 files changed

Lines changed: 263 additions & 0 deletions

File tree

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
'''
2+
爬 https://course.ncku.edu.tw/index.php?c=qry_all (成大選課系統) by selenium
3+
'''
4+
import time
5+
import json
6+
import os
7+
import re
8+
from selenium import webdriver
9+
from selenium.webdriver.chrome.service import Service
10+
from selenium.webdriver.chrome.options import Options
11+
from selenium.webdriver.common.by import By
12+
from selenium.webdriver.support.ui import WebDriverWait
13+
from selenium.webdriver.support import expected_conditions as EC
14+
from bs4 import BeautifulSoup
15+
from rich import print
16+
17+
def setup_driver():
    """Build and return a headless Chrome WebDriver.

    Chrome is started with the ``--headless``, ``--disable-gpu``,
    ``--no-sandbox`` and ``--disable-dev-shm-usage`` switches, added in
    that order.

    Returns:
        The ``webdriver.Chrome`` instance created with those options.
    """
    # Command-line switches handed to Chrome; headless means no browser
    # window is opened.
    switches = (
        '--headless',
        '--disable-gpu',
        '--no-sandbox',
        '--disable-dev-shm-usage',
    )

    opts = Options()
    for switch in switches:
        opts.add_argument(switch)

    # Start the browser with the configured options.
    return webdriver.Chrome(options=opts)
28+
29+
def _parse_course_row(columns):
    """Convert the ``<td>`` cells of one result row into a course dict.

    Args:
        columns: Cell elements of one table row; each must expose ``.text``.

    Returns:
        A dict with the course fields, or ``None`` when the row has fewer
        than the nine cells read below and therefore cannot be mapped.
    """
    if len(columns) < 9:
        return None

    cells = [column.text.strip() for column in columns[:9]]
    return {
        '系所名稱': cells[0],
        '系號-序號': cells[1],
        '年級': cells[2],
        '類別': cells[3],
        # Only the text before the first space is kept as the course name.
        '科目名稱': cells[4].split(' ')[0],
        '學分': cells[5],
        '教師姓名': cells[6],
        '已選課人數/餘額': cells[7],
        '時間/教室': cells[8],
        # True when the enrolment cell does not contain the character '額'.
        '是否有餘額': '額' not in cells[7],
    }


def main():
    """Crawl the NCKU course query page and dump every course to JSON.

    For each department button (``btn_dept``) on the index page the button is
    clicked, the ``A9-table`` table of the resulting page is parsed, and the
    browser navigates back. The collected rows are written to
    ``ncku_courses_11302.json`` in the current directory.

    A failure while handling one department is printed and the crawl moves
    on to the next one. The browser is always closed, even when an
    unexpected error escapes the loop.
    """
    url = "https://course.ncku.edu.tw/index.php?c=qry_all"
    data = []

    driver = setup_driver()
    try:
        driver.get(url)
        last_height = driver.execute_script("return document.body.scrollHeight")
        print(last_height)

        # Collect the visible text of every department navigation button.
        nav_elements = driver.find_elements(By.CLASS_NAME, 'btn_dept')
        course_list = [element.text for element in nav_elements if element.text != '']

        # Never visit more departments than there are <li class="btn_dept">
        # entries in the page source.
        soup = BeautifulSoup(driver.page_source, 'lxml')
        max_departments = len(soup.find_all('li', class_='btn_dept'))

        for course in course_list[:max_departments]:
            try:
                # Locate the button through XPath.
                # NOTE(review): contains() returns the first <li> whose text
                # includes the name, so a name that is a substring of another
                # department may hit the wrong button - confirm on the site.
                button_xpath = f"//li[@class='btn_dept'][contains(text(), '{course}')]"
                button = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, button_xpath))
                )

                # Click through JavaScript, then give the page time to load.
                driver.execute_script("arguments[0].click();", button)
                time.sleep(1)

                # Parse the course table of the department page.
                soup = BeautifulSoup(driver.page_source, 'lxml')
                table = soup.find('table', {'id': 'A9-table'})

                if table:
                    for row in table.find_all('tr')[1:]:  # skip the header row
                        course_data = _parse_course_row(row.find_all('td'))
                        # Rows that cannot be mapped are skipped so that one odd
                        # row neither drops the department nor prevents the
                        # navigation back to the index below.
                        if course_data is not None:
                            data.append(course_data)

                print(f"已掃描: {course}")
                driver.back()
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, '//li[@class="btn_dept"]'))
                )

            except Exception as e:
                # Best effort: report the failing department and keep crawling.
                print(f"處理 {course} 時發生錯誤: {str(e)}")
                continue
    finally:
        # Always release the browser process, including on unexpected errors.
        driver.quit()

    # Save the collected data.
    filename = 'ncku_courses_11302.json'
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    print(f"課程資料已保存至: {filename}")
    print(f"總共收集到 {len(data)} 門課程")
107+
108+
# Run the crawler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
'''
2+
爬 https://nckuhub.com/course,並用 chatgpt 文本摘要評論
3+
*需要openai api_key
4+
'''
5+
6+
import time
7+
import json
8+
import os
9+
import requests
10+
from selenium.webdriver.support import expected_conditions as EC
11+
from bs4 import BeautifulSoup
12+
from rich import print
13+
import openai
14+
15+
# Read the API key from the OPENAI_API_KEY environment variable so the secret
# never has to be written into (and committed with) this file. Falls back to
# an empty string, as before, when the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")
16+
17+
# 定義摘要函式
18+
def summarize_text(content_list):
    """Summarise course reviews through the OpenAI chat completion API.

    Args:
        content_list: Review strings; they are joined with newlines and sent
            as the user message.

    Returns:
        The message content of the first choice in the API response. The
        same text is also printed.
    """
    system_prompt = {
        "role": "system",
        "content": "你是一個擅長文本摘要的 AI,請把課程評價做**30字內**簡潔且具體的摘要。",
    }
    user_prompt = {"role": "user", "content": "\n".join(content_list)}

    completion = openai.ChatCompletion.create(
        model="gpt-4o-mini",
        messages=[system_prompt, user_prompt],
        temperature=0.5,
    )

    digest = completion["choices"][0]["message"]["content"]
    print(digest)
    return digest
32+
33+
def _normalize_score(value):
    """Return a rating as an int, or the string 'None' when zero or unusable.

    Strings are parsed as floats first, then truncated to an int. Missing or
    unparseable values yield 'None' instead of raising.
    """
    try:
        score = int(float(value))
    except (TypeError, ValueError, OverflowError):
        return 'None'
    return score if score != 0 else 'None'


def _add_section(existing, course_info):
    """Record another time slot and course code on an already stored course.

    Each field becomes a list the first time a second value is added; later
    values are appended to that list.
    """
    for field, source_key in (('時間', "時間"), ('課程編碼', "選課序號")):
        current = existing[field]
        if isinstance(current, list):
            current.append(course_info[source_key])
        else:
            existing[field] = [current, course_info[source_key]]


def _save_courses(course_list):
    """Write the collected courses to a timestamped JSON file and report it."""
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    filename = f'nckuhub_ts_{timestamp}.json'
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(course_list, f, ensure_ascii=False, indent=2)

    print(f"課程資料已保存至: {filename}")
    print(f"總共收集到 {len(course_list)} 門課程")


def main():
    """Crawl nckuhub.com course pages and store rated, summarised courses.

    The course list is fetched first, then every course page is requested.
    A course with at least one comment is stored together with its scores
    and an AI summary of the comments. A course whose name and teacher were
    already stored only contributes its time slot and course code to the
    stored entry. Courses without comments are skipped.

    Results are checkpointed to a timestamped JSON file after every 300
    fetched course pages and once more at the end.
    """
    url = "https://nckuhub.com/course"
    timeout = 30  # seconds; stops a stalled connection from hanging the crawl

    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print(f"無法取得課程清單 (HTTP {response.status_code})")
        return

    # Parse the response as JSON and collect every course id.
    course_ids = [course.get('id') for course in response.json()['courses']]
    print(len(course_ids))

    course_list = []
    # (course name, teacher) -> stored course dict, for duplicate detection.
    # Assumes both values are hashable (strings) - TODO confirm against the API.
    stored = {}
    fetched = 0

    for course_id in course_ids:
        course_url = url + '/' + str(course_id)
        try:
            response = requests.get(course_url, timeout=timeout)
        except requests.RequestException as e:
            # Best effort: one unreachable page must not discard the whole run.
            print(f"處理 {course_url} 時發生錯誤: {e}")
            continue
        if response.status_code != 200:
            continue

        data = response.json()
        course_info = data['courseInfo']
        course_name = course_info["課程名稱"]
        fetched += 1
        print(course_name, fetched)

        key = (course_name, course_info["老師"])
        existing = stored.get(key)

        if existing is not None:
            # Known course: only add the new time slot and course code.
            _add_section(existing, course_info)
        else:
            comment_list = [entry['comment'] for entry in data.get('comment') or []]
            if comment_list:
                # Pause before each summary request to pace the API calls.
                time.sleep(1)
                course_data = {
                    '科目名稱': course_name,
                    '課程編碼': course_info["選課序號"],
                    '系所名稱': course_info["系所名稱"],
                    '教師姓名': course_info["老師"],
                    '時間': course_info["時間"],
                    '收穫': _normalize_score(data.get('got')),
                    '甜度': _normalize_score(data.get('sweet')),
                    '涼度': _normalize_score(data.get('cold')),
                    '評價': summarize_text(comment_list),
                }
                course_list.append(course_data)
                stored[key] = course_data
            else:
                # Nothing to summarise: skip the course after a short pause.
                time.sleep(0.5)

        # Checkpoint every 300 fetched pages. This runs for every fetched
        # page, so a skipped course can no longer cause a missed checkpoint.
        if fetched % 300 == 0:
            _save_courses(course_list)

    # Final save.
    _save_courses(course_list)
151+
152+
153+
# Run the crawler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)