Skip to content

Commit 217cf5d

Browse files
authored
Merge pull request #9 from gdsc-ncku/welcome
Welcome
2 parents 783e47a + c11d057 commit 217cf5d

14 files changed

Lines changed: 612 additions & 152 deletions

File tree

Lines changed: 45 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1-
'''
2-
爬 https://course.ncku.edu.tw/index.php?c=qry_all (成大選課系統) by seleniumy
3-
'''
1+
"""
2+
爬 https://course.ncku.edu.tw/index.php?c=qry_all (成大選課系統) by selenium
3+
"""
4+
45
import time
56
import json
67
import os
@@ -14,72 +15,74 @@
1415
from bs4 import BeautifulSoup
1516
from rich import print
1617

18+
1719
def setup_driver():
    """Build the Chrome WebDriver used by the scraper.

    Returns:
        A ``webdriver.Chrome`` instance created with the command-line
        flags listed below.
    """
    # Chrome command-line flags, applied in this order.
    flags = (
        "--headless",  # headless mode: no browser window is opened
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    )

    opts = Options()
    for flag in flags:
        opts.add_argument(flag)

    # Start the browser with the assembled options.
    return webdriver.Chrome(options=opts)
2830

31+
2932
def main():
3033
url = "https://course.ncku.edu.tw/index.php?c=qry_all"
3134
driver = setup_driver()
3235
driver.get(url)
3336
last_height = driver.execute_script("return document.body.scrollHeight")
3437
print(last_height)
35-
38+
3639
# 找到所有導航按鈕
37-
nav_elements = driver.find_elements(By.CLASS_NAME, 'btn_dept')
38-
course_list = [element.text for element in nav_elements if element.text != '']
39-
#for i in course_list: print(i)
40+
nav_elements = driver.find_elements(By.CLASS_NAME, "btn_dept")
41+
course_list = [element.text for element in nav_elements if element.text != ""]
42+
# for i in course_list: print(i)
4043
# 點擊每個按鈕並收集資料
4144
data = []
4245

43-
html_content=driver.page_source
44-
soup=BeautifulSoup(html_content,'lxml')
45-
div_elements=soup.find_all('li', class_='btn_dept')
46-
counter=len(div_elements)
46+
html_content = driver.page_source
47+
soup = BeautifulSoup(html_content, "lxml")
48+
div_elements = soup.find_all("li", class_="btn_dept")
49+
counter = len(div_elements)
4750

4851
for course in course_list:
4952
counter -= 1
5053
if counter < 0:
51-
#print(soup)
54+
# print(soup)
5255
break
5356
try:
5457
# 使用 XPath 查找按鈕
5558
button_xpath = f"//li[@class='btn_dept'][contains(text(), '{course}')]"
5659
button = WebDriverWait(driver, 10).until(
5760
EC.presence_of_element_located((By.XPATH, button_xpath))
5861
)
59-
62+
6063
# 點擊按鈕
6164
driver.execute_script("arguments[0].click();", button)
6265
time.sleep(1) # 等待頁面載入
63-
66+
6467
# 解析課程資料
65-
soup = BeautifulSoup(driver.page_source, 'lxml')
66-
table = soup.find('table', {'id': 'A9-table'})
67-
68+
soup = BeautifulSoup(driver.page_source, "lxml")
69+
table = soup.find("table", {"id": "A9-table"})
70+
6871
if table:
69-
rows = table.find_all('tr')
72+
rows = table.find_all("tr")
7073
for row in rows[1:]: # 跳過表頭
71-
columns = row.find_all('td')
74+
columns = row.find_all("td")
7275
course_data = {
73-
'系所名稱': columns[0].text.strip(),
74-
'系號-序號': columns[1].text.strip(),
75-
'年級': columns[2].text.strip(),
76-
'類別': columns[3].text.strip(),
77-
'科目名稱': columns[4].text.strip().split(' ')[0],
78-
'學分': columns[5].text.strip(),
79-
'教師姓名': columns[6].text.strip(),
80-
'已選課人數/餘額': columns[7].text.strip(),
81-
'時間/教室': columns[8].text.strip(),
82-
'是否有餘額': '額' not in columns[7].text.strip()
76+
"系所名稱": columns[0].text.strip(),
77+
"系號-序號": columns[1].text.strip(),
78+
"年級": columns[2].text.strip(),
79+
"類別": columns[3].text.strip(),
80+
"科目名稱": columns[4].text.strip().split(" ")[0],
81+
"學分": columns[5].text.strip(),
82+
"教師姓名": columns[6].text.strip(),
83+
"已選課人數/餘額": columns[7].text.strip(),
84+
"時間/教室": columns[8].text.strip(),
85+
"是否有餘額": "額" not in columns[7].text.strip(),
8386
}
8487
data.append(course_data)
8588

@@ -88,22 +91,23 @@ def main():
8891
WebDriverWait(driver, 10).until(
8992
EC.presence_of_element_located((By.XPATH, '//li[@class="btn_dept"]'))
9093
)
91-
94+
9295
except Exception as e:
9396
print(f"處理 {course} 時發生錯誤: {str(e)}")
9497
continue
95-
98+
9699
# 關閉瀏覽器
97100
driver.quit()
98-
101+
99102
# 儲存資料
100103
timestamp = time.strftime("%Y%m%d_%H%M%S")
101-
filename = f'ncku_courses_11302.json'
102-
with open(filename, 'w', encoding='utf-8') as f:
104+
filename = f"ncku_courses_11302.json"
105+
with open(filename, "w", encoding="utf-8") as f:
103106
json.dump(data, f, ensure_ascii=False, indent=2)
104-
107+
105108
print(f"課程資料已保存至: {filename}")
106109
print(f"總共收集到 {len(data)} 門課程")
107110

111+
108112
if __name__ == "__main__":
109-
main()
113+
main()
Lines changed: 80 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
'''
2-
爬 https://nckuhub.com/course,並用 chatgpt 文本摘要評論
3-
*需要openai api_key
4-
'''
1+
"""
2+
爬 https://nckuhub.com/course,並用 chatgpt 文本摘要評論
3+
*需要openai api_key
4+
"""
55

66
import time
77
import json
@@ -14,141 +14,151 @@
1414

1515
openai.api_key = ""
1616

17+
1718
# 定義摘要函式
1819
def summarize_text(content_list):
    """Summarize a course's reviews into a short text via the OpenAI chat API.

    Args:
        content_list: Review strings for a single course. ``None`` and
            entries that are not non-blank strings are ignored.

    Returns:
        The message content of the first choice in the API response, or an
        empty string when there is no usable review text (no request is
        sent in that case).
    """
    # Drop missing/blank entries: "\n".join() raises on non-strings, and an
    # empty prompt would still cost an API request.
    reviews = [
        text
        for text in (content_list or [])
        if isinstance(text, str) and text.strip()
    ]
    if not reviews:
        return ""

    content_str = "\n".join(reviews)

    # NOTE(review): openai.ChatCompletion looks like the pre-1.0 interface of
    # the openai package -- confirm the installed version supports it.
    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": "你是一個擅長文本摘要的 AI,請把課程評價做**30字內**簡潔且具體的摘要。",
            },
            {"role": "user", "content": content_str},
        ],
        temperature=0.5,
    )
    summary = response["choices"][0]["message"]["content"]
    print(summary)
    return summary
3236

37+
3338
def _normalize_score(value):
    """Turn a rating from the API into an int, or the string "None" if it is 0 or missing."""
    if value is None:
        return "None"
    if isinstance(value, str):
        # Go through float first: int() cannot parse a decimal string.
        value = float(value)
    value = int(value)
    return "None" if value == 0 else value


def _merge_duplicate(existing, course_info):
    """Add another offering's time and course code to an already-collected entry."""
    if isinstance(existing["時間"], list):
        existing["時間"].append(course_info["時間"])
        existing["課程編碼"].append(course_info["選課序號"])
    else:
        # First duplicate: promote the single values to lists.
        existing["時間"] = [existing["時間"], course_info["時間"]]
        existing["課程編碼"] = [existing["課程編碼"], course_info["選課序號"]]


def _save_courses(course_list):
    """Dump the collected courses to a timestamped JSON file and report it."""
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    filename = f"nckuhub_ts_{timestamp}.json"
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(course_list, f, ensure_ascii=False, indent=2)

    print(f"課程資料已保存至: {filename}")
    print(f"總共收集到 {len(course_list)} 門課程")


def main():
    """Scrape every course from nckuhub.com and save summarized reviews as JSON.

    For each course id returned by the listing endpoint, the course detail
    is fetched. A course whose name and teacher match an already-collected
    entry only contributes its time and course code to that entry. A new
    course with at least one comment gets its ratings normalized and its
    comments summarized with ``summarize_text``; new courses without
    comments are skipped. Results are written to a timestamped JSON file
    when the running counter hits a multiple of 300 and once more at the end.
    """
    url = "https://nckuhub.com/course"
    response = requests.get(url)
    if response.status_code != 200:
        # Without the listing there is nothing to scrape; say so instead of
        # ending silently.
        print(f"無法取得課程列表 (HTTP {response.status_code})")
        return

    # Parse the listing response as JSON and collect the course ids.
    course_ids = [entry.get("id") for entry in response.json()["courses"]]
    print(len(course_ids))

    course_list = []
    index = 1

    for course_id in course_ids:
        course_url = url + "/" + str(course_id)
        response = requests.get(course_url)
        if response.status_code != 200:
            continue

        data = response.json()
        course_info = data["courseInfo"]

        course_name = course_info["課程名稱"]
        print(course_name, index)
        index += 1

        # Look for an entry with the same course name and teacher.
        existing = next(
            (
                item
                for item in course_list
                if item["科目名稱"] == course_name
                and item["教師姓名"] == course_info["老師"]
            ),
            None,
        )

        if existing is not None:
            # Course already collected: only record the extra time/code.
            _merge_duplicate(existing, course_info)
        else:
            # New course: normalize the three ratings.
            got = _normalize_score(data["got"])
            sweet = _normalize_score(data["sweet"])
            cold = _normalize_score(data["cold"])

            comment_list = [entry["comment"] for entry in data["comment"]]

            # Courses without comments are not recorded at all.
            if not comment_list:
                time.sleep(0.5)
                continue

            # Pause before each summarization request.
            time.sleep(1)
            summary = summarize_text(comment_list)

            course_list.append(
                {
                    "科目名稱": course_info["課程名稱"],
                    "課程編碼": course_info["選課序號"],
                    "系所名稱": course_info["系所名稱"],
                    "教師姓名": course_info["老師"],
                    "時間": course_info["時間"],
                    "收穫": got,
                    "甜度": sweet,
                    "涼度": cold,
                    "評價": summary,
                }
            )

        # Periodic checkpoint, triggered when the counter is a multiple of 300.
        if index % 300 == 0:
            _save_courses(course_list)

    # Final save of everything collected.
    _save_courses(course_list)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)