Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 116 additions & 0 deletions Modal Lab/courses/course_scrape.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
import json
import os
import re
import uuid

import modal

from main import scrape_courses

stub = modal.Stub(name="link-scraper")
image = modal.Image.debian_slim().pip_install("langchain", "supabase", "openai", "tiktoken", "python-dotenv")

# These packages are only installed in the Modal container image, so they are
# importable only when the code runs inside the container.
if stub.is_inside():
    from supabase import create_client, Client
    from langchain.embeddings.openai import OpenAIEmbeddings
    from langchain.text_splitter import MarkdownHeaderTextSplitter
    from langchain.vectorstores import SupabaseVectorStore


# When running on the local machine, load secrets from .env and publish them
# to the app as a modal.Dict so remote functions can read them.
if modal.is_local():
    from dotenv import load_dotenv
    load_dotenv()
    stub.data_dict = modal.Dict({
        "OPENAI_API_KEY": os.getenv("OPENAI_API_KEY"),
        "SUPABASE_URL": os.getenv("SUPABASE_URL"),
        "SUPABASE_SERVICE_KEY": os.getenv("SUPABASE_SERVICE_KEY")
    })

@stub.function()
def get_course():
    """Scrape all course/evaluation data and return it.

    Returns:
        Nested dict produced by ``scrape_courses`` (keyed by term, then
        course id — see ``write_to_md`` for the fields consumed downstream).
    """
    print("scraping courses...")
    return scrape_courses()

# def get_instr_eval():
# print("scraping instructor evaluations...")
# instr_eval = main_eval_instr.eval_instr_run()
# f = open('all_evals_instr.json', 'w')
# json.dump(instr_eval, f, indent=4)
# f.close()

@stub.function()
def write_to_md(courses, path):
    """Serialize scraped course data to a markdown file at *path* (overwritten).

    Each course becomes a section opened by a level-1 ``#`` header followed by
    one ``key: value`` line per field, so the file can later be split back
    into per-course documents on those headers.

    Args:
        courses: nested dict, ``courses[term][course]`` -> course record.
        path: destination markdown file path.
    """
    # All of these metrics share the same
    # Class Mean / Rice Mean / Responses / Distribution structure.
    metrics = (
        "Organization",
        "Assignment",
        "Overall Quality",
        "Challenge",
        "Workload",
        "Why take this course",
        "Expected Grade",
        "Expected P/F",
    )
    with open(path, "w", encoding="utf-8") as f:
        for term in courses:
            for course in courses[term]:
                record = courses[term][course]
                print(record['name'])
                f.write(f"# {record['name'].strip()}\n")
                f.write(f"course name: {record['name'].strip()}\n")
                f.write(f"subject: {record['cField']}\n")
                f.write(f"course number: {record['cNum']}\n")
                f.write(f"instructor: {record['instructor']}\n")
                f.write(f"term: {term}\n")
                for metric in metrics:
                    stats = record[metric]
                    f.write(
                        f"{metric} - Class Mean: {stats['Class Mean']} "
                        f"Rice Mean: {stats['Rice Mean']} "
                        f"Responses: {stats['Responses']} "
                        f"Distribution: {stats['Distribution']}\n"
                    )
                f.write(f"comments: {record['comments']['comments']}\n")

                # Blank line terminates this course's section.
                f.write("\n")

@stub.function()
def add_uuid_to_headers(markdown_file):
    """Append a unique tag (`` [<uuid4>]``) to every level-1 header, in place.

    The tags keep otherwise-identical course headers distinct when the file
    is later split into per-course documents.

    Args:
        markdown_file: path of the markdown file to rewrite.
    """
    # Read/write explicitly as UTF-8 to match write_to_md; relying on the
    # platform default encoding breaks non-ASCII comment text.
    with open(markdown_file, 'r', encoding="utf-8") as file:
        content = file.readlines()

    updated_content = []
    for line in content:
        # Level-1 headers only: "# ..." but not "## ...".
        if re.match(r'^#[^#]', line):
            unique_id = str(uuid.uuid4())
            line = line.rstrip() + f" [{unique_id}]\n"
        updated_content.append(line)

    with open(markdown_file, 'w', encoding="utf-8") as file:
        file.writelines(updated_content)

@stub.function(image=image)
def get_pages(path):
    """Split the markdown file at *path* into one document per course.

    Args:
        path: markdown file previously produced by ``write_to_md``.

    Returns:
        list of langchain documents, one per level-1 ``#`` section.
    """
    # Match the UTF-8 encoding used when the file was written.
    with open(path, 'r', encoding="utf-8") as f:
        contents = f.read()

    # Split on level-1 headers so each course becomes its own document.
    headers_to_split_on = [
        ("#", "course name"),
    ]
    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    md_header_splits = markdown_splitter.split_text(contents)

    print(len(md_header_splits))
    return list(md_header_splits)

@stub.function(schedule=modal.Period(days=30), image=image)
def timed_scrape():
    """Monthly scheduled job: re-scrape courses and rebuild the vector store.

    Pipeline: scrape -> markdown file -> uuid-tagged headers -> split into
    per-course pages -> wipe and repopulate the Supabase 'courses' table.
    """
    # NOTE(review): these invoke the sibling Modal functions directly; whether
    # that runs them locally or requires .call()/.remote() depends on the Modal
    # client version — confirm against the pinned modal package.
    courses = get_course()
    fp = "courses.md"
    write_to_md(courses, fp)
    add_uuid_to_headers(fp)
    pages = get_pages(fp)
    print("Pages:", len(pages))
    # Secrets were published at deploy time via stub.data_dict (module top).
    # NOTE(review): they are read back here through stub.app.data_dict — verify
    # both paths resolve to the same shared Dict in this Modal version.
    client: Client = create_client(stub.app.data_dict["SUPABASE_URL"], stub.app.data_dict["SUPABASE_SERVICE_KEY"])
    embeddings = OpenAIEmbeddings(openai_api_key=stub.app.data_dict["OPENAI_API_KEY"])
    vector_store = SupabaseVectorStore(client=client,
                                       embedding=embeddings,
                                       table_name='courses')
    # Delete every row whose content != "0" — effectively clears the table
    # before the fresh embeddings are inserted below.
    client.table('courses').delete().neq("content", "0").execute()
    vector_store.add_documents(pages)
    print("Courses updated successfully!")

101 changes: 101 additions & 0 deletions Modal Lab/courses/crn_to_name_map/semester_crn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
import requests
from bs4 import BeautifulSoup

# Browser cookies captured from an authenticated Esther (Rice self-service)
# session, replayed verbatim so requests are treated as logged-in.
# SECURITY NOTE(review): these include live session/identity tokens (SESSID,
# IDMSESSID, citrix_ns_id) hard-coded in source control — they expire and
# should be moved to environment variables or a secrets store.
cookies = {
    'TESTID': 'set',
    'SESSID': 'QlYwUk5JNTQ3NTEy',
    '_gcl_au': '1.1.951756867.1645388057',
    '_fbp': 'fb.1.1645388057597.700770990',
    '_hjSessionUser_2310211': 'eyJpZCI6ImU3YWUwOWZkLWYwMjktNTI3ZC04ODBmLTg1NDRmNjkyMzg0NSIsImNyZWF0ZWQiOjE2NDUzODgwNTc0MzAsImV4aXN0aW5nIjp0cnVlfQ==',
    '_rdt_uuid': '1646196522440.0741efa9-462d-4447-8002-758e9206d011',
    'LPVID': 'Y1MzQwOGE1NWY5MWY5ZjVj',
    '_gcl_dc': 'GCL.1646196556.CjwKCAiApfeQBhAUEiwA7K_UH_R2pdM05Ctrz6UTPZiKN9cUQujNCx0HAYZJ1CwBQdkqVSB9L-SJ0BoCd8AQAvD_BwE',
    '_gac_UA-66594374-45': '1.1646196556.CjwKCAiApfeQBhAUEiwA7K_UH_R2pdM05Ctrz6UTPZiKN9cUQujNCx0HAYZJ1CwBQdkqVSB9L-SJ0BoCd8AQAvD_BwE',
    '_gac_UA-66594374-40': '1.1646196556.CjwKCAiApfeQBhAUEiwA7K_UH_R2pdM05Ctrz6UTPZiKN9cUQujNCx0HAYZJ1CwBQdkqVSB9L-SJ0BoCd8AQAvD_BwE',
    '_gid': 'GA1.2.198699051.1647570618',
    'cebs': '1',
    '_gcl_aw': 'GCL.1647790821.CjwKCAjwoduRBhA4EiwACL5RPzglwBhvdhCIwzDHIlm7kLO-WokubhS38rerdE4TRWGCzoY8WM0t-xoCNTcQAvD_BwE',
    '_gac_UA-2249859-53': '1.1647790821.CjwKCAjwoduRBhA4EiwACL5RPzglwBhvdhCIwzDHIlm7kLO-WokubhS38rerdE4TRWGCzoY8WM0t-xoCNTcQAvD_BwE',
    'hubspotutk': 'e551c433a9d1bd5a29170e21844fc682',
    '__hssrc': '1',
    '_gac_UA-45347247-2': '1.1647821867.CjwKCAjwoduRBhA4EiwACL5RP4CXG8Nh0jNIhSInbGQuR_vudKHHnPb7W5MHzD-6FfXJXuwTThX7nhoCgxEQAvD_BwE',
    'smartrfi_external_id': 'undefined',
    '__hstc': '95890179.e551c433a9d1bd5a29170e21844fc682.1647790821083.1647790821083.1648260737912.2',
    '_ga_GXMHQZBLXZ': 'GS1.1.1648567549.1.1.1648567692.0',
    '_ga': 'GA1.2.1526291539.1645132294',
    '_ga_934XKRXT4Y': 'GS1.1.1648998578.3.1.1648999095.0',
    'smartrfi_prospect_id': 'Rice-1',
    '_clck': 'mvrgj1|1|f0d|0',
    '_uetvid': 'ae55ffb0928911eca53ac39afc65783c',
    '_ce.s': 'v11.rlc~1649556207737~v~ec3d28851ac9ca69bb69a8e68042e9ca9baad9ef~vpv~1~ir~1~gtrk.la~l12y2dmn',
    'citrix_ns_id': 'e0AyExb9LC//10BM6UAQ+OyUo9o0003',
    'IDMSESSID': '1107BE322FBBB55FC6068DE3584A49E252990B7346755BF8139249F0FBB54C052B3C9E986F947729C0C532B6CD6B6951',
    '_gat': '1',
}
# Request headers copied from a real browser session so the AJAX endpoint
# responds as it would to the in-page XMLHttpRequest (note X-Requested-With
# and the Referer pointing at the comment-search page).
headers = {
    'Connection': 'keep-alive',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="99", "Google Chrome";v="99"',
    'Accept': 'application/xml, text/xml, */*; q=0.01',
    'X-Requested-With': 'XMLHttpRequest',
    'sec-ch-ua-mobile': '?0',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36',
    'sec-ch-ua-platform': '"macOS"',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Dest': 'empty',
    'Referer': 'https://esther.rice.edu/selfserve/swkscmt.main',
    'Accept-Language': 'en-US,en;q=0.9',
}

def _fetch_term_courses(p_term):
    """Fetch the <course> elements for one Esther term code (e.g. '202110')."""
    params = (
        ('p_data', 'COURSES'),
        ('p_term', p_term),
    )
    response = requests.get('https://esther.rice.edu/selfserve/!swkscmp.ajax', headers=headers, params=params,
                            cookies=cookies)
    html_parsed = BeautifulSoup(response.content, 'html.parser')
    return html_parsed.find_all("course")


# semester_crn:      '<year>_<Season>' -> {(crn, p_term), ...}
# semester_crn_name: '<year>_<Season>' -> {crn: 'SUBJ NUMB', ...}
semester_crn = {}
semester_crn_name = {}
for term in range(2008, 2024):
    # Esther term codes: <year>10 = fall of the PREVIOUS calendar year's
    # academic label, <year>20 = spring, <year>30 = summer.
    for suffix, label_year, season in (
        ('10', term - 1, 'Fall'),
        ('20', term, 'Spring'),
        ('30', term, 'Summer'),
    ):
        p_term = str(term) + suffix
        course_list = _fetch_term_courses(p_term)
        label = f'{label_year}_{season}'
        semester_crn[label] = {(course['crn'], p_term) for course in course_list}
        semester_crn_name[label] = {
            course['crn']: course['subj'] + ' ' + course['numb']
            for course in course_list
        }

# Drop summer terms before 2014 (no usable data for those years).
for i in range(2008, 2014):
    semester_crn.pop(f'{i}_Summer')
    semester_crn_name.pop(f'{i}_Summer')

Loading