YACS-RCOS · Tevetron · Nov 12, 2024 · Nov 19, 2024 · Dec 4, 2024 · Feb 14, 2025
diff --git a/.gitignore b/.gitignore
@@ -12,3 +12,4 @@ courses20.xml
 compose-dev.yaml
 rpi_data/get-summer-2023-2.sh
 rpi_data/summer-20232.csv
+.venv
diff --git a/docker-compose.development.yml b/docker-compose.development.yml
@@ -28,12 +28,12 @@ services:
       - ./src/web:/app
       - web_node_modules:/app/node_modules/
     environment:
-      - YACS_API_HOST=http://yacs_api:5000
+      - YACS_API_HOST=http://yacs_api:4000
 
   yacs_api:
-    command: /bin/bash -c "python tables/database_session.py && PYTHONPATH=. alembic upgrade head && uvicorn app:app --reload --host 0.0.0.0 --port 5000"
+    command: /bin/bash -c "python tables/database_session.py && PYTHONPATH=. alembic upgrade head && uvicorn app:app --reload --host 0.0.0.0 --port 4000"
     ports:
-      - 5000:5000
+      - 4000:4000
     volumes:
       - ./src/api:/usr/src
     environment:
@@ -55,3 +55,13 @@ services:
       - POSTGRES_DB=yacs
       - POSTGRES_USER=yacs
       - POSTGRES_PASSWORD=${DB_PASS:-easy_dev_pass}
+
+  yacs_cron:
+    ports:
+      - 4321:4321
+    volumes:
+      - ./src/cron:/usr/src
+    environment:
+      - YACS_API_HOST=http://yacs_api:4000
+      - GECKO_PATH=/usr/local/bin/geckodriver
+      - API_SIGN_KEY=${API_SIGN_KEY:-secretKey}
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -30,3 +30,9 @@ services:
     container_name: yacs_db
     image: postgres:12-alpine
 
+  yacs_cron:
+    restart: unless-stopped
+    container_name: yacs_cron
+    build:
+      context: ./src/cron
+      dockerfile: Dockerfile
diff --git a/rpi_data/modules/parse_runner.py b/rpi_data/modules/parse_runner.py
@@ -1,8 +1,8 @@
 #!/usr/bin/env python
 from selenium import webdriver
 from selenium.webdriver.firefox.options import Options
-import headless_login as login
-import new_parse as parser
+import cron.headless_login as login
+import cron.new_parse as parser
 import sys
 from datetime import datetime
 import pytz

diff --git a/src/api/Dockerfile b/src/api/Dockerfile
@@ -3,7 +3,7 @@
 RUN mkdir -p /usr/src
 WORKDIR /usr/src
 COPY ./requirements.txt /usr/src/
+RUN apt-get update && apt-get install -y libpq-dev build-essential
 RUN pip install --no-cache-dir -r requirements.txt
 COPY . /usr/src/
-
-CMD [ "sh", "scripts/start.sh" ]
+CMD [ "sh", "scripts/start.sh" ]
diff --git a/src/cron/Dockerfile b/src/cron/Dockerfile
@@ -0,0 +1,23 @@
+# FROM selenium/standalone-firefox:latest
+FROM python:3.9-slim
+
+
+# Headless Firefox plus geckodriver, which the Selenium-based scraper drives.
+# curl and ca-certificates are only needed to fetch geckodriver and are purged afterwards.
+RUN apt-get update                             \
+ && apt-get install -y --no-install-recommends \
+    ca-certificates curl firefox-esr           \
+ && rm -fr /var/lib/apt/lists/*                \
+ && curl -L https://github.com/mozilla/geckodriver/releases/download/v0.30.0/geckodriver-v0.30.0-linux64.tar.gz | tar xz -C /usr/local/bin \
+ && apt-get purge -y ca-certificates curl
+
+# Install cron and register the scrape schedule; job output is appended to /var/log/cron.log.
+RUN apt-get update && apt-get -y install cron vim
+COPY crontab /etc/cron.d/crontab
+RUN chmod 0644 /etc/cron.d/crontab
+RUN touch /var/log/cron.log
+
+# Install Python dependencies before copying the sources so the pip layer is cached.
+RUN mkdir -p /usr/src
+WORKDIR /usr/src
+COPY ./requirements.txt /usr/src/
+RUN pip install --no-cache-dir -r requirements.txt
+COPY . /usr/src/
+
+# cron launches jobs with a minimal environment, so variables passed to the
+# container (YACS_API_HOST, GECKO_PATH, API_SIGN_KEY) would not be visible to
+# no_login.py. Persist them to /etc/environment, which Debian's cron loads via
+# pam_env, then replace the shell with cron in the foreground.
+# NOTE(review): values containing newlines or '#' may not survive this file format - confirm for API_SIGN_KEY.
+CMD ["sh", "-c", "printenv | grep -E '^(YACS_API_HOST|GECKO_PATH|API_SIGN_KEY)=' > /etc/environment; exec cron -f"]
diff --git a/rpi_data/modules/ci_scraper.py → src/cron/ci_scraper.py b/rpi_data/modules/ci_scraper.py → src/cron/ci_scraper.py
diff --git a/rpi_data/modules/course.py → src/cron/course.py b/rpi_data/modules/course.py → src/cron/course.py
diff --git a/rpi_data/modules/courses_scraper.py → src/cron/courses_scraper.py b/rpi_data/modules/courses_scraper.py → src/cron/courses_scraper.py
@@ -16,7 +16,7 @@
 from selenium.webdriver.common.keys import Keys
 import goldy_parse as gp

 '''
 AUGUST 2024 Course Catalog Scraper
 Uses the RPI catalog website to search for individual courses, and then scrapes all of that data. Depends a lot on consistent catalog formatting,
 so if that changes it will need work.
@@ -26,7 +26,7 @@
 by Giancarlo Martinelli (gcm on discord)
 '''

 '''
 Formatting Regex. Replaces unicode characters and removes unnecessary spaces caused by messy scraping (Sorry.)
 '''
 def un_spaceify(string: str) -> str:
@@ -46,13 +46,13 @@
 def re_spaceify(string: str) -> str:
    string = re.sub(r".(?=\w)", ". ", string)

 '''
 Splits a list into a list of n lists. Useful for multiprocessing.
 '''
 def split(a: list[str], n: int):
    """Deal the items of ``a`` round-robin into ``n`` lists and return those lists."""
    buckets = [[] for _ in range(n)]
    for position, item in enumerate(a):
        # The item at index i lands in bucket i % n, so bucket sizes differ by at most one.
        buckets[position % n].append(item)
    return buckets

@@ -80,18 +80,18 @@
            return [ele.rsplit("=", 1)[1], ele.rsplit("=", 2)[1].rsplit("&", 1)[0]]


 '''
 Large scraping function. Goes to the search page of a single course, checks if the data exists, and then scrapes the course
 '''
 def scrape_single_course(prefix:str, code:str, nav: str, cat: str) -> dict:
    try:

        link = "https://catalog.rpi.edu/content.php?filter%5B27%5D={}&filter%5B29%5D={}&filter%5Bkeyword%5D=&filter%5B32%5D=1&filter%5Bcpage%5D=1&cur_cat_oid={}&expand=&navoid={}&search_database=Filter&filter%5Bexact_match%5D=1#acalog_template_course_filter".format(prefix, code, cat, nav)
        r1 = requests.get(link)
        content1 = r1.content
        soup1 = bs(content1, "html.parser")
        check = soup1.find("td", {"class": "block_content"})
        '''
        Testing to see if the course exists. We need the webdriver waits so that selenium only does things when the necessary elements exist. 
        If they don't load in time we probably don't have a valid course.
        '''
@@ -99,7 +99,12 @@
             return dict()
         if "No courses found" in check.get_text(strip=True) or "" == check.get_text(strip=True):
             return dict()
-        nopop = check.find("a", {"aria-expanded": "false"}).get("href") # gets the link to the nopopup page
+
+        element = check.find("a", {"aria-expanded": "false"})
+        nopop = element.get("href") if element else None
+        if nopop is None:
+            return dict()
+        # nopop = check.find("a", {"aria-expanded": "false"}).get("href") # gets the link to the nopopup page
         '''
         Beautiful soup for the nopopup page
         '''
@@ -133,7 +138,7 @@
            description_html = parts[1]
            description_html = description_html.split("<br/>", 3)[-1]
        desc_soup = bs(description_html, "html.parser") # put back into beautiful soup to remove left over tags
        description = un_spaceify(desc_soup.get_text())
        rest = s_string.removeprefix(description_html) # get rid of all of the description stuff to scrape remaining important labels
        r_soup = bs(rest, "html.parser")
        rest_text = r_soup.get_text() # get all of the text from the rest
@@ -149,9 +154,9 @@
                if de in l: # check if our delimiter actually exists in the thing we just split
                    splitted[1] = de + splitted[1] # adds the delimiter back
                t.append(splitted) # stores our splitted (or not splitted) parts for later

            built_list = list(chain.from_iterable(t)) # black magic which collapses our multidimensional list into a single dimension list
        if len(built_list) != 0: # I honestly forgot why I added this. There's probably something useless in the first position of our built list.
            built_list.pop(0)

        for i in range(len(built_list)):
@@ -187,7 +192,7 @@
        looking = looking.split("Corequisites")[1]
    else:
        looking = ""
    for course in courses_mentioned:
        if course in looking:
            coreq_list.append(course)
    return coreq_list
@@ -209,7 +214,7 @@
        pre_looking = looking
    for course in courses_mentioned:
        if course in pre_looking:
            prereq_list.append(course)
    if "Prerequisite" not in looking and prereq_list != []:
        looking = "Prerequisites/Corequisites: " + looking.strip()
    return [prereq_list, looking.strip().replace("\n", "")]
@@ -228,7 +233,7 @@
            looking_co = i
    for course in courses_mentioned:
        if course in looking_cross:
            crosslist_list.add(course)
        if course in looking_co:
            crosslist_list.add(course)
    return list(crosslist_list)
@@ -257,7 +262,7 @@
    if "spring" in result["text"].lower():
        result["spring"] = True
    if "summer" in result["text"].lower():
        result["summer"] = True
    if "availability of instructor" in result["text"].lower() or "upon availability" in result["text"].lower(): 
        result["uia"] = True
    return result
@@ -271,19 +276,19 @@
    jsons_path = os.path.join(parent_path, "frontend", "src", "data", "json")
    folder_title = "{}-{}".format(year - 1, year)
    json_checking_path = os.path.join(jsons_path, folder_title, "pathways.json")
    to_check = list()
    with open(json_checking_path, 'r') as f:
        j = dict(json.load(f))
    for pathway in j.keys():
        if "Remaining" in j[pathway].keys():
            [to_check.append(i) for i in j[pathway]["Remaining"].values()]
        if "Required" in j[pathway].keys():
            [to_check.append(i) for i in j[pathway]["Required"].values()]
        if "One Of0" in j[pathway].keys():
            [to_check.append(i) for i in j[pathway]["One Of0"].values()]
        if "One Of1" in j[pathway].keys():
            [to_check.append(i) for i in j[pathway]["One Of1"].values()]
        if "One Of2" in j[pathway].keys():
            [to_check.append(i) for i in j[pathway]["One Of2"].values()]
    to_check = list(set(to_check))
    return to_check
@@ -297,7 +302,7 @@
    to_check = check_to_scrape(year)
    dir_path = os.path.dirname(os.path.realpath(__file__))
    pdf_path = os.path.join(dir_path, 'pdfs', pdf_name)
    cis = ci.parse_pdf(pdf_path)
    all_courses = dict()
    for course in to_check:
        prefix, code = course.split(" ")
@@ -312,7 +317,7 @@
        all_courses[course_data["name"]] = course_data

    out = json.dumps(all_courses, indent= 4)
    print(out)
    with open(json_path, 'w') as f:
        f.write(out)
    driver.quit()
@@ -324,7 +329,7 @@
    to_check = check_to_scrape(year)
    dir_path = os.path.dirname(os.path.realpath(__file__))
    pdf_path = os.path.join(dir_path, 'pdfs', pdf_name)
    cis = ci.parse_pdf(pdf_path) # uses the pdf scraper to find all communication intensive courses
    all_courses = dict()
    for course in to_check:
        prefix, code = course.split(" ")
@@ -338,7 +343,7 @@

    for res in results:
        all_courses.update(res) # this should combine all of our multiprocess results together

    out = json.dumps(all_courses, indent= 4)
    with open(json_path, 'w') as f: # dump to json file
        f.write(out)
@@ -352,7 +357,7 @@
        subject, code = course.split(" ")[0], course.split(" ")[1]
        course_data = scrape_single_course(subject, code, nav, cat)
        if len(course_data.keys()) == 0: # removes blank courses
            result[subject + "-" + code] = {"description": "", "corequisites": [], "rawprecoreq" : "", "prerequisites": [], "cross listed" : [], "offered": {"text" : ""}}
            continue
        result[course_data["subj"] + "-" + course_data["ID"]] = course_data # builds dictionary
    return result

diff --git a/src/cron/crontab b/src/cron/crontab
@@ -0,0 +1 @@
+00 * * * * root /usr/local/bin/python3 /usr/src/no_login.py >> /var/log/cron.log 2>&1
diff --git a/rpi_data/modules/goldy_parse.py → src/cron/goldy_parse.py b/rpi_data/modules/goldy_parse.py → src/cron/goldy_parse.py
diff --git a/rpi_data/modules/headless_login.py → src/cron/headless_login.py b/rpi_data/modules/headless_login.py → src/cron/headless_login.py
diff --git a/rpi_data/modules/new_parse.py → src/cron/new_parse.py b/rpi_data/modules/new_parse.py → src/cron/new_parse.py
diff --git a/rpi_data/modules/no_login.py → src/cron/no_login.py b/rpi_data/modules/no_login.py → src/cron/no_login.py
@@ -13,7 +13,7 @@
 import regex as re
 import os

 '''
 Finds all of the course codes for a given term and subject.
 '''
 def find_codes(term, subj):
@@ -30,26 +30,26 @@
    print(len(elements))
    pruned_elements = []
    codes = []
    for all in elements:
        element = all.find("a").text
        pruned_elements.append(element)
        codes.append(element[:9])

    return codes

 '''
 Generates SIS links for a list of codes
 '''
 def generate_links(term, codes):
    """Build one SIS course-listing URL per entry in ``codes`` for the given ``term``.

    Each entry is sliced positionally: characters 0-3 are used as the subject
    and everything from index 5 onward as the course number.
    """
    url_template = "https://sis.rpi.edu/rss/bwckctlg.p_disp_listcrse?term_in={}&subj_in={}&crse_in={}&schd_in=L"
    return [url_template.format(term, entry[:4], entry[5:]) for entry in codes]

 '''
 Scrapes all of the course information for a list of links.
 '''
 def scrape_all(links, term, major) -> list[Course]:
@@ -74,7 +74,7 @@
 #info[15] - crosslist capacity, info[16] - crosslist enrolled, info[17] - crosslist seats left,
 #info[18] are the profs, info[19] are days of the sem that the course spans, and info[20] is location
 #Remove index[4] because most classes are on campus, with exceptions for some grad and doctoral courses. 
 '''
 Main link scrape, which splits the page into individual courses and then scrapes each.
 '''
 def link_scrape(term, link, major) -> list[Course]:
@@ -104,7 +104,7 @@
    if (len(titles) != len(bodies)):
        raise RuntimeError("Titles do not equal bodies: "+ link)
    courses = []
    for i in range(len(titles)):
        title = titles[i].text
        split_title = title.rsplit(" - ", 3)        
        body_info = body_scrape(bodies[i])
@@ -121,10 +121,10 @@
            course.append(split_title[0]) # NAME

        formatted = format_and_order(body_info)
        [courses.append(i) for i in formatted]
    return courses

 '''
 Scrapes the course occupancy information for a specific course from SIS.
 '''
 def get_slots(term, CRN):
@@ -149,7 +149,7 @@
    scraped_table.pop(0)
    return scraped_table[0]

 '''
 Scrapes info from main page for a single course.
 '''
 def body_scrape(body) -> list[list[str]]:
@@ -159,7 +159,7 @@
    string_body = string_body.replace("<br>", "")
    string_body = string_body.replace("<br/>", "")
    split_body = string_body.split("\n")
    credits = ""
    for part in split_body:
        if "Credits" in part:
            part = part.replace("Credits", "")
@@ -180,7 +180,7 @@
        course.append(credits)
    return courses

 '''
 Scrapes a table element into a 2D string list.
 '''
 def table_scrape(table:bs) -> list[list[str]]:
@@ -195,7 +195,7 @@
            scraped_table.append(stripped_row)
    return scraped_table

 '''
 turns a term number into a human readable term
 '''
 def number_to_term(term) -> str:
@@ -220,7 +220,7 @@

 #[crn, major, code, section, credits, name, days, stime, etime, max, curr, rem, profs, sdate, enddate, loc]

 '''
 Formats and orders the courses into the desired order.
 '''
 def format_and_order(courses:list[list[str]]) -> list[list[str]]:
@@ -245,7 +245,7 @@
        course[6] = course[6].replace("(P)", "")
        temp = course[6].split(" ")
        f_temp = []
        for x in range(len(temp)):
            temp[x] = temp[x].strip()
            if (temp[x] == ""):
                continue
@@ -285,13 +285,14 @@
    edate = "{0:%Y}-{0:%m}-{0:%d}".format(dt_end)
    return sdate, edate

 '''
 Parent function that scrapes all courses for a given term and writes them to a CSV file.
 '''
 def no_login_scrape(term: str, num_browsers: int):
     options = Options()
+    services = webdriver.FirefoxService( executable_path=os.environ.get('GECKO_PATH', '/usr/local/bin/geckodriver') )
     options.add_argument("--headless")
-    driver = webdriver.Firefox(options=options) # starter code which uses selenium
+    driver = webdriver.Firefox(options=options, service=services) # starter code which uses selenium
     subjects = old.findAllSubjectCodes(driver) # finds all subject codes
     nav, cat = cs.navigate_to_course(driver, term) # finds the navigation and catalog ids, which are each used to build a course search query.
     driver.quit()
@@ -304,7 +305,7 @@
            parts = list(cs.split(links, num_browsers))
            temp_courses = pool.starmap(scrape_all, [(part, term, subject) for part in parts])
        temp_courses = [i for sublist in temp_courses for i in sublist] # flattens the list
        [i.addSchool(subjects[subject]) for i in temp_courses] # adds the school to each course
        temp_codes = list(set([i.major + " " + i.code for i in temp_courses]))
        print(len(temp_codes))
        extra_info = pre_req_scrape(temp_codes, nav, cat, num_browsers) # scrapes the extra info off of the catalog website
@@ -320,8 +321,8 @@
            course.addReqs(extra_info[course.short]["prerequisites"], extra_info[course.short]["corequisites"], extra_info[course.short]["rawprecoreq"], extra_info[course.short]["description"])
            course.frequency = extra_info[course.short]["offered"]["text"]

        [courses.append(i) for i in temp_courses]
    '''
    Check Professor Goldschmidt's information
    '''
    textTerm = number_to_term(term).lower().replace(" ", "")
@@ -335,8 +336,9 @@
     parent = os.path.abspath(os.path.join(dir_path, os.pardir))
     path = os.path.join(parent, number_to_term(term).lower().replace(" ", "-") + ".csv")
     old.writeCSV(courses, path)
+    return path
 
 '''
 Scrapes the prerequisites for multiple courses at once.
 '''
 def pre_req_scrape(codes: list[str], nav:str, cat:str, num_browsers: int):
@@ -348,7 +350,7 @@
        all_courses.update(res)
    return all_courses

 '''
 Edits a course using the information from Professor Goldschmidt's website.
 '''
 def add_goldy_info(course: Course, goldy_info: dict):
@@ -368,7 +370,31 @@
         course.raw = "Prerequisites: " + goldy_info[checking]
 
 if __name__ == "__main__":
-    no_login_scrape("202409", 15)
-    #driver = webdriver.Firefox()
+    print("Our test works at", datetime.now())
+
+    # options = Options()
+    # services = webdriver.FirefoxService( executable_path=os.environ.get('GECKO_PATH', '/usr/local/bin/geckodriver') )
+    # options.add_argument("--headless")
+    # driver = webdriver.Firefox(options=options, service=services)
+
+    # print(cs.scrape_single_course(driver, "MANE", "6990", 202509))
+
+    file = no_login_scrape("202509", 15)
+    fileName = os.path.basename(os.path.normpath(file))
+    url = os.environ.get('YACS_API_HOST', 'http://yacs_api:4000')
+    payload = {'isPubliclyVisible': 'on'}
+
+    files=[
+    ('file',(fileName,open(file,'rb'),'text/csv'))
+    ]
+
+    headers = {
+    'X-API-KEY': os.environ.get('API_SIGN_KEY', None)
+    }
+
+    resp = requests.post(url + '/api/bulkCourseUpload', headers=headers, data=payload, files=files)
+    print(resp.text)
+
+    # driver = webdriver.Firefox()
     #print(cs.scrape_single_course(driver, "CSCI", "1100", 202409))
-    #print(link_scrape("202409", "https://sis.rpi.edu/rss/bwckctlg.p_disp_listcrse?term_in=202409&subj_in=CHME&crse_in=4980&schd_in=L", "CHME"))
+    #print(link_scrape("202409", "https://sis.rpi.edu/rss/bwckctlg.p_disp_listcrse?term_in=202409&subj_in=CHME&crse_in=4980&schd_in=L", "CHME"))
diff --git a/src/cron/requirements.txt b/src/cron/requirements.txt
@@ -0,0 +1,7 @@
+selenium==4.28.1
+beautifulsoup4==4.12.3
+bs4==0.0.2
+pypdf==5.1.0
+pandas==2.2.3
+requests==2.32.3
+regex==2024.11.6
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		00 * * * * root /usr/local/bin/python3 /usr/src/no_login.py >> /var/log/cron.log 2>&1