3 files changed: +8 -3
lines changed Original file line number Diff line number Diff line change 99import httpx
1010import requests
1111
12+ # User-Agent header to avoid AWS WAF challenges
13+ USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36"
14+
1215
1316class RateLimitError (Exception ):
1417 pass
Original file line number Diff line number Diff line change 66from tqdm import tqdm
77
88from ferry .crawler .cache import load_cache_json , save_cache_json
9- from ferry .crawler .cas_request import CASClient
9+ from ferry .crawler .cas_request import CASClient , USER_AGENT
1010from ferry .crawler .classes .parse import ParsedCourse
1111
1212from .fetch import FetchError , fetch_course_evals
@@ -49,7 +49,7 @@ async def crawl_evals(
4949 print (f"Fetching course evals for valid seasons: { seasons } ..." )
5050
5151 # initiate Yale client session to access evals
52- client = httpx .AsyncClient ()
52+ client = httpx .AsyncClient (headers = { "User-Agent" : USER_AGENT } )
5353 cas_client = CASClient (cas_cookie = cas_cookie )
5454
5555 # Season level is synchronous, following same logic as class fetcher
@@ -92,5 +92,6 @@ async def crawl_evals(
9292 data .sort (key = lambda x : x ["crn" ])
9393 save_cache_json (data_dir / "parsed_evaluations" / f"{ season } .json" , data )
9494
95+ await client .aclose ()
9596 print ("\033 [F" , end = "" )
9697 print (f"Fetching course evals for valid seasons: { seasons } ... ✔" )
Original file line number Diff line number Diff line change 88
99from ferry .args_parser import Args , get_args , parse_seasons_arg
1010from ferry .crawler .cache import load_cache_json
11+ from ferry .crawler .cas_request import USER_AGENT
1112from ferry .crawler .classes import crawl_classes
1213from ferry .crawler .evals import crawl_evals
1314from ferry .crawler .seasons import fetch_seasons
@@ -26,7 +27,7 @@ async def start_crawl(args: Args):
2627 classes = None
2728 # Initialize HTTPX client, used for fetching seasons and classes (evals
2829 # fetch initializes its own client with CAS auth)
29- client = AsyncClient (timeout = None )
30+ client = AsyncClient (timeout = None , headers = { "User-Agent" : USER_AGENT } )
3031 if args .crawl_seasons :
3132 course_seasons = await fetch_seasons (
3233 data_dir = args .data_dir , client = client , use_cache = args .use_cache
You can’t perform that action at this time.
0 commit comments