Skip to content

Commit 07e9308

Browse files
authored
fix: add user-agent header to prevent WAF blocking (#382)
1 parent 7570fd6 commit 07e9308

File tree

3 files changed

+8
-3
lines changed

3 files changed

+8
-3
lines changed

ferry/crawler/cas_request.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@
99
import httpx
1010
import requests
1111

12+
# User-Agent header to avoid AWS WAF challenges
13+
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36"
14+
1215

1316
class RateLimitError(Exception):
1417
pass

ferry/crawler/evals/__init__.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from tqdm import tqdm
77

88
from ferry.crawler.cache import load_cache_json, save_cache_json
9-
from ferry.crawler.cas_request import CASClient
9+
from ferry.crawler.cas_request import CASClient, USER_AGENT
1010
from ferry.crawler.classes.parse import ParsedCourse
1111

1212
from .fetch import FetchError, fetch_course_evals
@@ -49,7 +49,7 @@ async def crawl_evals(
4949
print(f"Fetching course evals for valid seasons: {seasons}...")
5050

5151
# initiate Yale client session to access evals
52-
client = httpx.AsyncClient()
52+
client = httpx.AsyncClient(headers={"User-Agent": USER_AGENT})
5353
cas_client = CASClient(cas_cookie=cas_cookie)
5454

5555
# Season level is synchronous, following same logic as class fetcher
@@ -92,5 +92,6 @@ async def crawl_evals(
9292
data.sort(key=lambda x: x["crn"])
9393
save_cache_json(data_dir / "parsed_evaluations" / f"{season}.json", data)
9494

95+
await client.aclose()
9596
print("\033[F", end="")
9697
print(f"Fetching course evals for valid seasons: {seasons}... ✔")

main.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
from ferry.args_parser import Args, get_args, parse_seasons_arg
1010
from ferry.crawler.cache import load_cache_json
11+
from ferry.crawler.cas_request import USER_AGENT
1112
from ferry.crawler.classes import crawl_classes
1213
from ferry.crawler.evals import crawl_evals
1314
from ferry.crawler.seasons import fetch_seasons
@@ -26,7 +27,7 @@ async def start_crawl(args: Args):
2627
classes = None
2728
# Initialize HTTPX client, only used for fetching classes (evals fetch
2829
# initializes its own client with CAS auth)
29-
client = AsyncClient(timeout=None)
30+
client = AsyncClient(timeout=None, headers={"User-Agent": USER_AGENT})
3031
if args.crawl_seasons:
3132
course_seasons = await fetch_seasons(
3233
data_dir=args.data_dir, client=client, use_cache=args.use_cache

0 commit comments

Comments
 (0)