Skip to content

Commit ca37e62

Browse files
committed
Enhance bot detection - add recognized bot check and improve suspicious visitor logic
1 parent b6d20d8 commit ca37e62

File tree

2 files changed

+83
-68
lines changed

2 files changed

+83
-68
lines changed

openlibrary/plugins/openlibrary/code.py

Lines changed: 21 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1229,33 +1229,40 @@ def is_bot():
12291229
"""Check if the current request is from a bot."""
12301230
return req_context.get().is_bot
12311231

1232+
def is_recognized_bot():
1233+
return req_context.get().is_recognized_bot
12321234

12331235
def is_suspicious_visitor():
12341236
"""Check if the current visitor is suspicious and needs human verification.
12351237
1236-
A suspicious visitor is someone who:
1237-
- Is not logged in
1238-
- Is not a known bot (as determined by is_bot())
1239-
- Does not have the verification cookie (vf=1)
1238+
A suspicious visitor is someone who is NONE of the following:
1239+
1. a recognized bot
1240+
2. sending a non-empty Referer header
1241+
3. carrying a verification cookie (vf=1)
1242+
4. logged in
12401243
12411244
Returns:
12421245
bool: True if visitor is suspicious and needs verification, False otherwise
12431246
"""
1244-
# Check if user is logged in
1245-
try:
1246-
user = web.ctx.site.get_user()
1247-
if user:
1248-
return False
1249-
except Exception:
1250-
pass
1251-
12521247
# Check if it's a recognized bot (User-Agent matches a known crawler)
1253-
if is_bot():
1248+
if is_recognized_bot():
12541249
return False
12551250

1251+
# Check if there's a referer header
1252+
if web.ctx.env.get('HTTP_REFERER'):
1253+
return False
1254+
12561255
# Check if visitor has already been verified (has vf=1 cookie)
1257-
return web.cookies().get('vf') != '1'
1256+
if web.cookies().get('vf') == '1':
1257+
return False
12581258

1259+
# Check if user is logged in
1260+
try:
1261+
if web.ctx.site.get_user():
1262+
return False
1263+
except Exception:
1264+
pass
1265+
return True
12591266

12601267
def require_human_verification():
12611268
"""Show the human verification challenge page.

openlibrary/utils/request_context.py

Lines changed: 62 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ class RequestContextVars:
2727
lang: str | None
2828
solr_editions: bool | None
2929
print_disabled: bool
30+
is_recognized_bot: bool = False
3031
is_bot: bool = False
3132

3233

@@ -36,6 +37,61 @@ class RequestContextVars:
3637
site: ContextVar[Site] = ContextVar("site")
3738

3839

40+
USER_AGENT_BOTS = [
41+
'sputnikbot',
42+
'dotbot',
43+
'semrushbot',
44+
'googlebot',
45+
'yandexbot',
46+
'monsidobot',
47+
'kazbtbot',
48+
'seznambot',
49+
'dubbotbot',
50+
'360spider',
51+
'redditbot',
52+
'yandexmobilebot',
53+
'linkdexbot',
54+
'musobot',
55+
'mojeekbot',
56+
'focuseekbot',
57+
'behloolbot',
58+
'startmebot',
59+
'yandexaccessibilitybot',
60+
'uptimerobot',
61+
'femtosearchbot',
62+
'pinterestbot',
63+
'toutiaospider',
64+
'yoozbot',
65+
'parsijoobot',
66+
'equellaurlbot',
67+
'donkeybot',
68+
'paperlibot',
69+
'nsrbot',
70+
'discordbot',
71+
'ahrefsbot',
72+
'coccocbot',
73+
'buzzbot',
74+
'laserlikebot',
75+
'baiduspider',
76+
'bingbot',
77+
'mj12bot',
78+
'yoozbotadsbot',
79+
'ahrefsbot',
80+
'amazonbot',
81+
'applebot',
82+
'bingbot',
83+
'brightbot',
84+
'gptbot',
85+
'petalbot',
86+
'semanticscholarbot',
87+
'yandex.com/bots',
88+
'icc-crawler',
89+
]
90+
91+
def _compute_is_recognized_bot(user_agent: str) -> bool:
92+
my_ua = user_agent.lower()
93+
return any(ua in my_ua for ua in USER_AGENT_BOTS)
94+
3995
def _compute_is_bot(user_agent: str | None, hhcl: str | None) -> bool:
4096
"""Determine if the request is from a bot.
4197
@@ -46,57 +102,7 @@ def _compute_is_bot(user_agent: str | None, hhcl: str | None) -> bool:
46102
Returns:
47103
True if the request appears to be from a bot, False otherwise
48104
"""
49-
user_agent_bots = [
50-
'sputnikbot',
51-
'dotbot',
52-
'semrushbot',
53-
'googlebot',
54-
'yandexbot',
55-
'monsidobot',
56-
'kazbtbot',
57-
'seznambot',
58-
'dubbotbot',
59-
'360spider',
60-
'redditbot',
61-
'yandexmobilebot',
62-
'linkdexbot',
63-
'musobot',
64-
'mojeekbot',
65-
'focuseekbot',
66-
'behloolbot',
67-
'startmebot',
68-
'yandexaccessibilitybot',
69-
'uptimerobot',
70-
'femtosearchbot',
71-
'pinterestbot',
72-
'toutiaospider',
73-
'yoozbot',
74-
'parsijoobot',
75-
'equellaurlbot',
76-
'donkeybot',
77-
'paperlibot',
78-
'nsrbot',
79-
'discordbot',
80-
'ahrefsbot',
81-
'coccocbot',
82-
'buzzbot',
83-
'laserlikebot',
84-
'baiduspider',
85-
'bingbot',
86-
'mj12bot',
87-
'yoozbotadsbot',
88-
'ahrefsbot',
89-
'amazonbot',
90-
'applebot',
91-
'bingbot',
92-
'brightbot',
93-
'gptbot',
94-
'petalbot',
95-
'semanticscholarbot',
96-
'yandex.com/bots',
97-
'icc-crawler',
98-
]
99-
105+
100106
# Check hhcl header first (set by nginx)
101107
if hhcl == '1':
102108
return True
@@ -105,9 +111,7 @@ def _compute_is_bot(user_agent: str | None, hhcl: str | None) -> bool:
105111
if not user_agent:
106112
return True
107113

108-
user_agent = user_agent.lower()
109-
return any(bot in user_agent for bot in user_agent_bots)
110-
114+
return _compute_is_recognized_bot(user_agent)
111115

112116
def _parse_solr_editions_from_web() -> bool:
113117
"""Parse solr_editions from web.py context."""
@@ -147,6 +151,9 @@ def set_context_from_legacy_web_py() -> None:
147151
print_disabled = bool(web.cookies().get('pd', False))
148152

149153
# Compute is_recognized_bot and is_bot once during request setup
154+
is_recognized_bot = _compute_is_recognized_bot(
155+
user_agent=web.ctx.env.get("HTTP_USER_AGENT", "")
156+
)
150157
is_bot = _compute_is_bot(
151158
user_agent=web.ctx.env.get("HTTP_USER_AGENT"),
152159
hhcl=web.ctx.env.get("HTTP_X_HHCL"),
@@ -160,6 +167,7 @@ def set_context_from_legacy_web_py() -> None:
160167
lang=web.ctx.lang,
161168
solr_editions=solr_editions,
162169
print_disabled=print_disabled,
170+
is_recognized_bot=is_recognized_bot,
163171
is_bot=is_bot,
164172
)
165173
)

0 commit comments

Comments
 (0)