@@ -27,6 +27,7 @@ class RequestContextVars:
2727 lang : str | None
2828 solr_editions : bool | None
2929 print_disabled : bool
30+ is_recognized_bot : bool = False
3031 is_bot : bool = False
3132
3233
@@ -36,6 +37,61 @@ class RequestContextVars:
3637site : ContextVar [Site ] = ContextVar ("site" )
3738
3839
# Lowercase tokens identifying known crawlers. A request is treated as coming
# from a recognized bot when any of these occurs as a substring of its
# lowercased User-Agent header, so every entry must itself be lowercase.
# Each token is listed exactly once.
USER_AGENT_BOTS = [
    'sputnikbot',
    'dotbot',
    'semrushbot',
    'googlebot',
    'yandexbot',
    'monsidobot',
    'kazbtbot',
    'seznambot',
    'dubbotbot',
    '360spider',
    'redditbot',
    'yandexmobilebot',
    'linkdexbot',
    'musobot',
    'mojeekbot',
    'focuseekbot',
    'behloolbot',
    'startmebot',
    'yandexaccessibilitybot',
    'uptimerobot',
    'femtosearchbot',
    'pinterestbot',
    'toutiaospider',
    'yoozbot',
    'parsijoobot',
    'equellaurlbot',
    'donkeybot',
    'paperlibot',
    'nsrbot',
    'discordbot',
    'ahrefsbot',
    'coccocbot',
    'buzzbot',
    'laserlikebot',
    'baiduspider',
    'bingbot',
    'mj12bot',
    'yoozbotadsbot',
    'amazonbot',
    'applebot',
    'brightbot',
    'gptbot',
    'petalbot',
    'semanticscholarbot',
    'yandex.com/bots',
    'icc-crawler',
]
90+
91+ def _compute_is_recognized_bot (user_agent : str ) -> bool :
92+ my_ua = user_agent .lower ()
93+ return any (ua in my_ua for ua in USER_AGENT_BOTS )
94+
3995def _compute_is_bot (user_agent : str | None , hhcl : str | None ) -> bool :
4096 """Determine if the request is from a bot.
4197
@@ -46,57 +102,7 @@ def _compute_is_bot(user_agent: str | None, hhcl: str | None) -> bool:
46102 Returns:
47103 True if the request appears to be from a bot, False otherwise
48104 """
49- user_agent_bots = [
50- 'sputnikbot' ,
51- 'dotbot' ,
52- 'semrushbot' ,
53- 'googlebot' ,
54- 'yandexbot' ,
55- 'monsidobot' ,
56- 'kazbtbot' ,
57- 'seznambot' ,
58- 'dubbotbot' ,
59- '360spider' ,
60- 'redditbot' ,
61- 'yandexmobilebot' ,
62- 'linkdexbot' ,
63- 'musobot' ,
64- 'mojeekbot' ,
65- 'focuseekbot' ,
66- 'behloolbot' ,
67- 'startmebot' ,
68- 'yandexaccessibilitybot' ,
69- 'uptimerobot' ,
70- 'femtosearchbot' ,
71- 'pinterestbot' ,
72- 'toutiaospider' ,
73- 'yoozbot' ,
74- 'parsijoobot' ,
75- 'equellaurlbot' ,
76- 'donkeybot' ,
77- 'paperlibot' ,
78- 'nsrbot' ,
79- 'discordbot' ,
80- 'ahrefsbot' ,
81- 'coccocbot' ,
82- 'buzzbot' ,
83- 'laserlikebot' ,
84- 'baiduspider' ,
85- 'bingbot' ,
86- 'mj12bot' ,
87- 'yoozbotadsbot' ,
88- 'ahrefsbot' ,
89- 'amazonbot' ,
90- 'applebot' ,
91- 'bingbot' ,
92- 'brightbot' ,
93- 'gptbot' ,
94- 'petalbot' ,
95- 'semanticscholarbot' ,
96- 'yandex.com/bots' ,
97- 'icc-crawler' ,
98- ]
99-
105+
100106 # Check hhcl header first (set by nginx)
101107 if hhcl == '1' :
102108 return True
@@ -105,9 +111,7 @@ def _compute_is_bot(user_agent: str | None, hhcl: str | None) -> bool:
105111 if not user_agent :
106112 return True
107113
108- user_agent = user_agent .lower ()
109- return any (bot in user_agent for bot in user_agent_bots )
110-
114+ return _compute_is_recognized_bot (user_agent )
111115
112116def _parse_solr_editions_from_web () -> bool :
113117 """Parse solr_editions from web.py context."""
@@ -147,6 +151,9 @@ def set_context_from_legacy_web_py() -> None:
147151 print_disabled = bool (web .cookies ().get ('pd' , False ))
148152
149153 # Compute is_bot once during request setup
154+ is_recognized_bot = _compute_is_recognized_bot (
155+ user_agent = web .ctx .env .get ("HTTP_USER_AGENT" , "" )
156+ )
150157 is_bot = _compute_is_bot (
151158 user_agent = web .ctx .env .get ("HTTP_USER_AGENT" ),
152159 hhcl = web .ctx .env .get ("HTTP_X_HHCL" ),
@@ -160,6 +167,7 @@ def set_context_from_legacy_web_py() -> None:
160167 lang = web .ctx .lang ,
161168 solr_editions = solr_editions ,
162169 print_disabled = print_disabled ,
170+ is_recognized_bot = is_recognized_bot ,
163171 is_bot = is_bot ,
164172 )
165173 )
0 commit comments