// Layer 1: Known AI agent UA substrings (lowercase).
// isAIAgent lower-cases the incoming User-Agent and checks each entry with
// String.prototype.includes, so every entry must itself be lowercase and
// must not carry surrounding whitespace.
const AI_AGENT_UA_PATTERNS = [
  // Anthropic — https://support.claude.com/en/articles/8896518
  'claudebot',
  'claude-searchbot',
  'claude-user',
  'anthropic-ai',
  'claude-web',

  // OpenAI — https://platform.openai.com/docs/bots
  'chatgpt',
  'gptbot',
  'oai-searchbot',
  'openai',

  // Google AI
  'gemini',
  'bard',
  'google-cloudvertexbot',
  'google-extended',

  // Meta
  'meta-externalagent',
  'meta-externalfetcher',
  'meta-webindexer',

  // Search/Research AI
  'perplexity',
  'youbot',
  'you.com',
  'deepseekbot',

  // Coding assistants
  'cursor',
  'github-copilot',
  'codeium',
  'tabnine',
  'sourcegraph',

  // Other AI agents / data scrapers (low-harm to serve markdown)
  'cohere-ai',
  'bytespider',
  'amazonbot',
  'ai2bot',
  'diffbot',
  // NOTE: under substring matching 'omgili' already covers 'omgilibot';
  // both are listed so each published crawler name is visible here.
  'omgili',
  'omgilibot',
];
6666
// Layer 2: Known AI service URLs in Signature-Agent header (RFC 9421).
// isAIAgent lower-cases the header value and looks for each entry as a
// substring, so entries must be lowercase.
const SIGNATURE_AGENT_DOMAINS = ['chatgpt.com'];
6969
// Layer 3: Traditional bot exclusion list — bots that should NOT trigger
// the heuristic layer (they're search engine crawlers, social previews, or
// monitoring tools, not AI agents).
// Entries are compared as substrings of the lower-cased User-Agent, so they
// must be lowercase.
const TRADITIONAL_BOT_PATTERNS = [
  'googlebot',
  'bingbot',
  'yandexbot',
  'baiduspider',
  'duckduckbot',
  'slurp',
  'msnbot',
  'facebot',
  'twitterbot',
  'linkedinbot',
  'whatsapp',
  'telegrambot',
  'pingdom',
  'uptimerobot',
  'newrelic',
  'datadog',
  'statuspage',
  'site24x7',
  'applebot',
];
9494
// Broad regex for bot-like UA strings (used only in Layer 3 heuristic).
// Case-insensitive, and deliberately has no `g` flag: a global regex keeps
// `lastIndex` state between `.test()` calls, which would make repeated use of
// this shared constant give alternating results.
const BOT_LIKE_REGEX = /bot|agent|fetch|crawl|spider|search/i;
9797
// Which detection layer of isAIAgent produced a positive result:
//   'ua-match'        — Layer 1, known User-Agent substring
//   'signature-agent' — Layer 2, known domain in the Signature-Agent header
//   'heuristic'       — Layer 3, bot-like UA without sec-fetch-mode
export type DetectionMethod = 'ua-match' | 'signature-agent' | 'heuristic';
9999
100100export interface DetectionResult {
101101 detected : boolean ;
@@ -111,36 +111,36 @@ export interface DetectionResult {
111111export function isAIAgent ( request : {
112112 headers : { get ( name : string ) : string | null } ;
113113} ) : DetectionResult {
114- const userAgent = request . headers . get ( " user-agent" ) ;
114+ const userAgent = request . headers . get ( ' user-agent' ) ;
115115
116116 // Layer 1: Known UA pattern match
117117 if ( userAgent ) {
118118 const lowerUA = userAgent . toLowerCase ( ) ;
119119 if ( AI_AGENT_UA_PATTERNS . some ( ( pattern ) => lowerUA . includes ( pattern ) ) ) {
120- return { detected : true , method : " ua-match" } ;
120+ return { detected : true , method : ' ua-match' } ;
121121 }
122122 }
123123
124124 // Layer 2: Signature-Agent header (RFC 9421, used by ChatGPT agent)
125- const signatureAgent = request . headers . get ( " signature-agent" ) ;
125+ const signatureAgent = request . headers . get ( ' signature-agent' ) ;
126126 if ( signatureAgent ) {
127127 const lowerSig = signatureAgent . toLowerCase ( ) ;
128128 if ( SIGNATURE_AGENT_DOMAINS . some ( ( domain ) => lowerSig . includes ( domain ) ) ) {
129- return { detected : true , method : " signature-agent" } ;
129+ return { detected : true , method : ' signature-agent' } ;
130130 }
131131 }
132132
133133 // Layer 3: Missing browser fingerprint heuristic
134134 // Real browsers (Chrome 76+, Firefox 90+, Safari 16.4+) send sec-fetch-mode
135135 // on navigation requests. Its absence signals a programmatic client.
136- const secFetchMode = request . headers . get ( " sec-fetch-mode" ) ;
136+ const secFetchMode = request . headers . get ( ' sec-fetch-mode' ) ;
137137 if ( ! secFetchMode && userAgent && BOT_LIKE_REGEX . test ( userAgent ) ) {
138138 const lowerUA = userAgent . toLowerCase ( ) ;
139139 const isTraditionalBot = TRADITIONAL_BOT_PATTERNS . some ( ( pattern ) =>
140- lowerUA . includes ( pattern )
140+ lowerUA . includes ( pattern ) ,
141141 ) ;
142142 if ( ! isTraditionalBot ) {
143- return { detected : true , method : " heuristic" } ;
143+ return { detected : true , method : ' heuristic' } ;
144144 }
145145 }
146146
0 commit comments