Skip to content

Commit eafa640

Browse files
moleboxclaudevercel[bot]dferber90
authored
Add AI agent detection and automatic markdown rewrites (#351)
* Add AI agent detection and automatic markdown rewrites When AI agents (Claude, ChatGPT, Cursor, etc.) request docs pages, the proxy now detects them and transparently rewrites to the markdown route — matching the geistdocs template default. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * Fix: Missing `detectionMethod` property in `TrackMdRequestParams` type causes TypeScript compilation error and build failure. This commit fixes the issue reported at apps/docs/lib/geistdocs/md-tracking.ts:24 **Bug explanation:** The `trackMdRequest` function in `apps/docs/lib/geistdocs/md-tracking.ts` destructures `detectionMethod` from its parameter (line 24) and includes it in the JSON body sent to the tracking endpoint (line 38). However, the `TrackMdRequestParams` type definition (lines 6-12) does not include `detectionMethod` as a property. In `apps/docs/proxy.ts` (line 82-87), `trackMdRequest` is called with `detectionMethod: agentResult.method` when an AI agent is detected, where `agentResult.method` is of type `DetectionMethod | null` from `@/lib/ai-agent-detection`. This causes an exact TypeScript compilation error confirmed in the build logs: ``` ./lib/geistdocs/md-tracking.ts:24:3 Type error: Property 'detectionMethod' does not exist on type 'TrackMdRequestParams'. ``` This error causes the entire `docs` build to fail (`next build` exits with code 1). **Fix explanation:** Added `detectionMethod?: DetectionMethod | null` as an optional property to the `TrackMdRequestParams` type, and added the corresponding `import type { DetectionMethod } from "@/lib/ai-agent-detection"` import at the top of the file. The property is optional (`?`) because most call sites (for `.md` URL tracking, `llms.txt` tracking, and header-negotiated tracking) don't pass a `detectionMethod` — only the agent-rewrite tracking path does. The `| null` union matches the `DetectionResult.method` type from `ai-agent-detection.ts`. 
Co-authored-by: Vercel <vercel[bot]@users.noreply.github.com> Co-authored-by: molebox <hello@richardhaines.dev> * format --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com> Co-authored-by: Vercel <vercel[bot]@users.noreply.github.com> Co-authored-by: Dominik Ferber <dominik.ferber@gmail.com>
1 parent b66ce3e commit eafa640

File tree

3 files changed

+204
-1
lines changed

3 files changed

+204
-1
lines changed
Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
/**
2+
* AI Agent Detection Utility
3+
*
4+
* Multi-signal detection for AI agents/bots. Used to serve markdown
5+
* responses when agents request docs pages.
6+
*
7+
* Three detection layers:
8+
* 1. Known UA patterns (definitive) — curated from https://bots.fyi/?tags=ai_assistant
9+
* 2. Signature-Agent header (definitive) — catches ChatGPT agent, which sends it alongside its RFC 9421 HTTP message signatures
10+
* 3. Missing browser fingerprint heuristic — catches unknown bots
11+
*
12+
* Optimizes for recall over precision: serving markdown to a non-AI bot
13+
* is low-harm; missing an AI agent means a worse experience.
14+
*
15+
* Last reviewed: 2026-03-20 against bots.fyi + official vendor docs
16+
*/
17+
18+
/**
 * Layer 1 — User-Agent substrings of known AI agents.
 *
 * Every entry is lowercase; entries are grouped by vendor.
 */
const AI_AGENT_UA_PATTERNS: readonly string[] = [
  // Anthropic — https://support.claude.com/en/articles/8896518
  'claudebot', 'claude-searchbot', 'claude-user', 'anthropic-ai', 'claude-web',

  // OpenAI — https://platform.openai.com/docs/bots
  'chatgpt', 'gptbot', 'oai-searchbot', 'openai',

  // Google AI
  'gemini', 'bard', 'google-cloudvertexbot', 'google-extended',

  // Meta
  'meta-externalagent', 'meta-externalfetcher', 'meta-webindexer',

  // Search/Research AI
  'perplexity', 'youbot', 'you.com', 'deepseekbot',

  // Coding assistants
  'cursor', 'github-copilot', 'codeium', 'tabnine', 'sourcegraph',

  // Other AI agents / data scrapers (low-harm to serve markdown)
  'cohere-ai', 'bytespider', 'amazonbot', 'ai2bot', 'diffbot', 'omgili', 'omgilibot',
];
66+
67+
/**
 * Layer 2 — domains of known AI services to look for in the Signature-Agent
 * request header (sent alongside RFC 9421 HTTP message signatures).
 */
const SIGNATURE_AGENT_DOMAINS: readonly string[] = ['chatgpt.com'];
69+
70+
/**
 * Layer 3 — exclusion list for the heuristic layer.
 *
 * Lowercase User-Agent substrings of traditional bots (search engine
 * crawlers, social link previews, monitoring tools). These are not AI agents
 * and must NOT trigger the heuristic.
 */
const TRADITIONAL_BOT_PATTERNS: readonly string[] = [
  // Search engine crawlers
  'googlebot', 'bingbot', 'yandexbot', 'baiduspider', 'duckduckbot', 'slurp', 'msnbot',

  // Social / messaging link previews
  'facebot', 'twitterbot', 'linkedinbot', 'whatsapp', 'telegrambot',

  // Uptime and performance monitoring
  'pingdom', 'uptimerobot', 'newrelic', 'datadog', 'statuspage', 'site24x7',

  // Apple's crawler
  'applebot',
];

/**
 * Deliberately broad, case-insensitive test for bot-like User-Agent strings.
 * Used only by the Layer 3 heuristic.
 */
const BOT_LIKE_REGEX = /bot|agent|fetch|crawl|spider|search/i;
97+
98+
/** The signal that identified a request as coming from an AI agent. */
export type DetectionMethod =
  | 'ua-match'
  | 'signature-agent'
  | 'heuristic';

/** Outcome of an AI agent check. */
export interface DetectionResult {
  /** Whether the request was classified as an AI agent. */
  detected: boolean;
  /** The signal that fired, or `null`. */
  method: DetectionMethod | null;
}
104+
105+
/**
 * Classifies a request as coming from an AI agent, based on its headers.
 *
 * Signals are evaluated in order and the first one that fires wins:
 *   1. `ua-match` — the User-Agent contains a known AI agent substring
 *   2. `signature-agent` — the Signature-Agent header names a known AI service
 *   3. `heuristic` — no `sec-fetch-mode` header, a bot-like User-Agent, and
 *      the User-Agent is not on the traditional-bot exclusion list
 *
 * @param request - Any object exposing a `headers.get(name)` lookup.
 * @returns Whether an agent was detected and which signal triggered, so
 *   callers can log the detection method for accuracy tracking.
 */
export function isAIAgent(request: {
  headers: { get(name: string): string | null };
}): DetectionResult {
  const userAgent = request.headers.get('user-agent');
  // Lowercase once; a missing or empty User-Agent can never match a pattern.
  const normalizedUA = userAgent ? userAgent.toLowerCase() : null;
  const uaContains = (needle: string): boolean =>
    normalizedUA !== null && normalizedUA.includes(needle);

  // Layer 1: known UA pattern match
  if (AI_AGENT_UA_PATTERNS.some((pattern) => uaContains(pattern))) {
    return { detected: true, method: 'ua-match' };
  }

  // Layer 2: Signature-Agent header (used by ChatGPT agent)
  const signatureAgent = request.headers.get('signature-agent');
  if (signatureAgent) {
    const normalizedSig = signatureAgent.toLowerCase();
    const namesKnownService = SIGNATURE_AGENT_DOMAINS.some((domain) =>
      normalizedSig.includes(domain),
    );
    if (namesKnownService) {
      return { detected: true, method: 'signature-agent' };
    }
  }

  // Layer 3: missing browser fingerprint heuristic.
  // Real browsers (Chrome 76+, Firefox 90+, Safari 16.4+) send sec-fetch-mode
  // on navigation requests, so its absence points to a programmatic client.
  const secFetchMode = request.headers.get('sec-fetch-mode');
  const looksProgrammatic =
    !secFetchMode && !!userAgent && BOT_LIKE_REGEX.test(userAgent);
  if (
    looksProgrammatic &&
    !TRADITIONAL_BOT_PATTERNS.some((pattern) => uaContains(pattern))
  ) {
    return { detected: true, method: 'heuristic' };
  }

  return { detected: false, method: null };
}
149+
150+
/**
 * Wraps arbitrary text in a markdown inline code span that it cannot escape.
 *
 * Control characters (including line breaks) are collapsed to a single space
 * so the span stays on one line, and the backtick delimiter is made one
 * longer than the longest backtick run in the text, per the CommonMark
 * code-span rules.
 */
function toInlineCodeSpan(text: string): string {
  const singleLine = text.replace(/[\u0000-\u001F\u007F\u2028\u2029]+/g, ' ');
  const backtickRuns = singleLine.match(/`+/g) ?? [];
  const longestRun = Math.max(0, ...backtickRuns.map((run) => run.length));
  const fence = '`'.repeat(longestRun + 1);
  // A span whose content starts or ends with a backtick needs space padding,
  // otherwise the content would merge with the delimiter.
  const pad = singleLine.startsWith('`') || singleLine.endsWith('`') ? ' ' : '';
  return `${fence}${pad}${singleLine}${pad}${fence}`;
}

/**
 * Generates a markdown response for AI agents that hit non-existent URLs.
 *
 * The requested path originates from the request URL and is therefore
 * untrusted: it is rendered inside a code span that backticks or line breaks
 * in the path cannot break out of, so a crafted URL cannot inject extra
 * markdown into the response. For ordinary paths the output is unchanged.
 *
 * @param requestedPath - The path the agent asked for.
 * @returns A markdown document pointing the agent at the sitemap and index.
 */
export function generateAgentNotFoundResponse(requestedPath: string): string {
  const pathSpan = toInlineCodeSpan(requestedPath);
  return `# Page Not Found

The URL ${pathSpan} does not exist in the documentation.

## How to find the correct page

1. **Browse the sitemap**: [/sitemap.md](/sitemap.md) — A structured index of all pages with URLs, content types, and descriptions
2. **Browse the full index**: [/llms.txt](/llms.txt) — Complete documentation index

## Tips for requesting documentation

- For markdown responses, append \`.md\` to URLs (e.g., \`/docs/getting-started.md\`)
- Use \`Accept: text/markdown\` header for content negotiation
`;
}

apps/docs/lib/geistdocs/md-tracking.ts

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import { siteId } from "@/geistdocs";
2+
import type { DetectionMethod } from "@/lib/ai-agent-detection";
23

34
const PLATFORM_URL = "https://geistdocs.com/md-tracking";
45

@@ -8,7 +9,9 @@ type TrackMdRequestParams = {
89
referer: string | null;
910
acceptHeader: string | null;
1011
/** How the markdown was requested: 'md-url' for direct .md URLs, 'header-negotiated' for Accept header, 'agent-rewrite' for automatic rewrites triggered by AI agent detection */
11-
requestType?: "md-url" | "header-negotiated";
12+
requestType?: "md-url" | "header-negotiated" | "agent-rewrite";
13+
/** Which detection method identified the AI agent */
14+
detectionMethod?: DetectionMethod | null;
1215
};
1316

1417
/**
@@ -21,6 +24,7 @@ export async function trackMdRequest({
2124
referer,
2225
acceptHeader,
2326
requestType,
27+
detectionMethod,
2428
}: TrackMdRequestParams): Promise<void> {
2529
try {
2630
const response = await fetch(PLATFORM_URL, {
@@ -35,6 +39,7 @@ export async function trackMdRequest({
3539
referer,
3640
acceptHeader,
3741
requestType,
42+
detectionMethod,
3843
}),
3944
});
4045

apps/docs/proxy.ts

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import {
88
} from "next/server";
99
import { rootFlags } from "@/flags";
1010
import { i18n } from "@/lib/geistdocs/i18n";
11+
import { isAIAgent } from "@/lib/ai-agent-detection";
1112
import { trackMdRequest } from "@/lib/geistdocs/md-tracking";
1213

1314
const { rewrite: rewriteLLM } = rewritePath(
@@ -63,6 +64,35 @@ const proxy = async (request: NextRequest, context: NextFetchEvent) => {
6364
}
6465
}
6566

67+
// AI agent detection — rewrite docs pages to markdown for agents
68+
// so they always get structured content without needing .md URLs or Accept headers
69+
if (
70+
(pathname === "/docs" || pathname.startsWith("/docs/")) &&
71+
!pathname.includes("/llms.mdx/")
72+
) {
73+
const agentResult = isAIAgent(request);
74+
if (agentResult.detected && !isMarkdownPreferred(request)) {
75+
const result =
76+
pathname === "/docs"
77+
? `/${i18n.defaultLanguage}/llms.mdx`
78+
: rewriteLLM(pathname);
79+
80+
if (result) {
81+
context.waitUntil(
82+
trackMdRequest({
83+
path: pathname,
84+
userAgent: request.headers.get("user-agent"),
85+
referer: request.headers.get("referer"),
86+
acceptHeader: request.headers.get("accept"),
87+
requestType: "agent-rewrite",
88+
detectionMethod: agentResult.method,
89+
})
90+
);
91+
return NextResponse.rewrite(new URL(result, request.nextUrl));
92+
}
93+
}
94+
}
95+
6696
// Handle Accept header content negotiation and track the request
6797
if (isMarkdownPreferred(request)) {
6898
const result = rewriteLLM(pathname);

0 commit comments

Comments
 (0)