Skip to content

Commit 4e3eac6

Browse files
committed
middleware parses substrings too
1 parent b1e0c81 commit 4e3eac6

2 files changed

Lines changed: 138 additions & 47 deletions

File tree

__tests__/middleware.test.ts

Lines changed: 72 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,21 @@ import type { NextRequest } from 'next/server';
44
import { middleware } from '../middleware';
55

66
const createRequest = (url: string, userAgent: string = '') => {
7-
const parsedUrl = new URL(url);
7+
const u = new URL(url);
8+
const nextUrl = {
9+
// minimal NextURL-ish shape the middleware actually uses
10+
pathname: u.pathname,
11+
search: u.search,
12+
searchParams: new URLSearchParams(u.search),
13+
hash: u.hash,
14+
href: u.href,
15+
origin: u.origin,
16+
clone: () => new URL(u.href),
17+
toString: () => u.toString(),
18+
};
19+
820
return {
9-
nextUrl: {
10-
...parsedUrl,
11-
clone: () => new URL(parsedUrl.href),
12-
pathname: parsedUrl.pathname,
13-
search: parsedUrl.search,
14-
searchParams: parsedUrl.searchParams,
15-
hash: parsedUrl.hash,
16-
},
21+
nextUrl,
1722
headers: {
1823
get: (name: string) => (name.toLowerCase() === 'user-agent' ? userAgent : null),
1924
},
@@ -25,9 +30,6 @@ const botUAs = [
2530
'Mozilla/5.0 (compatible; bingbot/2.0)',
2631
'GPTBot/1.0',
2732
'Claude-Web/1.0',
28-
'ChatGPT-User/1.0',
29-
'facebookexternalhit/1.1',
30-
'Twitterbot/1.0',
3133
'googlebot',
3234
'GOOGLEBOT',
3335
'GoogleBot',
@@ -39,8 +41,22 @@ const humanUAs = [
3941
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0',
4042
];
4143

42-
describe('Middleware - Bot Redirects', () => {
43-
it('redirects bots from / to /bot', () => {
44+
// User-Agent samples for non-browser HTTP libraries/CLI tools; the suite expects
// each of these to be redirected to the /bot/* tree.
const scriptedClients: string[] = [
  'curl/8.5.0',
  'Wget/1.21.3',
  'python-requests/2.32.3',
  'Apache-HttpClient/4.5.14',
  'okhttp/4.12.0',
  'GuzzleHttp/7',
  'libwww-perl/6.68',
  'Go-http-client/1.1',
  'Java/11.0.24',
  'node-fetch/1.0',
  'axios/1.7.0',
];
57+
58+
describe('Middleware - Bot/Agent Redirects with AFL heuristics', () => {
59+
it('redirects known bot UAs from / to /bot', () => {
4460
for (const ua of botUAs) {
4561
const res = middleware(createRequest('http://localhost:3000/', ua));
4662
expect(res.status).toBe(302);
@@ -51,7 +67,7 @@ describe('Middleware - Bot Redirects', () => {
5167
}
5268
});
5369

54-
it('redirects bots from any page to /bot/* (no bot/bot loops)', () => {
70+
it('redirects known bot UAs from any page to /bot/* (no /bot loops)', () => {
5571
for (const ua of botUAs) {
5672
const res = middleware(createRequest('http://localhost:3000/gallery', ua));
5773
expect(res.status).toBe(302);
@@ -62,29 +78,25 @@ describe('Middleware - Bot Redirects', () => {
6278
});
6379

6480
it('preserves query strings on redirect', () => {
65-
const res = middleware(
66-
createRequest('http://localhost:3000/about?ref=twitter&x=1', 'Googlebot')
67-
);
81+
const res = middleware(createRequest('http://localhost:3000/about?ref=twitter&x=1', 'Googlebot'));
6882
expect(res.status).toBe(302);
6983
const loc = new URL(res.headers.get('Location') || '');
7084
expect(loc.pathname).toBe('/bot/about');
7185
expect(loc.searchParams.get('ref')).toBe('twitter');
7286
expect(loc.searchParams.get('x')).toBe('1');
7387
});
7488

75-
it('does not redirect already /bot/* paths (for bots or humans)', () => {
89+
it('does not redirect already /bot/* paths (bots or humans)', () => {
7690
const botRes = middleware(createRequest('http://localhost:3000/bot/gallery', 'Googlebot'));
7791
expect(botRes.headers.get('Location')).toBeNull();
7892
expect(botRes.status).toBe(200);
7993

80-
const humanRes = middleware(
81-
createRequest('http://localhost:3000/bot/gallery', humanUAs[0])
82-
);
94+
const humanRes = middleware(createRequest('http://localhost:3000/bot/gallery', humanUAs[0]));
8395
expect(humanRes.headers.get('Location')).toBeNull();
8496
expect(humanRes.status).toBe(200);
8597
});
8698

87-
it('does not redirect API, _next, static, or asset files even for bots', () => {
99+
it('bypasses API, _next, static, and asset files even for bots', () => {
88100
const skipPaths = [
89101
'http://localhost:3000/api/test',
90102
'http://localhost:3000/_next/static/chunk.js',
@@ -105,32 +117,55 @@ describe('Middleware - Bot Redirects', () => {
105117
}
106118
});
107119

108-
it('does not redirect human user agents', () => {
120+
it('does not redirect normal human browsers', () => {
109121
for (const ua of humanUAs) {
110122
const res = middleware(createRequest('http://localhost:3000/gallery', ua));
111123
expect(res.headers.get('Location')).toBeNull();
112124
expect(res.status).toBe(200);
113125
}
114126
});
115127

116-
it('handles deep nested paths correctly', () => {
117-
const res = middleware(
118-
createRequest('http://localhost:3000/deep/nested/path', 'Googlebot')
119-
);
128+
it('redirects generic crawler terms (substring backstop)', () => {
129+
const ua = 'MyAwesomeCrawler/1.0 (+https://example.com)';
130+
const res = middleware(createRequest('http://localhost:3000/deep/nested/path', ua));
120131
expect(res.status).toBe(302);
121132
const loc = new URL(res.headers.get('Location') || '');
122133
expect(loc.pathname).toBe('/bot/deep/nested/path');
123134
});
124135

125-
it('treats missing/empty user-agent as human (no redirect)', () => {
126-
const res1 = middleware(createRequest('http://localhost:3000/gallery'));
127-
expect(res1.headers.get('Location')).toBeNull();
128-
expect(res1.status).toBe(200);
136+
it('redirects scripted HTTP clients to /bot/*', () => {
137+
for (const ua of scriptedClients) {
138+
const res = middleware(createRequest('http://localhost:3000/gallery', ua));
139+
expect(res.status).toBe(302);
140+
const loc = new URL(res.headers.get('Location') || '');
141+
expect(loc.pathname).toBe('/bot/gallery');
142+
}
143+
});
144+
145+
it('treats empty or missing user-agent as scripted client (redirects to /bot/*)', () => {
146+
const res1 = middleware(createRequest('http://localhost:3000/gallery'));
147+
expect(res1.status).toBe(302);
148+
const loc1 = new URL(res1.headers.get('Location') || '');
149+
expect(loc1.pathname).toBe('/bot/gallery');
129150

130-
const res2 = middleware(createRequest('http://localhost:3000/gallery', ' '));
131-
expect(res2.headers.get('Location')).toBeNull();
132-
expect(res2.status).toBe(200);
133-
});
151+
const res2 = middleware(createRequest('http://localhost:3000/gallery', ' '));
152+
expect(res2.status).toBe(302);
153+
const loc2 = new URL(res2.headers.get('Location') || '');
154+
expect(loc2.pathname).toBe('/bot/gallery');
155+
});
134156

135-
157+
it('override: ?afl=human prevents redirect even for bots', () => {
158+
const res = middleware(createRequest('http://localhost:3000/gallery?afl=human', 'Googlebot'));
159+
expect(res.headers.get('Location')).toBeNull();
160+
expect(res.status).toBe(200);
161+
});
162+
163+
it('override: ?afl=bot forces redirect even for humans and strips the override key', () => {
164+
const res = middleware(createRequest('http://localhost:3000/gallery?afl=bot&ref=x', humanUAs[0]));
165+
expect(res.status).toBe(302);
166+
const loc = new URL(res.headers.get('Location') || '');
167+
expect(loc.pathname).toBe('/bot/gallery');
168+
expect(loc.searchParams.get('ref')).toBe('x');
169+
expect(loc.searchParams.get('afl')).toBeNull();
170+
});
136171
});

middleware.ts

Lines changed: 66 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,29 +3,85 @@ import { NextResponse } from 'next/server';
33
import type { NextRequest } from 'next/server';
44
import { isBotUserAgent } from '@lib/bot/userAgents';
55

6-
export function middleware(request: NextRequest) {
7-
const ua = request.headers.get('user-agent') || '';
8-
const { pathname } = request.nextUrl;
6+
/**
 * Reports whether a User-Agent string looks like a generic crawler.
 *
 * Matching is a case-insensitive *substring* search for "bot", "crawl"
 * (which also covers "crawler") and "spider", so terms embedded in a longer
 * product token such as "MyAwesomeCrawler/1.0" or "Googlebot" are detected.
 *
 * @param uaRaw - Raw User-Agent header value.
 * @returns `true` when the value is empty/whitespace-only or contains a
 *   crawler term; `false` otherwise.
 */
function isGenericCrawler(uaRaw: string): boolean {
  const norm = (uaRaw ?? '').trim();
  if (!norm) return true; // empty/whitespace UA => treat as scripted/automated

  // Deliberately no \b anchors: a word boundary never occurs between two
  // letters, so /\bcrawler\b/ cannot match inside "MyAwesomeCrawler".
  // The (?<!cu) lookbehind keeps "CUBOT" from counting as "bot".
  // NOTE(review): CUBOT is presumably an Android handset brand that appears in
  // real browser UAs — confirm against production traffic.
  return /(?<!cu)bot|crawl|spider/i.test(norm);
}
13+
14+
/**
 * Reports whether a User-Agent string belongs to a common scripted HTTP client
 * (command-line tools and HTTP libraries).
 *
 * @param uaRaw - Raw User-Agent header value.
 * @returns `true` when the value is empty/whitespace-only or its lower-cased
 *   form contains one of the known client markers; `false` otherwise.
 */
function isScriptedHttpClient(uaRaw: string): boolean {
  const trimmed = (uaRaw ?? '').trim();
  if (trimmed.length === 0) {
    // empty/whitespace UA
    return true;
  }

  const lowered = trimmed.toLowerCase();
  const clientMarkers = [
    'curl',
    'wget',
    'python-requests',
    'httpclient', // Apache-HttpClient
    'okhttp',
    'guzzlehttp',
    'libwww-perl',
    'dart/',
    'go-http-client',
    'java/',
    'node-fetch',
    'axios/',
  ];

  for (const marker of clientMarkers) {
    if (lowered.includes(marker)) {
      return true;
    }
  }
  return false;
}
38+
39+
/**
 * Decides whether the middleware should leave a request untouched.
 *
 * Bypassed paths are: the bot tree itself (`/bot` and anything under `/bot/`),
 * `/api/`, `/_next/`, `/static/`, and any path ending in a known static-asset
 * extension.
 *
 * @param pathname - URL pathname of the incoming request.
 * @returns `true` when the request must not be redirected.
 */
function shouldBypass(pathname: string): boolean {
  // Match the bot tree on a path-segment boundary; a bare startsWith('/bot')
  // would also swallow unrelated routes such as "/bottle" or "/botany".
  const isBotPage = pathname === '/bot' || pathname.startsWith('/bot/');

  return (
    isBotPage ||
    pathname.startsWith('/api/') ||
    pathname.startsWith('/_next/') ||
    pathname.startsWith('/static/') ||
    /\.(?:ico|png|jpg|jpeg|webp|svg|gif|css|js|json|xml|txt|map)$/i.test(pathname)
  );
}
48+
49+
export function middleware(request: NextRequest) {
50+
const uaHeader = request.headers.get('user-agent') || '';
51+
const url = request.nextUrl;
52+
const { pathname, searchParams } = url;
53+
54+
if (shouldBypass(pathname)) {
1855
return NextResponse.next();
1956
}
2057

21-
if (!isBotUserAgent(ua)) {
58+
// Manual overrides
59+
const afl = searchParams.get('afl');
60+
if (afl === 'human') {
61+
return NextResponse.next();
62+
}
63+
if (afl === 'bot') {
64+
const forced = url.clone();
65+
forced.pathname = pathname === '/' ? '/bot' : `/bot${pathname}`;
66+
forced.searchParams.delete('afl');
67+
const res = NextResponse.redirect(forced, 302);
68+
res.headers.set('Vary', 'User-Agent');
69+
res.headers.set('Cache-Control', 'no-store');
70+
return res;
71+
}
72+
73+
// Detection: strict list + generic crawler + scripted clients
74+
const looksAutomated =
75+
isBotUserAgent(uaHeader) || isGenericCrawler(uaHeader) || isScriptedHttpClient(uaHeader);
76+
77+
if (!looksAutomated) {
2278
return NextResponse.next();
2379
}
2480

25-
const url = request.nextUrl.clone();
26-
url.pathname = pathname === '/' ? '/bot' : `/bot${pathname}`;
81+
const dest = url.clone();
82+
dest.pathname = pathname === '/' ? '/bot' : `/bot${pathname}`;
2783

28-
const res = NextResponse.redirect(url, 302);
84+
const res = NextResponse.redirect(dest, 302);
2985
res.headers.set('Vary', 'User-Agent');
3086
res.headers.set('Cache-Control', 'no-store');
3187
return res;

0 commit comments

Comments
 (0)