Commit 48d17f3

keep tests new modules

1 parent 0fd42df commit 48d17f3

File tree

6 files changed: +305 -300 lines changed

.cache/rss-cache.json

Lines changed: 2 additions & 2 deletions
@@ -3,8 +3,8 @@
     "etag": "W/\"6c83011f81bbb055bdee75ae39d61ab2\""
   },
   "https://spin.atomicobject.com/author/nathan-papes/feed/atom/": {
-    "etag": "W/\"7f8560e9c207aff29a153747b7cc6be0\"",
-    "lastModified": "Thu, 07 Aug 2025 15:50:45 GMT"
+    "etag": "W/\"42d099d3a398f7701824177ca25797a7\"",
+    "lastModified": "Sat, 09 Aug 2025 12:00:55 GMT"
   },
   "https://www.youtube.com/feeds/videos.xml?channel_id=UCPwv65XQty1QbqE04FitBlQ": {}
 }
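
The etag and lastModified values stored per feed URL are HTTP cache validators; presumably the fetcher replays them as If-None-Match / If-Modified-Since headers so an unchanged feed comes back as a 304 with no body. A minimal sketch of that pattern (the conditionalFetch helper and the cache shape are assumptions, not part of this commit):

// Hypothetical sketch: replay cached validators as conditional request
// headers; a 304 means the feed is unchanged and parsing can be skipped.
type CacheEntry = { etag?: string; lastModified?: string };

async function conditionalFetch(
  url: string,
  cache: Record<string, CacheEntry>
): Promise<Response> {
  const entry = cache[url] ?? {};
  const headers: Record<string, string> = {};
  if (entry.etag) headers["If-None-Match"] = entry.etag;
  if (entry.lastModified) headers["If-Modified-Since"] = entry.lastModified;

  const res = await fetch(url, { headers });
  if (res.status !== 304) {
    // Remember the new validators for the next poll.
    cache[url] = {
      etag: res.headers.get("etag") ?? undefined,
      lastModified: res.headers.get("last-modified") ?? undefined,
    };
  }
  return res;
}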

packages/core/src/index.ts

Lines changed: 2 additions & 1 deletion
@@ -1,2 +1,3 @@
-export * from "./schema.js";
 export * from "./generate.js";
+export * from "./robots.js";
+export * from "./schema.js";

packages/core/src/robots.ts

Lines changed: 162 additions & 0 deletions
@@ -0,0 +1,162 @@
/**
 * RFC 9309-compliant robots.txt parsing and path matching utilities.
 * Reference: https://www.rfc-editor.org/rfc/rfc9309.html
 */

export type RobotsRule = {
  type: "allow" | "disallow";
  pattern: string;
  regex: RegExp;
  length: number; // used for longest-match precedence
};

export type ParsedRobots = {
  rules: RobotsRule[];
};

function escapeRegexLiteral(input: string): string {
  return input.replace(/[.+?^${}()|[\]\\]/g, "\\$&");
}

function compilePattern(pattern: string): { regex: RegExp; length: number } {
  let anchorToEnd = false;
  let raw = pattern.trim();

  if (raw.endsWith("$")) {
    anchorToEnd = true;
    raw = raw.slice(0, -1);
  }

  // Convert path-pattern to regex: wildcards (*) match any char seq.
  // We also match from the beginning of the path, and optionally anchor to end.
  const escaped = escapeRegexLiteral(raw).replace(/\*/g, ".*");
  const source = `^${escaped}${anchorToEnd ? "$" : ""}`;
  const regex = new RegExp(source);

  // For precedence, RFC uses longest match; we approximate by pattern length
  // excluding a trailing '$' if present.
  const length = raw.length;
  return { regex, length };
}

function normalizeUserAgentToken(userAgent: string): string {
  // Extract a product token from a full UA string (e.g. "merln/rss-bot (...)" -> "merln").
  const token = (userAgent.split("/")[0] || userAgent).trim();
  return token;
}

/**
 * Parse robots.txt and return rules relevant to the provided userAgent.
 *
 * Group selection rules implemented:
 * - Accumulate rules for any group where User-agent matches our product token
 *   (case-insensitive) or the full UA string. If none match specifically,
 *   fall back to the groups with User-agent: *.
 * - If neither specific nor global groups exist, there are no applicable rules.
 *
 * Rule semantics:
 * - Support Allow and Disallow (case-insensitive).
 * - Empty Disallow means allow everything (ignored as a rule).
 * - Patterns support '*' wildcard and '$' end-anchor per common practice and RFC 9309.
 */
export function parseRobots(
  robotsTxt: string,
  userAgent: string
): ParsedRobots {
  const lines = robotsTxt.split(/\r?\n/);

  type Group = { agents: string[]; rules: RobotsRule[] };
  const groups: Group[] = [];
  let currentGroup: Group | null = null;

  const ourToken = normalizeUserAgentToken(userAgent).toLowerCase();
  const ourFull = userAgent.toLowerCase();

  for (const rawLine of lines) {
    const lineWithoutComment = (() => {
      const hashIndex = rawLine.indexOf("#");
      return (hashIndex >= 0 ? rawLine.slice(0, hashIndex) : rawLine).trim();
    })();
    if (!lineWithoutComment) continue;

    const uaMatch = lineWithoutComment.match(/^user-agent\s*:\s*(.+)$/i);
    if (uaMatch) {
      const token = (uaMatch[1] ?? "").trim();
      if (!token) continue;
      // If we already started a group AND it has rules, this UA starts a new group.
      if (!currentGroup || currentGroup.rules.length > 0) {
        currentGroup = { agents: [], rules: [] };
        groups.push(currentGroup);
      }
      currentGroup.agents.push(token);
      continue;
    }

    // A rule must follow at least one user-agent line; otherwise skip
    if (!currentGroup) continue;

    const allowMatch = lineWithoutComment.match(/^allow\s*:\s*(.*)$/i);
    if (allowMatch) {
      const pattern = (allowMatch[1] ?? "").trim();
      if (!pattern) continue;
      const { regex, length } = compilePattern(pattern);
      currentGroup.rules.push({ type: "allow", pattern, regex, length });
      continue;
    }

    const disallowMatch = lineWithoutComment.match(/^disallow\s*:\s*(.*)$/i);
    if (disallowMatch) {
      const pattern = (disallowMatch[1] ?? "").trim();
      // Empty Disallow means allow all → ignore as a rule
      if (!pattern) continue;
      const { regex, length } = compilePattern(pattern);
      currentGroup.rules.push({ type: "disallow", pattern, regex, length });
      continue;
    }

    // Ignore other directives (Sitemap, Crawl-delay, etc.)
  }

  const specificRules: RobotsRule[] = [];
  const globalRules: RobotsRule[] = [];

  for (const g of groups) {
    const agents = g.agents.map((a) => a.toLowerCase());
    const isGlobal = agents.includes("*");
    const isSpecific = agents.some((a) => a === ourToken || a === ourFull);

    if (isSpecific) specificRules.push(...g.rules);
    else if (isGlobal) globalRules.push(...g.rules);
  }

  if (specificRules.length) return { rules: specificRules };
  if (globalRules.length) return { rules: globalRules };
  return { rules: [] };
}

/**
 * Determine if a given path (including optional query string) is allowed.
 * Implements longest-match precedence: select the matching rule with the
 * greatest length; on ties, an Allow rule wins.
 */
export function isPathAllowed(
  robotsTxt: string,
  pathWithQuery: string,
  userAgent: string
): boolean {
  const { rules } = parseRobots(robotsTxt, userAgent);
  if (!rules.length) return true;

  let best: RobotsRule | undefined;
  for (const rule of rules) {
    if (rule.regex.test(pathWithQuery)) {
      if (!best) best = rule;
      else if (rule.length > best.length) best = rule;
      else if (rule.length === best.length && rule.type === "allow")
        best = rule;
    }
  }

  if (!best) return true;
  return best.type === "allow";
}
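
For orientation, a small usage sketch of the longest-match precedence documented above (the robots.txt content here is illustrative, not from the commit; the import path mirrors the tests below):

// Illustrative only: the longer Disallow pattern outranks the shorter Allow.
import { isPathAllowed } from "@merln/core/src/robots.ts";

const robots = `User-agent: *
Allow: /blog/
Disallow: /blog/drafts/`;

isPathAllowed(robots, "/blog/post", "merln/rss-bot");        // true: only Allow (/blog/, length 6) matches
isPathAllowed(robots, "/blog/drafts/post", "merln/rss-bot"); // false: Disallow (length 13) beats Allow (length 6)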

packages/core/src/rss.test.ts

Lines changed: 107 additions & 0 deletions
@@ -0,0 +1,107 @@
import { isPathAllowed, parseRobots } from "@merln/core/src/robots.ts";
import { describe, expect, test } from "bun:test";

const USER_AGENT = "merln/rss-bot (+https://natepapes.com)";

describe("RFC 9309 robots matcher", () => {
  test("longest match (RFC example) and allow precedence", () => {
    const robots = `User-Agent: merln
Allow: /example/page/
Disallow: /example/page/disallowed.gif`;

    expect(isPathAllowed(robots, "/example/page/ok", USER_AGENT)).toBe(true);
    expect(
      isPathAllowed(robots, "/example/page/disallowed.gif", USER_AGENT)
    ).toBe(false);
  });

  test("empty Disallow means allow all for that agent", () => {
    const robots = `User-agent: merln\nDisallow:`;
    expect(isPathAllowed(robots, "/anything", USER_AGENT)).toBe(true);
  });

  test("fallback to global rules when no specific group", () => {
    const robots = `User-agent: Googlebot\nDisallow: /google-only/\n\nUser-agent: *\nDisallow: /admin/`;
    const { rules } = parseRobots(robots, USER_AGENT);
    expect(rules.length).toBe(1);
    expect(isPathAllowed(robots, "/admin/panel", USER_AGENT)).toBe(false);
    expect(isPathAllowed(robots, "/public", USER_AGENT)).toBe(true);
  });

  test("wildcard * and end-anchor $", () => {
    const robots = `User-Agent: *\nDisallow: *.gif$\nAllow: /publications/`;
    expect(isPathAllowed(robots, "/img/pic.gif", USER_AGENT)).toBe(false);
    expect(isPathAllowed(robots, "/img/pic.gifv", USER_AGENT)).toBe(true);
    expect(isPathAllowed(robots, "/publications/x", USER_AGENT)).toBe(true);
  });

  test("match by full UA string if provided", () => {
    const robots = `User-agent: ${USER_AGENT}\nDisallow: /special/`;
    expect(isPathAllowed(robots, "/special/x", USER_AGENT)).toBe(false);
  });
});

describe("Provider robots policies", () => {
  const youtubeRobots = `# robots.txt file for YouTube
User-agent: Mediapartners-Google*
Disallow:

User-agent: *
Disallow: /api/
Disallow: /comment
Disallow: /feeds/videos.xml
Disallow: /file_download
Disallow: /get_video
Disallow: /get_video_info
Disallow: /get_midroll_info
Disallow: /live_chat
Disallow: /login
Disallow: /qr
Disallow: /results
Disallow: /signup
Disallow: /t/terms
Disallow: /timedtext_video
Disallow: /verify_age
Disallow: /watch_ajax
Disallow: /watch_fragments_ajax
Disallow: /watch_popup
Disallow: /watch_queue_ajax
Disallow: /youtubei/
`;

  const githubRobots = `User-agent: *
Disallow: /*/*/commits/
Disallow: /*/raw/
Disallow: /gist/
Disallow: /search$
Disallow: /*.atom$`;

  const atomicRobots = `Crawl-delay: 10
User-agent: *
Disallow:
Sitemap: https://spin.atomicobject.com/sitemap_index.xml`;

  test("YouTube: typical video page allowed; certain APIs disallowed", () => {
    expect(isPathAllowed(youtubeRobots, "/watch?v=abc", USER_AGENT)).toBe(true);
    expect(isPathAllowed(youtubeRobots, "/feeds/videos.xml", USER_AGENT)).toBe(
      false
    );
    expect(
      isPathAllowed(youtubeRobots, "/youtubei/v1/browse", USER_AGENT)
    ).toBe(false);
  });

  test("GitHub: block raw, commits list, gist path, and anchored search", () => {
    expect(isPathAllowed(githubRobots, "/org/repo/commits/", USER_AGENT)).toBe(
      false
    );
    expect(isPathAllowed(githubRobots, "/foo/raw/", USER_AGENT)).toBe(false);
    expect(isPathAllowed(githubRobots, "/gist/", USER_AGENT)).toBe(false);
    expect(isPathAllowed(githubRobots, "/search", USER_AGENT)).toBe(false);
    expect(isPathAllowed(githubRobots, "/papes1ns", USER_AGENT)).toBe(true);
  });

  test("Atomic Object: allow all for *", () => {
    expect(isPathAllowed(atomicRobots, "/anything", USER_AGENT)).toBe(true);
  });
});
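
How these modules are wired into the fetcher is outside this diff, but a plausible integration would resolve robots.txt at the feed's origin and gate the request on isPathAllowed (the fetchFeedIfAllowed helper below is hypothetical). Such gating would also explain the empty cache entry for the YouTube feed above, since /feeds/videos.xml is disallowed in YouTube's robots.txt:

// Hypothetical integration (not part of this commit): check robots.txt
// before fetching the feed path itself.
import { isPathAllowed } from "@merln/core/src/robots.ts";

const USER_AGENT = "merln/rss-bot (+https://natepapes.com)";

async function fetchFeedIfAllowed(feedUrl: string): Promise<Response | null> {
  const url = new URL(feedUrl);
  const robotsRes = await fetch(`${url.origin}/robots.txt`, {
    headers: { "User-Agent": USER_AGENT },
  });
  // A missing or unreadable robots.txt is conventionally treated as allow-all.
  const robotsTxt = robotsRes.ok ? await robotsRes.text() : "";

  if (!isPathAllowed(robotsTxt, url.pathname + url.search, USER_AGENT)) {
    return null; // feed path disallowed for our agent; skip the fetch
  }
  return fetch(feedUrl, { headers: { "User-Agent": USER_AGENT } });
}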
