|
| 1 | +/** |
| 2 | + * RFC 9309-compliant robots.txt parsing and path matching utilities. |
| 3 | + * Reference: https://www.rfc-editor.org/rfc/rfc9309.html |
| 4 | + */ |
| 5 | + |
/** A single Allow/Disallow rule compiled from one robots.txt line. */
export type RobotsRule = {
  /** Whether matching paths are permitted ("allow") or forbidden ("disallow"). */
  type: "allow" | "disallow";
  /** The raw path pattern as written in robots.txt (may contain '*' and a trailing '$'). */
  pattern: string;
  /** Compiled matcher: anchored at path start, '*' becomes '.*', optional '$' end anchor. */
  regex: RegExp;
  length: number; // pattern length excluding a trailing '$'; used for longest-match precedence
};
| 12 | + |
/** Result of parseRobots: the rules applicable to the requested user agent. */
export type ParsedRobots = {
  /** Applicable rules in file order; empty when no group applied. */
  rules: RobotsRule[];
};
| 16 | + |
| 17 | +function escapeRegexLiteral(input: string): string { |
| 18 | + return input.replace(/[.+?^${}()|[\]\\]/g, "\\$&"); |
| 19 | +} |
| 20 | + |
| 21 | +function compilePattern(pattern: string): { regex: RegExp; length: number } { |
| 22 | + let anchorToEnd = false; |
| 23 | + let raw = pattern.trim(); |
| 24 | + |
| 25 | + if (raw.endsWith("$")) { |
| 26 | + anchorToEnd = true; |
| 27 | + raw = raw.slice(0, -1); |
| 28 | + } |
| 29 | + |
| 30 | + // Convert path-pattern to regex: wildcards (*) match any char seq. |
| 31 | + // We also match from the beginning of the path, and optionally anchor to end. |
| 32 | + const escaped = escapeRegexLiteral(raw).replace(/\*/g, ".*"); |
| 33 | + const source = `^${escaped}${anchorToEnd ? "$" : ""}`; |
| 34 | + const regex = new RegExp(source); |
| 35 | + |
| 36 | + // For precedence, RFC uses longest match; we approximate by pattern length |
| 37 | + // excluding a trailing '$' if present. |
| 38 | + const length = raw.length; |
| 39 | + return { regex, length }; |
| 40 | +} |
| 41 | + |
| 42 | +function normalizeUserAgentToken(userAgent: string): string { |
| 43 | + // Extract a product token from a full UA string (e.g. "merln/rss-bot (...)" -> "merln"). |
| 44 | + const token = (userAgent.split("/")[0] || userAgent).trim(); |
| 45 | + return token; |
| 46 | +} |
| 47 | + |
| 48 | +/** |
| 49 | + * Parse robots.txt and return rules relevant to the provided userAgent. |
| 50 | + * |
| 51 | + * Group selection rules implemented: |
| 52 | + * - Accumulate rules for any group where User-agent matches our product token |
| 53 | + * (case-insensitive) or the full UA string. If none match specifically, |
| 54 | + * fall back to the groups with User-agent: *. |
| 55 | + * - If neither specific nor global groups exist, there are no applicable rules. |
| 56 | + * |
| 57 | + * Rule semantics: |
| 58 | + * - Support Allow and Disallow (case-insensitive). |
| 59 | + * - Empty Disallow means allow everything (ignored as a rule). |
| 60 | + * - Patterns support '*' wildcard and '$' end-anchor per common practice and RFC 9309. |
| 61 | + */ |
| 62 | +export function parseRobots( |
| 63 | + robotsTxt: string, |
| 64 | + userAgent: string |
| 65 | +): ParsedRobots { |
| 66 | + const lines = robotsTxt.split(/\r?\n/); |
| 67 | + |
| 68 | + type Group = { agents: string[]; rules: RobotsRule[] }; |
| 69 | + const groups: Group[] = []; |
| 70 | + let currentGroup: Group | null = null; |
| 71 | + |
| 72 | + const ourToken = normalizeUserAgentToken(userAgent).toLowerCase(); |
| 73 | + const ourFull = userAgent.toLowerCase(); |
| 74 | + |
| 75 | + for (const rawLine of lines) { |
| 76 | + const lineWithoutComment = (() => { |
| 77 | + const hashIndex = rawLine.indexOf("#"); |
| 78 | + return (hashIndex >= 0 ? rawLine.slice(0, hashIndex) : rawLine).trim(); |
| 79 | + })(); |
| 80 | + if (!lineWithoutComment) continue; |
| 81 | + |
| 82 | + const uaMatch = lineWithoutComment.match(/^user-agent\s*:\s*(.+)$/i); |
| 83 | + if (uaMatch) { |
| 84 | + const token = (uaMatch[1] ?? "").trim(); |
| 85 | + if (!token) continue; |
| 86 | + // If we already started a group AND it has rules, this UA starts a new group. |
| 87 | + if (!currentGroup || currentGroup.rules.length > 0) { |
| 88 | + currentGroup = { agents: [], rules: [] }; |
| 89 | + groups.push(currentGroup); |
| 90 | + } |
| 91 | + currentGroup.agents.push(token); |
| 92 | + continue; |
| 93 | + } |
| 94 | + |
| 95 | + // A rule must follow at least one user-agent line; otherwise skip |
| 96 | + if (!currentGroup) continue; |
| 97 | + |
| 98 | + const allowMatch = lineWithoutComment.match(/^allow\s*:\s*(.*)$/i); |
| 99 | + if (allowMatch) { |
| 100 | + const pattern = (allowMatch[1] ?? "").trim(); |
| 101 | + if (!pattern) continue; |
| 102 | + const { regex, length } = compilePattern(pattern); |
| 103 | + currentGroup.rules.push({ type: "allow", pattern, regex, length }); |
| 104 | + continue; |
| 105 | + } |
| 106 | + |
| 107 | + const disallowMatch = lineWithoutComment.match(/^disallow\s*:\s*(.*)$/i); |
| 108 | + if (disallowMatch) { |
| 109 | + const pattern = (disallowMatch[1] ?? "").trim(); |
| 110 | + // Empty Disallow means allow all → ignore as a rule |
| 111 | + if (!pattern) continue; |
| 112 | + const { regex, length } = compilePattern(pattern); |
| 113 | + currentGroup.rules.push({ type: "disallow", pattern, regex, length }); |
| 114 | + continue; |
| 115 | + } |
| 116 | + |
| 117 | + // Ignore other directives (Sitemap, Crawl-delay, etc.) |
| 118 | + } |
| 119 | + |
| 120 | + const specificRules: RobotsRule[] = []; |
| 121 | + const globalRules: RobotsRule[] = []; |
| 122 | + |
| 123 | + for (const g of groups) { |
| 124 | + const agents = g.agents.map((a) => a.toLowerCase()); |
| 125 | + const isGlobal = agents.includes("*"); |
| 126 | + const isSpecific = agents.some((a) => a === ourToken || a === ourFull); |
| 127 | + |
| 128 | + if (isSpecific) specificRules.push(...g.rules); |
| 129 | + else if (isGlobal) globalRules.push(...g.rules); |
| 130 | + } |
| 131 | + |
| 132 | + if (specificRules.length) return { rules: specificRules }; |
| 133 | + if (globalRules.length) return { rules: globalRules }; |
| 134 | + return { rules: [] }; |
| 135 | +} |
| 136 | + |
| 137 | +/** |
| 138 | + * Determine if a given path (including optional query string) is allowed. |
| 139 | + * Implements longest-match precedence: select the matching rule with the |
| 140 | + * greatest length; on ties, an Allow rule wins. |
| 141 | + */ |
| 142 | +export function isPathAllowed( |
| 143 | + robotsTxt: string, |
| 144 | + pathWithQuery: string, |
| 145 | + userAgent: string |
| 146 | +): boolean { |
| 147 | + const { rules } = parseRobots(robotsTxt, userAgent); |
| 148 | + if (!rules.length) return true; |
| 149 | + |
| 150 | + let best: RobotsRule | undefined; |
| 151 | + for (const rule of rules) { |
| 152 | + if (rule.regex.test(pathWithQuery)) { |
| 153 | + if (!best) best = rule; |
| 154 | + else if (rule.length > best.length) best = rule; |
| 155 | + else if (rule.length === best.length && rule.type === "allow") |
| 156 | + best = rule; |
| 157 | + } |
| 158 | + } |
| 159 | + |
| 160 | + if (!best) return true; |
| 161 | + return best.type === "allow"; |
| 162 | +} |
0 commit comments