|
| 1 | +/** |
| 2 | + * RFC 9309-compliant robots.txt parsing and path matching utilities. |
| 3 | + * Reference: https://www.rfc-editor.org/rfc/rfc9309.html |
| 4 | + */ |
| 5 | + |
/** A single Allow/Disallow rule compiled from one robots.txt line. */
export type RobotsRule = {
  /** Whether matching paths are permitted ("allow") or forbidden ("disallow"). */
  type: "allow" | "disallow";
  /** The raw path pattern as written in robots.txt (may contain '*' and a trailing '$'). */
  pattern: string;
  /** Compiled matcher: anchored at path start, '*' becomes '.*', optional '$' end anchor. */
  regex: RegExp;
  length: number; // pattern length excluding a trailing '$'; used for longest-match precedence
};
| 12 | + |
/** Result of parseRobots: the rules applicable to the requested user agent. */
export type ParsedRobots = {
  /** Applicable rules in file order; empty when no group applied. */
  rules: RobotsRule[];
};
| 16 | + |
| 17 | +function escapeRegexLiteral(input: string): string { |
| 18 | + return input.replace(/[.+?^${}()|[\]\\]/g, "\\$&"); |
| 19 | +} |
| 20 | + |
| 21 | +function compilePattern(pattern: string): { regex: RegExp; length: number } { |
| 22 | + let anchorToEnd = false; |
| 23 | + let raw = pattern.trim(); |
| 24 | + |
| 25 | + if (raw.endsWith("$")) { |
| 26 | + anchorToEnd = true; |
| 27 | + raw = raw.slice(0, -1); |
| 28 | + } |
| 29 | + |
| 30 | + // Convert path-pattern to regex: wildcards (*) match any char seq. |
| 31 | + // We also match from the beginning of the path, and optionally anchor to end. |
| 32 | + const escaped = escapeRegexLiteral(raw).replace(/\*/g, ".*"); |
| 33 | + const source = `^${escaped}${anchorToEnd ? "$" : ""}`; |
| 34 | + const regex = new RegExp(source); |
| 35 | + |
| 36 | + // For precedence, RFC uses longest match; we approximate by pattern length |
| 37 | + // excluding a trailing '$' if present. |
| 38 | + const length = raw.length; |
| 39 | + return { regex, length }; |
| 40 | +} |
| 41 | + |
| 42 | +function normalizeUserAgentToken(userAgent: string): string { |
| 43 | + // Extract a product token from a full UA string (e.g. "merln/rss-bot (...)" -> "merln"). |
| 44 | + const token = (userAgent.split("/")[0] || userAgent).trim(); |
| 45 | + return token; |
| 46 | +} |
| 47 | + |
| 48 | +/** |
| 49 | + * Parse robots.txt and return rules relevant to the provided userAgent. |
| 50 | + * |
| 51 | + * Group selection rules implemented: |
| 52 | + * - Accumulate rules for any group where User-agent matches our product token |
| 53 | + * (case-insensitive) or the full UA string. If none match specifically, |
| 54 | + * fall back to the groups with User-agent: *. |
| 55 | + * - If neither specific nor global groups exist, there are no applicable rules. |
| 56 | + * |
| 57 | + * Rule semantics: |
| 58 | + * - Support Allow and Disallow (case-insensitive). |
| 59 | + * - Empty Disallow means allow everything (ignored as a rule). |
| 60 | + * - Patterns support '*' wildcard and '$' end-anchor per common practice and RFC 9309. |
| 61 | + */ |
| 62 | +export function parseRobots( |
| 63 | + robotsTxt: string, |
| 64 | + userAgent: string |
| 65 | +): ParsedRobots { |
| 66 | + const lines = robotsTxt.split(/\r?\n/); |
| 67 | + |
| 68 | + type Group = { agents: string[]; rules: RobotsRule[] }; |
| 69 | + const groups: Group[] = []; |
| 70 | + let currentGroup: Group | null = null; |
| 71 | + |
| 72 | + const ourToken = normalizeUserAgentToken(userAgent).toLowerCase(); |
| 73 | + const ourFull = userAgent.toLowerCase(); |
| 74 | + |
| 75 | + for (const rawLine of lines) { |
| 76 | + const lineWithoutComment = (() => { |
| 77 | + const hashIndex = rawLine.indexOf("#"); |
| 78 | + return (hashIndex >= 0 ? rawLine.slice(0, hashIndex) : rawLine).trim(); |
| 79 | + })(); |
| 80 | + if (!lineWithoutComment) continue; |
| 81 | + |
| 82 | + const uaMatch = lineWithoutComment.match(/^user-agent\s*:\s*(.+)$/i); |
| 83 | + if (uaMatch) { |
| 84 | + const token = (uaMatch[1] ?? "").trim(); |
| 85 | + if (!token) continue; |
| 86 | + // If we already started a group AND it has rules, this UA starts a new group. |
| 87 | + if (!currentGroup || currentGroup.rules.length > 0) { |
| 88 | + currentGroup = { agents: [], rules: [] }; |
| 89 | + groups.push(currentGroup); |
| 90 | + } |
| 91 | + currentGroup.agents.push(token); |
| 92 | + continue; |
| 93 | + } |
| 94 | + |
| 95 | + // A rule must follow at least one user-agent line; otherwise skip |
| 96 | + if (!currentGroup) continue; |
| 97 | + |
| 98 | + const allowMatch = lineWithoutComment.match(/^allow\s*:\s*(.*)$/i); |
| 99 | + if (allowMatch) { |
| 100 | + const pattern = (allowMatch[1] ?? "").trim(); |
| 101 | + if (!pattern) continue; |
| 102 | + const { regex, length } = compilePattern(pattern); |
| 103 | + currentGroup.rules.push({ type: "allow", pattern, regex, length }); |
| 104 | + continue; |
| 105 | + } |
| 106 | + |
| 107 | + const disallowMatch = lineWithoutComment.match(/^disallow\s*:\s*(.*)$/i); |
| 108 | + if (disallowMatch) { |
| 109 | + const pattern = (disallowMatch[1] ?? "").trim(); |
| 110 | + // Empty Disallow means allow all → ignore as a rule |
| 111 | + if (!pattern) continue; |
| 112 | + const { regex, length } = compilePattern(pattern); |
| 113 | + currentGroup.rules.push({ type: "disallow", pattern, regex, length }); |
| 114 | + continue; |
| 115 | + } |
| 116 | + |
| 117 | + // Ignore other directives (Sitemap, Crawl-delay, etc.) |
| 118 | + } |
| 119 | + |
| 120 | + const specificRules: RobotsRule[] = []; |
| 121 | + const globalRules: RobotsRule[] = []; |
| 122 | + |
| 123 | + for (const g of groups) { |
| 124 | + const agents = g.agents.map((a) => a.toLowerCase()); |
| 125 | + const isGlobal = agents.includes("*"); |
| 126 | + const isSpecific = agents.some((a) => a === ourToken || a === ourFull); |
| 127 | + |
| 128 | + if (isSpecific) specificRules.push(...g.rules); |
| 129 | + else if (isGlobal) globalRules.push(...g.rules); |
| 130 | + } |
| 131 | + |
| 132 | + if (specificRules.length) return { rules: specificRules }; |
| 133 | + if (globalRules.length) return { rules: globalRules }; |
| 134 | + return { rules: [] }; |
| 135 | +} |
| 136 | + |
| 137 | +/** |
| 138 | + * Determine if a given path (including optional query string) is allowed. |
| 139 | + * Implements longest-match precedence: select the matching rule with the |
| 140 | + * greatest length; on ties, an Allow rule wins. |
| 141 | + */ |
| 142 | +export function isPathAllowed( |
| 143 | + robotsTxt: string, |
| 144 | + pathWithQuery: string, |
| 145 | + userAgent: string |
| 146 | +): boolean { |
| 147 | + const { rules } = parseRobots(robotsTxt, userAgent); |
| 148 | + if (!rules.length) return true; |
| 149 | + |
| 150 | + let best: RobotsRule | undefined; |
| 151 | + for (const rule of rules) { |
| 152 | + if (rule.regex.test(pathWithQuery)) { |
| 153 | + if (!best) best = rule; |
| 154 | + else if (rule.length > best.length) best = rule; |
| 155 | + else if (rule.length === best.length && rule.type === "allow") |
| 156 | + best = rule; |
| 157 | + } |
| 158 | + } |
| 159 | + |
| 160 | + if (!best) return true; |
| 161 | + return best.type === "allow"; |
| 162 | +} |
0 commit comments