Hide disabled pages from LLMs

jbellerb · jbellerb · commit 5c10513e3302 · 2026-01-28T16:56:06.000-05:00
Previously, disabled pages (those marked as drafts) could still be accessed
at `/{slug}.md`; drafts are now excluded when those routes are generated.
This commit also updates llms.txt and llms-full.txt to pull dynamically from
the docs collection, which should prevent drift between the human-focused
and LLM-focused pages in the future.
diff --git a/src/lib/utils.ts b/src/lib/utils.ts
@@ -4,3 +4,82 @@ import { twMerge } from 'tailwind-merge';
 export function cn(...inputs: ClassValue[]) {
   return twMerge(clsx(inputs));
 }
+
+/**
+ * Converts an MDX document body into plain Markdown.
+ *
+ * Applied in order: strips the MDX imports at the start of the input,
+ * flattens `<Tabs>` into `**label:**` sections, rewrites `<CardGrid>` link
+ * cards as a bullet list, removes self-closing components, unwraps
+ * `<Callout>`, drops any remaining capitalised JSX tags, makes
+ * root-relative Markdown links absolute, and collapses runs of four or more
+ * newlines down to three.
+ *
+ * @param content - Raw MDX text.
+ * @returns The cleaned Markdown with surrounding whitespace trimmed.
+ */
+export function cleanMdxContent(content: string): string {
+  // Remove MDX import statements at the start of the file. The regex has no
+  // `m` flag on purpose: `^` must match only at offset 0, otherwise the first
+  // `import` line of a later code sample would be deleted instead.
+  content = content.replace(
+    /^(\s*import\s+.*?(?:from\s+['"].*?['"])?;?\s*\n)+/,
+    '',
+  );
+
+  // Process Tabs components - extract TabItem contents
+  content = content.replace(/<Tabs>[\s\S]*?<\/Tabs>/g, (match) => {
+    const results: string[] = [];
+    const tabItemRegex =
+      /<TabItem[^>]*label="([^"]*)"[^>]*>([\s\S]*?)(?=<TabItem|<\/Tabs>)/g;
+
+    for (const [, label, tabContent] of match.matchAll(tabItemRegex)) {
+      const cleanContent = tabContent.replace(/<\/TabItem>\s*$/, '').trim();
+      if (cleanContent) {
+        results.push(`**${label}:**\n${cleanContent}`);
+      }
+    }
+
+    return results.length > 0 ? results.join('\n\n') : '';
+  });
+
+  // Process CardGrid/LinkCard components - convert to markdown links
+  content = content.replace(/<CardGrid[^>]*>[\s\S]*?<\/CardGrid>/g, (match) => {
+    const links: string[] = [];
+    const linkCardRegex =
+      /<LinkCard\s+[^>]*href="([^"]*)"[^>]*title="([^"]*)"[^>]*description="([^"]*)"[^>]*\/>/g;
+
+    for (const [, href, title, description] of match.matchAll(linkCardRegex)) {
+      const fullUrl = href.startsWith('/')
+        ? `https://docs.sprites.dev${href}`
+        : href;
+      links.push(`- [${title}](${fullUrl}) - ${description}`);
+    }
+
+    return links.length > 0 ? links.join('\n') : '';
+  });
+
+  // Remove self-closing JSX/MDX components
+  content = content.replace(/<[A-Z][a-zA-Z]*\s+[^>]*\/>/g, '');
+
+  // Handle Callout components - keep content
+  content = content.replace(
+    /<Callout[^>]*>([\s\S]*?)<\/Callout>/g,
+    (_, inner) => inner.trim(),
+  );
+
+  // Remove remaining JSX component tags
+  content = content.replace(/<[A-Z][a-zA-Z]*[^>]*>/g, '');
+  content = content.replace(/<\/[A-Z][a-zA-Z]*>/g, '');
+
+  // Convert relative links to fully qualified URLs
+  content = content.replace(
+    /\[([^\]]+)\]\(\/([^)]*)\)/g,
+    (_, text, path) => `[${text}](https://docs.sprites.dev/${path})`,
+  );
+
+  // Clean up excessive blank lines
+  content = content.replace(/\n{4,}/g, '\n\n\n');
+
+  return content.trim();
+}
diff --git a/src/pages/[...slug].md.ts b/src/pages/[...slug].md.ts
@@ -1,80 +1,20 @@
 import { getCollection } from 'astro:content';
 import type { APIRoute, GetStaticPaths } from 'astro';
 
+import { cleanMdxContent } from '@/lib/utils';
+
 export const prerender = true;
 
 export const getStaticPaths: GetStaticPaths = async () => {
-  const docs = await getCollection('docs');
+  const docs = await getCollection('docs', ({ data }) => {
+    return data.draft !== true;
+  });
   return docs.map((doc) => ({
     params: { slug: doc.id },
     props: { doc },
   }));
 };
 
-function cleanMdxContent(content: string): string {
-  // Remove MDX import statements at the start of the file
-  content = content.replace(
-    /^(\s*import\s+.*?(?:from\s+['"].*?['"])?;?\s*\n)+/m,
-    '',
-  );
-
-  // Process Tabs components - extract TabItem contents
-  content = content.replace(/<Tabs>[\s\S]*?<\/Tabs>/g, (match) => {
-    const results: string[] = [];
-    const tabItemRegex =
-      /<TabItem[^>]*label="([^"]*)"[^>]*>([\s\S]*?)(?=<TabItem|<\/Tabs>)/g;
-
-    for (const [, label, tabContent] of match.matchAll(tabItemRegex)) {
-      const cleanContent = tabContent.replace(/<\/TabItem>\s*$/, '').trim();
-      if (cleanContent) {
-        results.push(`**${label}:**\n${cleanContent}`);
-      }
-    }
-
-    return results.length > 0 ? results.join('\n\n') : '';
-  });
-
-  // Process CardGrid/LinkCard components - convert to markdown links
-  content = content.replace(/<CardGrid[^>]*>[\s\S]*?<\/CardGrid>/g, (match) => {
-    const links: string[] = [];
-    const linkCardRegex =
-      /<LinkCard\s+[^>]*href="([^"]*)"[^>]*title="([^"]*)"[^>]*description="([^"]*)"[^>]*\/>/g;
-
-    for (const [, href, title, description] of match.matchAll(linkCardRegex)) {
-      const fullUrl = href.startsWith('/')
-        ? `https://docs.sprites.dev${href}`
-        : href;
-      links.push(`- [${title}](${fullUrl}) - ${description}`);
-    }
-
-    return links.length > 0 ? links.join('\n') : '';
-  });
-
-  // Remove self-closing JSX/MDX components
-  content = content.replace(/<[A-Z][a-zA-Z]*\s+[^>]*\/>/g, '');
-
-  // Handle Callout components - keep content
-  content = content.replace(
-    /<Callout[^>]*>([\s\S]*?)<\/Callout>/g,
-    (_, inner) => inner.trim(),
-  );
-
-  // Remove remaining JSX component tags
-  content = content.replace(/<[A-Z][a-zA-Z]*[^>]*>/g, '');
-  content = content.replace(/<\/[A-Z][a-zA-Z]*>/g, '');
-
-  // Convert relative links to fully qualified URLs
-  content = content.replace(
-    /\[([^\]]+)\]\(\/([^)]*)\)/g,
-    (_, text, path) => `[${text}](https://docs.sprites.dev/${path})`,
-  );
-
-  // Clean up excessive blank lines
-  content = content.replace(/\n{4,}/g, '\n\n\n');
-
-  return content.trim();
-}
-
 export const GET: APIRoute = async ({ props }) => {
   const { doc } = props as {
     doc: {
diff --git a/src/pages/llms-full.txt.ts b/src/pages/llms-full.txt.ts
@@ -1,148 +1,71 @@
-import { promises as fs } from 'node:fs';
-import { join } from 'node:path';
+import type { CollectionEntry } from 'astro:content';
+import { getCollection } from 'astro:content';
 import type { APIRoute } from 'astro';
+
+import { sidebarConfig } from '@/lib/sidebar';
+import { cleanMdxContent } from '@/lib/utils';
 
 export const prerender = true;
 
-// Use process.cwd() which is the project root during Astro build
-const docsDir = join(process.cwd(), 'src/content/docs');
-
-// Document order matching the sidebar structure
-const docOrder = [
-  // Getting Started
-  'index.mdx',
-  'quickstart.mdx',
-  'working-with-sprites.mdx',
-  // Concepts
-  'concepts/lifecycle.mdx',
-  'concepts/services.mdx',
-  'concepts/networking.mdx',
-  'concepts/checkpoints.mdx',
-  // CLI
-  'cli/installation.mdx',
-  'cli/authentication.mdx',
-  'cli/commands.mdx',
-  // SDKs
-  'sdks/javascript.mdx',
-  'sdks/go.mdx',
-  // API (generated)
-  'api/index.mdx',
-  'api/exec.mdx',
-  'api/checkpoints.mdx',
-  'api/services.mdx',
-  'api/proxy.mdx',
-  'api/policy.mdx',
-  'api/types.mdx',
-  // Reference
-  'reference/base-images.mdx',
-  'reference/configuration.mdx',
-  'reference/billing.mdx',
-];
-
-// Section headers for organization
-const sections: Record<string, string> = {
-  'index.mdx': '# Getting Started',
-  'concepts/lifecycle.mdx': '# Concepts',
-  'cli/installation.mdx': '# CLI',
-  'sdks/javascript.mdx': '# SDKs',
-  'api/index.mdx': '# API',
-  'reference/base-images.mdx': '# Reference',
+/** One sidebar section and the docs listed under it. */
+export type DocsGroup = {
+  label: string;
+  items: {
+    slug: string;
+    body: string;
+    title: string;
+    description?: string;
+  }[];
 };
 
-interface DocMeta {
-  title: string;
-  description?: string;
-}
-
-function extractFrontmatter(content: string): { meta: DocMeta; body: string } {
-  const frontmatterMatch = content.match(/^---\n([\s\S]*?)\n---\n([\s\S]*)$/);
-
-  if (!frontmatterMatch) {
-    return { meta: { title: 'Untitled' }, body: content };
-  }
-
-  const [, frontmatterStr, body] = frontmatterMatch;
-  const meta: DocMeta = { title: 'Untitled' };
+/**
+ * Collects the non-draft `docs` entries and groups them in sidebar order.
+ *
+ * Each entry of `sidebarConfig` becomes one group. Only its direct items
+ * that carry a `slug` are looked up; an item is skipped with a warning when
+ * no non-draft entry with a body has that id.
+ *
+ * @returns One group per `sidebarConfig` entry, in the same order.
+ */
+export async function getGroupedDocs(): Promise<DocsGroup[]> {
+  const collection = await getCollection('docs', ({ data }) => {
+    return data.draft !== true;
+  });
 
-  // Parse YAML-like frontmatter (simple key: value parsing)
-  for (const line of frontmatterStr.split('\n')) {
-    const titleMatch = line.match(/^title:\s*(.+)$/);
-    if (titleMatch) {
-      meta.title = titleMatch[1].replace(/^["']|["']$/g, '');
-    }
-    const descMatch = line.match(/^description:\s*(.+)$/);
-    if (descMatch) {
-      meta.description = descMatch[1].replace(/^["']|["']$/g, '');
-    }
+  // Index the entries by id so sidebar slugs can be resolved directly.
+  const atlas = new Map<string, CollectionEntry<'docs'>>();
+  for (const doc of collection) {
+    atlas.set(doc.id, doc);
   }
 
-  return { meta, body };
-}
-
-function cleanMdxContent(content: string): string {
-  // Remove MDX import statements at the start of the file (before any content)
-  // This preserves imports inside code blocks
-  content = content.replace(
-    /^(\s*import\s+.*?(?:from\s+['"].*?['"])?;?\s*\n)+/m,
-    '',
-  );
-
-  // Process Tabs components - extract TabItem contents
-  // Need to handle nested content carefully (code blocks with special chars)
-  content = content.replace(/<Tabs>[\s\S]*?<\/Tabs>/g, (match) => {
-    const results: string[] = [];
-
-    // Split by TabItem boundaries and extract content
-    const tabItemRegex =
-      /<TabItem[^>]*label="([^"]*)"[^>]*>([\s\S]*?)(?=<TabItem|<\/Tabs>)/g;
-
-    for (const [, label, tabContent] of match.matchAll(tabItemRegex)) {
-      // Clean up the content - remove closing </TabItem> if present
-      const cleanContent = tabContent.replace(/<\/TabItem>\s*$/, '').trim();
-      if (cleanContent) {
-        results.push(`**${label}:**\n${cleanContent}`);
+  const groups: DocsGroup[] = [];
+  for (const { label, items: sidebarItems } of sidebarConfig) {
+    const items: DocsGroup['items'] = [];
+    for (const sidebarItem of sidebarItems) {
+      if (
+        typeof sidebarItem === 'object' &&
+        sidebarItem != null &&
+        'slug' in sidebarItem
+      ) {
+        const doc = atlas.get(sidebarItem.slug);
+        if (doc != null && doc.body != null) {
+          items.push({
+            slug: doc.id,
+            body: doc.body,
+            title: doc.data.title,
+            description: doc.data.description,
+          });
+        } else {
+          // Also reached for drafts, which the collection filter removed.
+          // Log the slug: it is what was looked up and is known to exist.
+          console.warn(`Warning: Could not find ${sidebarItem.slug}`);
+        }
       }
     }
+    groups.push({ label, items });
+  }
 
-    return results.length > 0 ? results.join('\n\n') : '';
-  });
-
-  // Remove self-closing JSX/MDX components (like <Callout ... />)
-  content = content.replace(/<[A-Z][a-zA-Z]*\s+[^>]*\/>/g, '');
-
-  // Remove JSX components with content (non-greedy, for simple components)
-  // Handle Callout, Snippet, and other simple wrapper components
-  content = content.replace(
-    /<Callout[^>]*>([\s\S]*?)<\/Callout>/g,
-    (_, inner) => {
-      // Keep the content, just remove the wrapper
-      return inner.trim();
-    },
-  );
-
-  // Remove remaining JSX component tags (opening and closing)
-  content = content.replace(/<[A-Z][a-zA-Z]*[^>]*>/g, '');
-  content = content.replace(/<\/[A-Z][a-zA-Z]*>/g, '');
-
-  // Convert relative links to fully qualified URLs
-  // Matches markdown links like [text](/path) or [text](/path/)
-  content = content.replace(
-    /\[([^\]]+)\]\(\/([^)]*)\)/g,
-    (_, text, path) => `[${text}](https://docs.sprites.dev/${path})`,
-  );
-
-  // Clean up excessive blank lines
-  content = content.replace(/\n{4,}/g, '\n\n\n');
-
-  // Trim leading/trailing whitespace
-  content = content.trim();
-
-  return content;
-}
-
-function slugToUrl(slug: string): string {
-  const path = slug.replace(/\.mdx$/, '').replace(/^index$/, '');
-  return `https://docs.sprites.dev/${path}${path ? '/' : ''}`;
+  return groups;
 }
 
 export const GET: APIRoute = async () => {
@@ -160,34 +72,21 @@ Summary: https://docs.sprites.dev/llms.txt
 ---
 `);
 
-  let currentSection = '';
-
-  for (const docPath of docOrder) {
-    const fullPath = join(docsDir, docPath);
-
-    try {
-      const content = await fs.readFile(fullPath, 'utf-8');
-      const { meta, body } = extractFrontmatter(content);
+  const groups = await getGroupedDocs();
+  for (const { label, items } of groups) {
+    parts.push(`\n# ${label}\n`);
+    for (const { slug, title, description, body } of items) {
       const cleanedContent = cleanMdxContent(body);
 
-      // Add section header if we're entering a new section
-      if (sections[docPath] && sections[docPath] !== currentSection) {
-        currentSection = sections[docPath];
-        parts.push(`\n${currentSection}\n`);
-      }
-
       // Add document with title and URL
-      const url = slugToUrl(docPath);
-      parts.push(`## ${meta.title}
+      parts.push(`## ${title}
 
-URL: ${url}
-${meta.description ? `\n${meta.description}\n` : ''}
+URL: https://docs.sprites.dev/${slug}.md
+${description ? `\n${description}\n` : ''}
 ${cleanedContent}
 
 ---
 `);
-    } catch (error) {
-      console.warn(`Warning: Could not read ${docPath}:`, error);
     }
   }
 
diff --git a/src/pages/llms.txt.ts b/src/pages/llms.txt.ts