Canonry
diff --git a/CHANGELOG.md b/CHANGELOG.md
Lines changed: 10 additions & 0 deletions
diff --git a/README.md b/README.md
Lines changed: 9 additions & 2 deletions
diff --git a/package.json b/package.json
Lines changed: 1 addition & 1 deletion
diff --git a/skills/aeo/SKILL.md b/skills/aeo/SKILL.md
Lines changed: 5 additions & 1 deletion
diff --git a/src/analyzers/ai-readable-content.ts b/src/analyzers/ai-readable-content.ts
Lines changed: 36 additions & 0 deletions
diff --git a/src/analyzers/helpers.ts b/src/analyzers/helpers.ts
Lines changed: 208 additions & 0 deletions
@@ -1,5 +1,15 @@
 # Changelog
 
+## 1.10.0 (2026-05-23)
+
+### Added
+- **Sitemap auto-discovery fallback (#32).** When `/sitemap.xml` returns 404, `runSitemapAudit` and the auxiliary fetcher now also try `/sitemap-index.xml` (common on Astro / Next.js / Vercel) and, as a final fallback, parse the `Sitemap:` directive from `/robots.txt`. Previously sites that only published `sitemap-index.xml` got "Sitemap returned HTTP 404." with no audit coverage unless the user passed the explicit URL.
+- **Content-negotiation diagnostic (#34, #35).** When an auxiliary file (`/llms.txt`, `/llms-full.txt`, `/robots.txt`, `/sitemap.xml`) responds OK to the audit, the fetcher probes once with `Accept: text/markdown` to detect content-negotiation traps where Vercel / Astro / Starlight stacks 307-redirect `.txt` to a non-existent `.md` variant. Any non-2xx response from the markdown probe surfaces an actionable finding so users can fix the negotiation rule rather than the file. (Issue #34's original "UA filtering" hypothesis turned out to be the same content-negotiation root cause — `aeo-audit` already sends `Accept: */*` so it isn't directly affected, but the diagnostic catches the pattern that breaks downstream AI tools that prefer markdown.)
+- **Domain-aware schema recommendations (#33).** The `structured-data` and `schema-completeness` analyzers now detect the site category (SaaS / dev tools, e-commerce, local business, service business, blog/content) from JSON-LD, page text keywords, and outbound links, and recommend schemas that match. SaaS sites are no longer told to add `LocalBusiness` schema; the safe fallback when no category is detected is `Organization` instead of `LocalBusiness`.
+
+### Changed
+- New `AuxiliaryDiagnostics` field on `AuxiliaryResource` carries the content-negotiation signal. The `AiReadableContent` analyzer surfaces it as a finding and recommendation.
+
 ## 1.9.0 (2026-05-21)
 
 ### Added
 
@@ -145,7 +145,8 @@ Per-URL fetch errors don't abort the batch — each entry is reported with `stat
 Audit every page discovered from the site's sitemap with bounded concurrency (5 in flight):
 
 ```bash
-# Auto-discover /sitemap.xml
+# Auto-discover the sitemap (tries /sitemap.xml, then /sitemap-index.xml,
+# then the Sitemap: directive in /robots.txt)
 npx @ainyc/aeo-audit https://example.com --sitemap
 
 # Provide an explicit sitemap URL
@@ -158,8 +159,14 @@ npx @ainyc/aeo-audit https://example.com --sitemap --limit 50
 npx @ainyc/aeo-audit https://example.com --sitemap --top-issues
 ```
 
+Auto-discovery checks `/sitemap.xml` → `/sitemap-index.xml` → `Sitemap:` directives in `/robots.txt`. Astro / Next.js / Vercel sites that only publish `sitemap-index.xml` are now discovered without needing an explicit URL.
+
 When the sitemap has more URLs than `--limit`, the run audits the highest-priority pages and prints a notice to stderr listing how many were skipped and how to audit them all.
 
+### Auxiliary File Diagnostics
+
+When fetching `/llms.txt`, `/llms-full.txt`, `/robots.txt`, and `/sitemap.xml`, the audit runs a **content-negotiation probe** that surfaces as a finding on the **AI-Readable Content** factor: if a file returns OK to a bare request but a non-2xx response under `Accept: text/markdown`, the audit reports a content-negotiation trap. This catches Astro / Vercel / Starlight setups that redirect `.txt` → non-existent `.md` for markdown-accepting clients, which makes the file invisible to AI content-extraction tools — even though the file is "present" by every other measure.
+
 ### Flag Reference
 
 | Flag | Description |
@@ -169,7 +176,7 @@ When the sitemap has more URLs than `--limit`, the run audits the highest-priori
 | `--include-geo` | Include the optional geographic signals factor |
 | `--include-agent-skills` | Include the optional agent skill exposure factor |
 | `--lighthouse` | Include the optional Lighthouse factor (Performance + Accessibility + Best Practices, mobile strategy) via Google PageSpeed Insights. Single-URL only; cannot combine with `--sitemap` or `--detect-platform`. Adds ~15-30s. Set `PAGESPEED_API_KEY` env var to lift anonymous rate limits. |
-| `--sitemap [url]` | Audit all pages from the sitemap (auto-discovers `/sitemap.xml` or uses an explicit URL) |
+| `--sitemap [url]` | Audit all pages from the sitemap. Auto-discovery tries `/sitemap.xml`, then `/sitemap-index.xml`, then `Sitemap:` directives in `/robots.txt`. Pass an explicit URL to override. |
 | `--limit <n>` | Max pages to audit in sitemap mode (default 200, sorted by sitemap priority) |
 | `--top-issues` | In sitemap mode, skip per-page output and show only cross-cutting issues |
 | `--detect-platform` | Identify the platform/CMS/framework powering the site instead of running an audit |
 
@@ -1,6 +1,6 @@
 {
   "name": "@ainyc/aeo-audit",
-  "version": "1.9.0",
+  "version": "1.10.0",
   "description": "The most comprehensive open-source Answer Engine Optimization (AEO) audit tool. Scores websites across 16 ranking factors that determine AI citation.",
   "type": "module",
   "main": "./dist/index.js",
 
@@ -95,7 +95,7 @@ npx @ainyc/aeo-audit@1 "<url>" --sitemap --top-issues --format json
 ```
 
 Flags:
-- `--sitemap [url]` — auto-discover `/sitemap.xml` or provide an explicit URL
+- `--sitemap [url]` — auto-discover the sitemap (tries `/sitemap.xml`, then `/sitemap-index.xml`, then `Sitemap:` directives in `/robots.txt`) or provide an explicit URL
 - `--limit <n>` — cap pages audited (default 200, sorted by sitemap priority)
 - `--top-issues` — skip per-page output, show only cross-cutting patterns
 
@@ -107,6 +107,10 @@ Returns:
 - Aggregate score and grade
 - Prioritized fixes ranked by site-wide impact
 
+#### Auxiliary File Diagnostics
+
+When the audit fetches `/llms.txt`, `/llms-full.txt`, `/robots.txt`, and `/sitemap.xml`, it probes once with `Accept: text/markdown` to detect a **content-negotiation** trap: a file responds OK to a bare request but returns a non-2xx response when the client prefers markdown. This catches Astro / Vercel / Starlight setups that 307-redirect `.txt` → non-existent `.md` for markdown-accepting clients, making the file invisible to AI content-extraction tools even though the file exists. The diagnostic surfaces as a finding on the **AI-Readable Content** factor.
+
 ### Lighthouse Mode
 
 Use `--lighthouse` when the user wants page speed, accessibility, or best-practices scoring alongside the AEO factors. It calls Google PageSpeed Insights (mobile strategy) and aggregates Performance + Accessibility + Best Practices into a single optional factor (weight 8).
 
@@ -1,6 +1,38 @@
 import { clampScore, countWords } from './helpers.js'
 import type { AnalysisResult, AuditContext, AuxiliaryResource } from '../types.js'
 
+/**
+ * Report content-negotiation diagnostics for one auxiliary file.
+ *
+ * Appends one `info` finding and one recommendation (both arrays are mutated
+ * in place) when `auxEntry.diagnostics.contentNegotiation` is truthy; does
+ * nothing when the entry or its diagnostics are absent.
+ *
+ * @param fallbackLabel - Label used when `auxEntry.url` is empty or unparsable.
+ * @param auxEntry - Auxiliary resource that may carry diagnostics.
+ * @param findings - Findings list to append to.
+ * @param recommendations - Recommendation list to append to.
+ */
+function pushDiagnosticFindings(
+  fallbackLabel: string,
+  auxEntry: AuxiliaryResource | undefined,
+  findings: AnalysisResult['findings'],
+  recommendations: string[],
+): void {
+  if (!auxEntry) return
+  const negotiationFlagged = auxEntry.diagnostics?.contentNegotiation
+  if (!negotiationFlagged) return
+
+  // Name the path that was actually fetched (e.g. /sitemap-index.xml after a
+  // fallback from /sitemap.xml); unparsable URLs keep the caller's label.
+  let shownPath: string
+  try {
+    shownPath = auxEntry.url ? new URL(auxEntry.url).pathname : fallbackLabel
+  } catch {
+    shownPath = fallbackLabel
+  }
+
+  const message = `${shownPath} returns a non-2xx response when fetched with \`Accept: text/markdown\` — content negotiation hides it from AI content extraction tools that prefer markdown.`
+  const advice = `Serve ${shownPath} with the same body regardless of the \`Accept\` header (avoid redirecting .txt to a non-existent .md variant).`
+  findings.push({ type: 'info', message })
+  recommendations.push(advice)
+}
+
 function scoreAuxState(
   auxEntry: AuxiliaryResource | undefined,
   missingMessage: string,
@@ -47,6 +79,7 @@ export function analyzeAiReadableContent(context: AuditContext): AnalysisResult
     findings,
     recommendations,
   )
+  pushDiagnosticFindings('/llms.txt', auxiliary.llmsTxt, findings, recommendations)
 
   if (auxiliary.llmsTxt?.state === 'ok') {
     const wordCount = countWords(auxiliary.llmsTxt.body || '')
@@ -67,6 +100,7 @@ export function analyzeAiReadableContent(context: AuditContext): AnalysisResult
     findings,
     recommendations,
   )
+  pushDiagnosticFindings('/llms-full.txt', auxiliary.llmsFullTxt, findings, recommendations)
 
   if (auxiliary.llmsFullTxt?.state === 'ok') {
     const wordCount = countWords(auxiliary.llmsFullTxt.body || '')
@@ -91,6 +125,7 @@ export function analyzeAiReadableContent(context: AuditContext): AnalysisResult
     findings.push({ type: 'missing', message: '/robots.txt is missing.' })
     recommendations.push('Add a robots.txt file.')
   }
+  pushDiagnosticFindings('/robots.txt', auxiliary.robotsTxt, findings, recommendations)
 
   // Sitemap presence
   const sitemapState = auxiliary.sitemapXml?.state
@@ -104,6 +139,7 @@ export function analyzeAiReadableContent(context: AuditContext): AnalysisResult
     findings.push({ type: 'missing', message: '/sitemap.xml is missing.' })
     recommendations.push('Add a sitemap.xml file.')
   }
+  pushDiagnosticFindings('/sitemap.xml', auxiliary.sitemapXml, findings, recommendations)
 
   // HTML head link to llms.txt
   const llmsLink = context.$('link[href*="llms.txt"]').length > 0
 
@@ -326,3 +326,211 @@ export function domainFromUrl(rawUrl: string): string {
     return ''
   }
 }
+
+export type SiteCategory = // labels a page can be classified as by detectSiteCategory
+  | 'saas-devtools'
+  | 'ecommerce'
+  | 'local-business'
+  | 'service-business'
+  | 'blog-or-content'
+  | 'unknown' // returned when no category reaches the minimum score and margin
+
+export interface SiteCategoryDetection {
+  category: SiteCategory
+  /** Chosen category's share of all category points (0–1); detectSiteCategory sets 0 for 'unknown'. NOTE(review): no 0.4 cutoff is applied there — confirm whether callers enforce one. */
+  confidence: number
+  /** Recommended JSON-LD types for this category, in priority order. */
+  recommendedSchemas: string[]
+  /** Concrete signals that drove the classification; empty when the category is 'unknown'. */
+  evidence: string[]
+}
+
+interface CategorySignalAccumulator { // running tally for one candidate category inside detectSiteCategory
+  category: SiteCategory
+  score: number // sum of schema, keyword and link points collected so far
+  evidence: string[] // human-readable note for each signal that added points
+}
+
+const SAAS_DEVTOOLS_KEYWORDS = [ // lowercase phrases; each hit adds 1 point to 'saas-devtools' in detectSiteCategory
+  'api', 'sdk', 'documentation', 'docs', 'github', 'npm install', 'pip install', // NOTE(review): matched with String.includes, so 'api' also hits "rapid"
+  'yarn add', 'pnpm add', 'cli', 'developers', 'integration', 'webhook', // NOTE(review): 'cli' also matches inside "client" and "click"
+  'open source', 'opensource', 'authentication', 'oauth', 'api key',
+  'pricing', 'enterprise', 'self-host', 'self host', 'getting started',
+]
+
+const ECOMMERCE_KEYWORDS = [ // lowercase phrases; each hit adds 1.5 points to 'ecommerce'
+  'add to cart', 'add to bag', 'shopping cart', 'checkout', 'shop now',
+  'buy now', 'in stock', 'out of stock', 'free shipping', 'returns',
+  'product details', 'sku', 'add to wishlist', 'view product',
+]
+
+const LOCAL_BUSINESS_KEYWORDS = [ // lowercase phrases; each hit adds 1.5 points to 'local-business'
+  'opening hours', 'business hours', 'directions', 'visit us', 'our location',
+  'find us', 'reservations', 'book a table', 'menu', 'walk-ins welcome',
+  'serving the', 'in the heart of',
+]
+
+const SERVICE_BUSINESS_KEYWORDS = [ // lowercase phrases; each hit adds 1 point to 'service-business'
+  'book a call', 'book a consultation', 'get a quote', 'request a quote',
+  'free consultation', 'our services', 'case studies', 'client',
+  'testimonials', 'schedule a meeting', 'hire us',
+]
+
+const BLOG_KEYWORDS = [ // lowercase phrases; each hit adds 0.75 points to 'blog-or-content'
+  'recent posts', 'latest articles', 'read more', 'by author', 'published on',
+  'subscribe to newsletter', 'archives', 'categories', 'tags', 'comments',
+]
+
+/**
+ * Count how many of `keywords` occur in `text`.
+ *
+ * Matching is a case-insensitive substring test: `text` is lower-cased, the
+ * keywords are used as given, so they must already be lowercase.
+ *
+ * Fix: scanning previously stopped at the third match, which capped `count`
+ * at 3 along with the evidence list. `count` now covers every matching
+ * keyword; only `matched` is limited.
+ *
+ * @param text - Text to search.
+ * @param keywords - Lowercase phrases to look for.
+ * @returns `count`, the number of keywords found, and `matched`, the first
+ *   three of them in list order (kept short for evidence strings).
+ */
+function countKeywordHits(text: string, keywords: string[]): { count: number; matched: string[] } {
+  const MAX_EVIDENCE = 3
+  const lower = text.toLowerCase()
+  const hits = keywords.filter((keyword) => lower.includes(keyword))
+  return { count: hits.length, matched: hits.slice(0, MAX_EVIDENCE) }
+}
+
+/**
+ * Classify a page into a site category so schema recommendations fit the
+ * business (issue #33: SaaS/dev-tool sites should not be told to add
+ * LocalBusiness schema).
+ *
+ * Points are collected per category from three sources, in this order:
+ *   1. JSON-LD types already on the page (+4, or +2 for Service types).
+ *   2. Keyword hits in the page text, weighted per category.
+ *   3. GitHub repo links and npm/CDN hosts in the HTML (+1 each, SaaS only).
+ *
+ * The leader is returned only when it has at least 2 points and is at least
+ * 1 point ahead of the runner-up. Otherwise the result is 'unknown' with
+ * confidence 0, no evidence, and `['Organization']` as the only recommendation.
+ *
+ * @param context - Structured data, text content and HTML of the audited page.
+ * @returns The chosen category, its share of all points as `confidence`, the
+ *   schemas recommended for it, and the evidence gathered for that category.
+ */
+export function detectSiteCategory(
+  context: Pick<AuditContext, 'structuredData' | 'textContent' | 'html'>,
+): SiteCategoryDetection {
+  const schemaTypes = extractSchemaTypes(context.structuredData || [])
+  const pageText = context.textContent || ''
+  const rawHtml = context.html || ''
+
+  const categoryOrder: SiteCategory[] = [
+    'saas-devtools',
+    'ecommerce',
+    'local-business',
+    'service-business',
+    'blog-or-content',
+  ]
+  const tallies = categoryOrder.map(
+    (category): CategorySignalAccumulator => ({ category, score: 0, evidence: [] }),
+  )
+
+  // Add points and an evidence note to one category's tally.
+  const credit = (category: SiteCategory, points: number, note: string): void => {
+    const tally = tallies.find((entry) => entry.category === category)
+    if (!tally) return
+    tally.score += points
+    tally.evidence.push(note)
+  }
+
+  // 1. Schema-level signals: the site declared what it is.
+  const schemaRules: Array<{ category: SiteCategory; types: string[]; points: number; note: string }> = [
+    {
+      category: 'saas-devtools',
+      types: ['SoftwareApplication', 'WebApplication', 'MobileApplication'],
+      points: 4,
+      note: 'SoftwareApplication schema present',
+    },
+    {
+      category: 'ecommerce',
+      types: ['Product', 'Offer', 'AggregateOffer'],
+      points: 4,
+      note: 'Product/Offer schema present',
+    },
+    {
+      category: 'local-business',
+      types: ['LocalBusiness', 'Restaurant', 'Store', 'PostalAddress'],
+      points: 4,
+      note: 'LocalBusiness/PostalAddress schema present',
+    },
+    {
+      category: 'service-business',
+      types: ['Service', 'ProfessionalService'],
+      points: 2,
+      note: 'Service schema present',
+    },
+    {
+      category: 'blog-or-content',
+      types: ['Article', 'BlogPosting', 'NewsArticle'],
+      points: 4,
+      note: 'Article/BlogPosting schema present',
+    },
+  ]
+  for (const rule of schemaRules) {
+    if (rule.types.some((type) => schemaTypes.has(type))) {
+      credit(rule.category, rule.points, rule.note)
+    }
+  }
+
+  // 2. Text keyword signals. E-commerce and local phrases are specific (x1.5);
+  //    blog phrases overlap with many sites (x0.75).
+  const keywordRules: Array<{ category: SiteCategory; keywords: string[]; weight: number; label: string }> = [
+    { category: 'saas-devtools', keywords: SAAS_DEVTOOLS_KEYWORDS, weight: 1, label: 'SaaS/dev keywords' },
+    { category: 'ecommerce', keywords: ECOMMERCE_KEYWORDS, weight: 1.5, label: 'E-commerce keywords' },
+    { category: 'local-business', keywords: LOCAL_BUSINESS_KEYWORDS, weight: 1.5, label: 'Local-business keywords' },
+    { category: 'service-business', keywords: SERVICE_BUSINESS_KEYWORDS, weight: 1, label: 'Service keywords' },
+    { category: 'blog-or-content', keywords: BLOG_KEYWORDS, weight: 0.75, label: 'Blog/content keywords' },
+  ]
+  for (const rule of keywordRules) {
+    const hits = countKeywordHits(pageText, rule.keywords)
+    if (hits.count > 0) {
+      credit(rule.category, hits.count * rule.weight, `${rule.label}: ${hits.matched.join(', ')}`)
+    }
+  }
+
+  // 3. Outbound/script URL signals, which only support the SaaS category.
+  const saasLinkRules: Array<[RegExp, string]> = [
+    [/github\.com\/[A-Za-z0-9_.-]+\/[A-Za-z0-9_.-]+/i, 'GitHub repo link in HTML'],
+    [/(npmjs\.com|unpkg\.com|jsdelivr\.net|cdnjs\.cloudflare\.com)/i, 'npm/CDN registry reference'],
+  ]
+  for (const [pattern, note] of saasLinkRules) {
+    if (pattern.test(rawHtml)) {
+      credit('saas-devtools', 1, note)
+    }
+  }
+
+  // Rank a copy, highest score first, and commit only to a clear winner.
+  const ranked = [...tallies].sort((a, b) => b.score - a.score)
+  const leader = ranked[0]
+  const runnerUp = ranked[1]
+
+  const MIN_SCORE = 2 // need at least one strong schema signal or two keyword matches
+  const MARGIN = 1 // leader must beat the runner-up by at least one point
+
+  const clearWinner = leader.score >= MIN_SCORE && leader.score - runnerUp.score >= MARGIN
+  if (!clearWinner) {
+    return {
+      category: 'unknown',
+      confidence: 0,
+      recommendedSchemas: ['Organization'],
+      evidence: [],
+    }
+  }
+
+  const totalScore = ranked.reduce((sum, tally) => sum + tally.score, 0)
+  const confidence = totalScore > 0 ? Math.min(1, leader.score / Math.max(totalScore, 1)) : 0
+
+  return {
+    category: leader.category,
+    confidence,
+    recommendedSchemas: recommendedSchemasFor(leader.category),
+    evidence: leader.evidence,
+  }
+}
+
+/**
+ * Look up the JSON-LD types to recommend for a site category, in priority
+ * order. 'unknown' and any unlisted value get Organization, Article and
+ * FAQPage. A new array is returned on every call.
+ */
+function recommendedSchemasFor(category: SiteCategory): string[] {
+  // Built per call so callers never share (and cannot mutate) a common array.
+  const byCategory = new Map<string, string[]>([
+    ['saas-devtools', ['Organization', 'SoftwareApplication', 'FAQPage']],
+    ['ecommerce', ['Organization', 'Product', 'AggregateRating']],
+    ['local-business', ['LocalBusiness', 'Service', 'FAQPage']],
+    ['service-business', ['Organization', 'Service', 'FAQPage']],
+    ['blog-or-content', ['Organization', 'Article', 'BreadcrumbList']],
+  ])
+
+  // Organization is the safest broad default; Article and FAQPage are common
+  // follow-ups regardless of business type.
+  return byCategory.get(category) ?? ['Organization', 'Article', 'FAQPage']
+}
Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`{`
`2`	`2`	`"name": "@ainyc/aeo-audit",`
`3`		`- "version": "1.9.0",`
	`3`	`+ "version": "1.10.0",`
`4`	`4`	`"description": "The most comprehensive open-source Answer Engine Optimization (AEO) audit tool. Scores websites across 16 ranking factors that determine AI citation.",`
`5`	`5`	`"type": "module",`
`6`	`6`	`"main": "./dist/index.js",`