Skip to content

Commit 08219a7

Browse files
committed
Add support for externally-hosted DocC websites
1 parent 157b8d7 commit 08219a7

10 files changed

Lines changed: 972 additions & 40 deletions

File tree

README.md

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,41 @@ https://sosumi.ai/documentation/swift/array
2626
This works for all API reference docs,
2727
as well as Apple's [Human Interface Guidelines](https://developer.apple.com/design/human-interface-guidelines/) (HIG).
2828

29+
### External Swift-DocC sites
30+
31+
Sosumi can also proxy public non-Apple Swift-DocC pages using:
32+
33+
```
34+
https://sosumi.ai/external/https://<host>/documentation/<path>
35+
```
36+
37+
Example:
38+
39+
```
40+
https://sosumi.ai/external/https://reference-ios.daily.co/documentation/daily
41+
```
42+
43+
Under the hood, Sosumi resolves these to the source DocC JSON endpoint
44+
(`https://<host>/data/documentation/<path>.json`) and renders Markdown.
45+
46+
#### External access controls (self-hosting)
47+
48+
You can restrict which external hosts are allowed:
49+
50+
- `EXTERNAL_DOC_HOST_ALLOWLIST` - optional newline-delimited host allowlist
51+
- `EXTERNAL_DOC_HOST_BLOCKLIST` - optional newline-delimited host blocklist
52+
53+
If an allowlist is set, only listed hosts are permitted.
54+
Blocklist entries always deny access.
55+
56+
Sosumi also checks `robots.txt` for external hosts before fetching content.
57+
Site owners can block Sosumi by disallowing:
58+
59+
- User-agent: `sosumi-ai` (full UA: `sosumi-ai/1.0 (+https://sosumi.ai/#bot)`)
60+
- or wildcard `*`
61+
62+
See `/bot` for the crawler policy and contact details.
63+
2964
### MCP Integration
3065

3166
Sosumi's MCP server supports Streamable HTTP and Server-Sent Events (SSE) transport.
@@ -175,6 +210,9 @@ It does not crawl, spider, or bulk download;
175210
it does not attempt to bypass authentication or security;
176211
and it implements rate limiting to avoid imposing unreasonable load.
177212

213+
For external Swift-DocC hosts, access can be denied by `robots.txt`
214+
and opt-out response directives such as `X-Robots-Tag: noai`.
215+
178216
Content is fetched transiently and may be cached briefly to improve performance.
179217
No permanent archives are maintained.
180218
All copyrights and other rights in the underlying content remain with Apple Inc.

public/index.html

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -594,11 +594,33 @@ <h2>Examples</h2>
594594
<li><a href="/documentation/swift">Swift</a></li>
595595
<li><a href="/documentation/swiftui">SwiftUI</a></li>
596596
<li><a href="/design/human-interface-guidelines/">Human Interface Guidelines</a></li>
597+
<li><a
598+
href="/external/https://apple.github.io/swift-argument-parser/documentation/argumentparser/">Swift
599+
Argument Parser (external)</a></li>
597600
</ul>
598601
</section>
599602
</section>
600603

601604

605+
<section id="bot">
606+
<header>
607+
<h2>Bot & Crawling Policy</h2>
608+
<p>
609+
Sosumi makes automated fetches with the user agent
610+
<code>sosumi-ai/1.0 (+https://sosumi.ai/#bot)</code>.
611+
</p>
612+
<p>
613+
For external Swift-DocC hosts, sosumi.ai honors
614+
<code>robots.txt</code> rules and opt-out response directives such as
615+
<code>X-Robots-Tag: noai</code>.
616+
</p>
617+
<p>
618+
Questions or issues: <a href="mailto:info@sosumi.ai">info@sosumi.ai</a>
619+
</p>
620+
</header>
621+
</section>
622+
623+
602624

603625
<section id="mcp">
604626
<header>

src/index.ts

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,13 @@ import { HTTPException } from "hono/http-exception"
66
import { trimTrailingSlash } from "hono/trailing-slash"
77

88
import { NotFoundError } from "./lib/fetch"
9+
import {
10+
assertExternalDocumentationAccess,
11+
extractExternalDocumentationBasePath,
12+
ExternalAccessError,
13+
fetchExternalDocCJSON,
14+
validateExternalDocumentationUrl,
15+
} from "./lib/external"
916
import {
1017
fetchHIGPageData,
1118
fetchHIGTableOfContents,
@@ -19,6 +26,8 @@ import { generateAppleDocUrl, isValidAppleDocUrl, normalizeDocumentationPath } f
1926
/**
 * Bindings and variables the Worker receives through `c.env`.
 */
interface Env {
  /** `Fetcher` binding (presumably the static-asset binding; confirm in the Worker config). */
  ASSETS: Fetcher
  /** Runtime environment name. */
  NODE_ENV: string
  /** Optional newline-delimited list of external hosts that are allowed. */
  EXTERNAL_DOC_HOST_ALLOWLIST?: string
  /** Optional newline-delimited list of external hosts that are always denied. */
  EXTERNAL_DOC_HOST_BLOCKLIST?: string
}
2332

2433
const app = new Hono<{ Bindings: Env }>()
@@ -63,6 +72,8 @@ app.all("/mcp", async (c) => {
6372
return transport.handleRequest(c)
6473
})
6574

75+
// `/bot` is a short alias for the "bot" section of the landing page.
app.get("/bot", (c) => {
  return c.redirect("/#bot", 302)
})
76+
6677
app.get("/documentation/*", async (c) => {
6778
const path = c.req.path
6879

@@ -135,6 +146,50 @@ This service only works with Apple Developer documentation URLs:
135146
})
136147
})
137148

149+
/**
 * Renders an externally hosted Swift-DocC page as Markdown.
 *
 * Everything after `/external/` in the request path is treated as the target
 * documentation URL. The target is validated, checked against the external
 * access policy, fetched as DocC JSON and rendered. Clients that send
 * `Accept: application/json` receive `{ url, content }` instead of raw Markdown.
 *
 * Throws `HTTPException` 400 when the path cannot be percent-decoded and 502
 * when the rendered page is too short to be useful. Errors raised by the
 * validation, policy and fetch helpers propagate to the app error handler.
 */
app.get("/external/*", async (c) => {
  let rawTarget: string
  try {
    rawTarget = decodeURIComponent(c.req.path.replace("/external/", ""))
  } catch {
    // decodeURIComponent throws URIError on malformed escapes (e.g. a bare "%").
    // That is a client mistake, so answer 400 rather than letting it surface
    // as an unexpected server error.
    throw new HTTPException(400, {
      message: "The external documentation URL contains malformed percent-encoding.",
    })
  }

  const targetUrl = validateExternalDocumentationUrl(rawTarget)
  await assertExternalDocumentationAccess(targetUrl, c.env)

  const jsonData = await fetchExternalDocCJSON(targetUrl)
  const externalBasePath = extractExternalDocumentationBasePath(targetUrl)
  const markdown = await renderFromJSON(jsonData, targetUrl.toString(), {
    // Origin plus the path prefix in front of `/documentation`, so sites hosted
    // under a sub-path (e.g. GitHub Pages project sites) keep that prefix.
    externalOrigin: `${targetUrl.origin}${externalBasePath}`,
  })

  if (!markdown || markdown.trim().length < 100) {
    throw new HTTPException(502, {
      message:
        "The external documentation page loaded but contained insufficient content. This may be a temporary issue with the page.",
    })
  }

  const headers = {
    "Content-Type": "text/markdown; charset=utf-8",
    "Content-Location": targetUrl.toString(),
    "Cache-Control": "public, max-age=3600, s-maxage=86400",
    // NOTE(review): 16 base64 characters cover only the first 12 bytes of the
    // Markdown, so pages sharing a prefix share an ETag.
    ETag: `"${Buffer.from(markdown).toString("base64").slice(0, 16)}"`,
    "Last-Modified": new Date().toUTCString(),
  }

  if (c.req.header("Accept")?.includes("application/json")) {
    return c.json(
      {
        url: targetUrl.toString(),
        content: markdown,
      },
      200,
      { ...headers, "Content-Type": "application/json; charset=utf-8" },
    )
  }

  return c.text(markdown, 200, headers)
})
192+
138193
app.get("/design/human-interface-guidelines", async (c) => {
139194
// Handle the table of contents for HIG
140195
const tocData = await fetchHIGTableOfContents()
@@ -275,6 +330,36 @@ The requested Apple Developer documentation page does not exist.
275330
)
276331
}
277332

333+
// Policy and validation failures from the external DocC pipeline: report them
// with the error's own status (400 or 403) and list the supported opt-out
// controls. The robots token shown here matches the documented one,
// `sosumi-ai` (the product token of `sosumi-ai/1.0 (+https://sosumi.ai/#bot)`).
if (err instanceof ExternalAccessError) {
  const status = err.status as 400 | 403

  if (c.req.header("Accept")?.includes("application/json")) {
    return c.json(
      {
        error: "External documentation access denied",
        message: err.message,
      },
      { status },
    )
  }

  return c.text(
    `# External Documentation Access Denied

${err.message}

## Opt-out controls supported

- \`robots.txt\` disallow for \`sosumi-ai\` (or \`*\`)
- \`X-Robots-Tag\` response directives such as \`noai\`, \`noimageai\`, \`noindex\`
- Local operator host controls: \`EXTERNAL_DOC_HOST_ALLOWLIST\`, \`EXTERNAL_DOC_HOST_BLOCKLIST\`

---
*[sosumi.ai](https://sosumi.ai) - Making docs AI-readable*`,
    status,
    { "Content-Type": "text/markdown; charset=utf-8" },
  )
}
362+
278363
// Handle unexpected errors
279364
const accept = c.req.header("Accept")
280365
if (accept?.includes("application/json")) {

src/lib/external/fetch.ts

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
import { NotFoundError } from "../fetch"
2+
import type { AppleDocJSON } from "../types"
3+
import { EXTERNAL_DOC_USER_AGENT, ExternalAccessError } from "./policy"
4+
5+
const RESTRICTIVE_X_ROBOTS_TAGS = ["none", "noindex", "noai", "noimageai"] as const
6+
7+
/**
 * Returns the path prefix that precedes the first `/documentation` segment of
 * a DocC page URL, ignoring trailing slashes.
 *
 * @param sourceUrl - URL of the external documentation page.
 * @returns The prefix (e.g. `/swift-argument-parser`), or `""` when the
 *   documentation lives at the root of the host.
 * @throws ExternalAccessError (400) when the path has no `/documentation`
 *   segment.
 */
export function extractExternalDocumentationBasePath(sourceUrl: URL): string {
  const trimmedPath = sourceUrl.pathname.replace(/\/+$/, "")
  // Group 1 is lazy, so it stops at the first `/documentation` that is either
  // the end of the path or followed by `/`.
  const parts = /^(.*?)(\/documentation(?:\/.*)?)$/.exec(trimmedPath)
  if (parts !== null) {
    return parts[1]
  }

  throw new ExternalAccessError(
    "External URL must point to a Swift-DocC documentation path.",
    400,
  )
}
}
19+
20+
/**
 * Maps a DocC page URL to the URL of its JSON payload:
 * `<origin><base>/documentation/<path>` becomes
 * `<origin><base>/data/documentation/<path>.json`.
 *
 * @param sourceUrl - URL of the external documentation page.
 * @returns The JSON endpoint on the same origin. Query string and fragment of
 *   `sourceUrl` are not carried over.
 * @throws ExternalAccessError (400) when the path has no `/documentation`
 *   segment (raised by `extractExternalDocumentationBasePath`).
 */
export function buildExternalDocCJsonUrl(sourceUrl: URL): URL {
  const basePath = extractExternalDocumentationBasePath(sourceUrl)
  const trimmedPath = sourceUrl.pathname.replace(/\/+$/, "")

  // Whatever follows the base prefix is the `/documentation/...` part.
  let docPath = trimmedPath.slice(basePath.length)
  if (!docPath.endsWith(".json")) {
    docPath += ".json"
  }

  return new URL(basePath + "/data" + docPath, sourceUrl.origin)
}
28+
29+
/**
 * Fetches the DocC JSON payload for an external documentation page.
 *
 * Redirects are followed manually, and only while they stay on the host of the
 * JSON URL derived from `sourceUrl`. With automatic following, a redirect
 * could send the request to a host that was never evaluated by the caller's
 * access checks.
 *
 * @param sourceUrl - URL of the external documentation page.
 * @returns The parsed JSON body, typed as `AppleDocJSON` (not validated here).
 * @throws ExternalAccessError (403) when a response carries a restrictive
 *   `X-Robots-Tag`, or when the host redirects to a different host.
 * @throws NotFoundError when the JSON endpoint answers 404.
 * @throws Error for any other unsuccessful response, an unusable redirect
 *   target, or too many redirects.
 */
export async function fetchExternalDocCJSON(sourceUrl: URL): Promise<AppleDocJSON> {
  const jsonUrl = buildExternalDocCJsonUrl(sourceUrl)
  const maxRedirects = 5
  let requestUrl = jsonUrl

  for (let redirectCount = 0; redirectCount <= maxRedirects; redirectCount++) {
    const response = await fetch(requestUrl.toString(), {
      headers: {
        "User-Agent": EXTERNAL_DOC_USER_AGENT,
        Accept: "application/json",
      },
      // Surface 3xx responses so each hop can be checked before it is followed.
      redirect: "manual",
    })

    if (containsRestrictiveXRobotsTag(response.headers.get("x-robots-tag"))) {
      throw new ExternalAccessError(
        "External host denied AI/doc access via X-Robots-Tag response header.",
        403,
      )
    }

    const location = response.headers.get("location")
    if (location !== null && response.status >= 300 && response.status < 400) {
      // The redirect body is not needed; release it before the next request.
      await response.body?.cancel()

      let redirectUrl: URL
      try {
        redirectUrl = new URL(location, requestUrl)
      } catch {
        throw new Error("Failed to fetch external DocC JSON: invalid redirect target")
      }

      const staysOnHost = redirectUrl.host === jsonUrl.host
      const protocolAllowed =
        redirectUrl.protocol === jsonUrl.protocol || redirectUrl.protocol === "https:"
      if (!staysOnHost || !protocolAllowed) {
        throw new ExternalAccessError(
          `External host redirected to a different origin (${redirectUrl.origin}). Request that URL directly instead.`,
          403,
        )
      }

      requestUrl = redirectUrl
      continue
    }

    if (!response.ok) {
      if (response.status === 404) {
        throw new NotFoundError(`External documentation page not found at ${requestUrl.toString()}`)
      }

      throw new Error(`Failed to fetch external DocC JSON: ${response.status} ${response.statusText}`)
    }

    return (await response.json()) as AppleDocJSON
  }

  throw new Error("Failed to fetch external DocC JSON: too many redirects")
}
56+
57+
/**
 * Reports whether an `X-Robots-Tag` header value opts out of access.
 *
 * The value is split on commas and compared case-insensitively against
 * `RESTRICTIVE_X_ROBOTS_TAGS`. Two token forms are recognised:
 *
 * - a bare directive, e.g. `noai`;
 * - a directive scoped to this bot, e.g. `sosumi-ai: noai`, where the agent
 *   name is the product token of `EXTERNAL_DOC_USER_AGENT` (the text before
 *   the first `/`).
 *
 * Directives scoped to other agents (`googlebot: noindex`) and valued
 * directives (`unavailable_after: <date>`) are ignored.
 *
 * @param headerValue - Raw header value, or `null` when the header is absent.
 * @returns `true` when a restrictive directive applies to this bot.
 */
function containsRestrictiveXRobotsTag(headerValue: string | null): boolean {
  if (!headerValue) {
    return false
  }

  const restrictive = new Set<string>(RESTRICTIVE_X_ROBOTS_TAGS)
  const ownAgent = (EXTERNAL_DOC_USER_AGENT.split("/")[0] ?? "").trim().toLowerCase()

  return headerValue
    .toLowerCase()
    .split(",")
    .some((rawToken) => {
      const token = rawToken.trim()
      if (restrictive.has(token)) {
        return true
      }

      // "<agent>: <directive>" form: honour it only when addressed to this bot.
      const separator = token.indexOf(":")
      if (separator === -1 || ownAgent === "") {
        return false
      }
      const agent = token.slice(0, separator).trim()
      const directive = token.slice(separator + 1).trim()
      return agent === ownAgent && restrictive.has(directive)
    })
}

src/lib/external/index.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
export * from "./fetch"
2+
export * from "./policy"

0 commit comments

Comments
 (0)