Skip to content

Commit 219a5d7

Browse files
authored
Fall back to root domain robots.txt when subdomain returns 404/403 (#33)
If subdomain robots.txt is missing or inaccessible (404, 410, 403), fetch root domain's robots.txt and use it; allow fetch when neither resolves.
1 parent ec15e53 commit 219a5d7

4 files changed

Lines changed: 128 additions & 7 deletions

File tree

src/lib/external/fetch.ts

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -91,13 +91,13 @@ export async function fetchRobotsPolicy(
9191
},
9292
})
9393

94-
// Missing robots.txt is treated as no policy restrictions.
95-
if (response.status === 404 || response.status === 410) {
96-
return { kind: "allow-all" }
94+
// Missing or inaccessible robots.txt — caller may try root domain or allow.
95+
if (response.status === 404 || response.status === 410 || response.status === 403) {
96+
return { kind: "not-found" }
9797
}
9898

99-
// Explicit access denial when robots cannot be read due to auth restrictions.
100-
if (response.status === 401 || response.status === 403) {
99+
// Explicit access denial when robots cannot be read due to auth.
100+
if (response.status === 401) {
101101
return { kind: "deny-all" }
102102
}
103103

src/lib/external/policy.ts

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,10 @@ async function isAllowedByRobotsTxt(targetUrl: URL): Promise<boolean> {
113113
if (policy.kind === "deny-all") {
114114
return false
115115
}
116-
return evaluateRobotsPolicy(policy.robotsText, targetUrl, EXTERNAL_DOC_USER_AGENT)
116+
if (policy.kind === "rules") {
117+
return evaluateRobotsPolicy(policy.robotsText, targetUrl, EXTERNAL_DOC_USER_AGENT)
118+
}
119+
return true
117120
}
118121

119122
function evaluateRobotsPolicy(robotsText: string, targetUrl: URL, userAgent: string): boolean {
@@ -135,6 +138,20 @@ function parseHostList(rawList: string | undefined): Set<string> {
135138
)
136139
}
137140

141+
function getRootOrigin(origin: string): string | null {
142+
try {
143+
const url = new URL(origin)
144+
const labels = url.hostname.toLowerCase().split(".")
145+
if (labels.length < 3) {
146+
return null
147+
}
148+
const rootHost = labels.slice(-2).join(".")
149+
return `${url.protocol}//${rootHost}`
150+
} catch {
151+
return null
152+
}
153+
}
154+
138155
async function getRobotsPolicy(origin: string): Promise<RobotsPolicyResult> {
139156
const now = Date.now()
140157
pruneExpiredRobotsPolicyEntries(now)
@@ -149,7 +166,23 @@ async function getRobotsPolicy(origin: string): Promise<RobotsPolicyResult> {
149166
return inFlight
150167
}
151168

152-
const request = fetchRobotsPolicy(origin, EXTERNAL_DOC_USER_AGENT)
169+
const request = (async (): Promise<RobotsPolicyResult> => {
170+
let policy = await fetchRobotsPolicy(origin, EXTERNAL_DOC_USER_AGENT)
171+
if (policy.kind === "not-found") {
172+
const rootOrigin = getRootOrigin(origin)
173+
if (rootOrigin && rootOrigin !== origin) {
174+
const rootPolicy = await fetchRobotsPolicy(rootOrigin, EXTERNAL_DOC_USER_AGENT)
175+
if (rootPolicy.kind !== "not-found") {
176+
policy = rootPolicy
177+
} else {
178+
policy = { kind: "allow-all" }
179+
}
180+
} else {
181+
policy = { kind: "allow-all" }
182+
}
183+
}
184+
return policy
185+
})()
153186
.then((policy) => {
154187
robotsPolicyCache.set(origin, {
155188
expiresAt: Date.now() + ROBOTS_CACHE_TTL_MS,

src/lib/external/types.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
export type RobotsPolicyResult =
22
| { kind: "allow-all" }
33
| { kind: "deny-all" }
4+
| { kind: "not-found" }
45
| { kind: "rules"; robotsText: string }
56

67
export interface ExternalPolicyEnv {

tests/external.test.ts

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,93 @@ describe("External Swift-DocC support", () => {
251251
expect(global.fetch).toHaveBeenCalledTimes(1)
252252
})
253253

254+
it("should fall back to root domain robots.txt when subdomain robots.txt is 403 or 404", async () => {
255+
// Real robots.txt from https://daily.co/robots.txt (subdomain reference-ios.daily.co returns 403, e.g. S3/CloudFront)
256+
const dailyCoRobotsTxt = `# *
257+
User-agent: *
258+
Allow: /
259+
260+
# Host
261+
Host: https://www.daily.co
262+
263+
# Sitemaps
264+
Sitemap: https://www.daily.co/sitemap.xml
265+
Sitemap: https://www.daily.co/resources/sitemap.xml
266+
Sitemap: https://www.daily.co/partners/sitemap.xml
267+
Sitemap: https://www.daily.co/videosaurus/sitemap.xml
268+
Sitemap: https://www.daily.co/blog/sitemap.xml
269+
Sitemap: https://docs.daily.co/sitemap.xml
270+
`
271+
272+
global.fetch = vi.fn().mockImplementation((url: string | URL) => {
273+
const u = typeof url === "string" ? url : url.toString()
274+
if (u === "https://reference-ios.daily.co/robots.txt") {
275+
return Promise.resolve(
276+
new Response(null, {
277+
status: 403,
278+
headers: { "content-type": "application/xml" },
279+
}),
280+
)
281+
}
282+
if (u === "https://daily.co/robots.txt") {
283+
return Promise.resolve(
284+
new Response(dailyCoRobotsTxt, {
285+
status: 200,
286+
headers: { "Content-Type": "text/plain" },
287+
}),
288+
)
289+
}
290+
return Promise.reject(new Error(`Unexpected fetch: ${u}`))
291+
})
292+
293+
await expect(
294+
assertExternalDocumentationAccess(
295+
new URL("https://reference-ios.daily.co/documentation/some/module"),
296+
{},
297+
),
298+
).resolves.toBeUndefined()
299+
300+
expect(global.fetch).toHaveBeenNthCalledWith(
301+
1,
302+
"https://reference-ios.daily.co/robots.txt",
303+
expect.objectContaining({
304+
headers: expect.objectContaining({ "User-Agent": EXTERNAL_DOC_USER_AGENT }),
305+
}),
306+
)
307+
expect(global.fetch).toHaveBeenNthCalledWith(
308+
2,
309+
"https://daily.co/robots.txt",
310+
expect.objectContaining({
311+
headers: expect.objectContaining({ "User-Agent": EXTERNAL_DOC_USER_AGENT }),
312+
}),
313+
)
314+
expect(global.fetch).toHaveBeenCalledTimes(2)
315+
})
316+
317+
it("should allow fetch when both subdomain and root domain robots.txt are 404", async () => {
318+
global.fetch = vi.fn().mockImplementation((url: string | URL) => {
319+
const u = typeof url === "string" ? url : url.toString()
320+
if (u === "https://docs.example.org/robots.txt" || u === "https://example.org/robots.txt") {
321+
return Promise.resolve(new Response(null, { status: 404 }))
322+
}
323+
return Promise.reject(new Error(`Unexpected fetch: ${u}`))
324+
})
325+
326+
await expect(
327+
assertExternalDocumentationAccess(
328+
new URL("https://docs.example.org/documentation/some/module"),
329+
{},
330+
),
331+
).resolves.toBeUndefined()
332+
333+
expect(global.fetch).toHaveBeenCalledWith(
334+
"https://docs.example.org/robots.txt",
335+
expect.any(Object),
336+
)
337+
expect(global.fetch).toHaveBeenCalledWith("https://example.org/robots.txt", expect.any(Object))
338+
expect(global.fetch).toHaveBeenCalledTimes(2)
339+
})
340+
254341
it("should build and fetch external DocC JSON", async () => {
255342
global.fetch = vi
256343
.fn()

0 commit comments

Comments (0)