Skip to content

Commit 28d88b9

Browse files
committed
Drop cache info when a redirection took place
When a redirection takes place, Reffy follows the redirection logic (typically done through scripting) but the cache info it gets from Puppeteer remains for the initial URL. Reffy incorrectly assumed that info also applied to the final page. There's no easy way to retrieve the cache info of the final URL. Since that should only affect a spec that moves, and only until we detect and update the URL of the spec in browser-specs, this update simply drops the cache info to force Reffy to crawl the spec. Fix #1774.
1 parent 133a884 commit 28d88b9

File tree

3 files changed

+46
-4
lines changed

3 files changed

+46
-4
lines changed

src/lib/mock-server.js

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -169,13 +169,40 @@ mockAgent
169169
}
170170
});
171171

172+
mockAgent
173+
.get("https://www.w3.org")
174+
.intercept({ method: "GET", path: "/TR/iredirect/" })
175+
.reply(200,
176+
`<!DOCTYPE html><script>window.location = '/TR/recentlyupdated/';</script>`,
177+
{
178+
headers: {
179+
"Content-Type": "text/html",
180+
"Last-Modified": "Fri, 11 Feb 2022 00:00:42 GMT"
181+
}
182+
}
183+
);
184+
185+
mockAgent
186+
.get("https://www.w3.org")
187+
.intercept({ method: "GET", path: "/TR/recentlyupdated/" })
188+
.reply(200,
189+
`<html><title>Recently updated</title>
190+
<h1>Recently updated</h1>`,
191+
{
192+
headers: {
193+
"Content-Type": "text/html",
194+
"Last-Modified": (new Date()).toString()
195+
}
196+
}
197+
);
198+
172199
mockAgent
173200
.get("https://drafts.csswg.org")
174201
.intercept({ method: "GET", path: "/server-hiccup/" })
175202
.reply(200,
176203
`<html><title>Server hiccup</title>
177204
<h1> Index of Server Hiccup Module Level 42 </h1>`,
178-
{ header: { "Content-Type": "text/html" } })
205+
{ headers: { "Content-Type": "text/html" } })
179206
.persist();
180207

181208
/*nock.emitter.on('error', function (err) {

src/lib/specs-crawler.js

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -138,8 +138,15 @@ async function crawlSpec(spec, crawlOptions) {
138138
if (result.crawled) {
139139
spec.crawled = result.crawled;
140140
}
141-
if (result.crawlCacheInfo) {
142-
spec.crawlCacheInfo = result.crawlCacheInfo;
141+
if (result.crawlCacheInfo &&
142+
(result.crawled === spec.url ||
143+
result.crawled === spec.nightly?.url)) {
144+
// Note: Only keep cache info when no redirection took place. A
145+
// redirection happens when, e.g., a WICG spec moves to another group,
146+
// until we update the URL in browser-specs. It is done through
147+
// scripting: Reffy follows it, but the cache info it receives from
148+
// Puppeteer is for the initial URL, so we cannot rely on it.
149+
spec.crawlCacheInfo = result.crawlCacheInfo;
143150
}
144151
crawlOptions.modules.forEach(mod => {
145152
if (result[mod.property]) {

tests/crawl.js

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,6 @@ if (global.describe && describe instanceof Function) {
103103
assert.equal(results.results[0].title, 'A test spec');
104104
});
105105

106-
107106
it("skips processing and reuse fallback data when spec cache info indicates it has not changed", async () => {
108107
const url = "https://www.w3.org/TR/ididnotchange/";
109108
const fallback = path.resolve(scriptPath, 'crawl-cache.json');
@@ -116,6 +115,15 @@ if (global.describe && describe instanceof Function) {
116115
assert.equal(results[0].title, "Change is the only constant");
117116
assert.ifError(results[0].error);
118117
assert.equal(results[0].refs, "A useful list of refs");
118+
});
119+
120+
it("does not return cache info when a redirection took place", async () => {
121+
const url = "https://www.w3.org/TR/iredirect/";
122+
const results = await crawlSpecs(
123+
[{ url, nightly: { url } }],
124+
{ forceLocalFetch: true });
125+
assert.equal(results[0].title, "Recently updated");
126+
assert.equal(results[0].crawlCacheInfo, undefined);
119127
})
120128

121129
it("reports HTTP error statuses", async () => {

0 commit comments

Comments
 (0)