Skip to content

Commit bbc5c16

Browse files
fix(scraper): check HTTP response status before extracting page content
All three scrapers (accessibility, cookie, privacy) now capture the response object from page.goto() and return scrape_error with the HTTP status code when the server responds with 4xx/5xx. Previously, error pages were silently sent to Bedrock, resulting in misleading "did not contain an accessibility statement" messages. When a deeper statement link responds with 4xx/5xx, the scraper now navigates back to the effective URL instead of extracting text from the error page. Also adds character-count logging after text extraction to aid future debugging of similar issues.
1 parent 17f6cb9 commit bbc5c16

3 files changed

Lines changed: 84 additions & 6 deletions

File tree

src/scraper/accessibilityScraper.ts

Lines changed: 28 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -92,29 +92,52 @@ export async function scrapeAccessibility(
9292
// session warm-up best-effort
9393
}
9494

95+
let response: import("playwright").Response | null = null;
9596
try {
96-
await page.goto(effectiveUrl, {
97+
response = await page.goto(effectiveUrl, {
9798
waitUntil: "networkidle",
9899
timeout: config.playwrightTimeout,
99100
});
100101
} catch {
101102
// proceed with whatever loaded
102103
}
103104

105+
if (response && response.status() >= 400) {
106+
return insertAccessibilityResult(pool, {
107+
...base,
108+
...empty,
109+
scrapeStatus: "scrape_error",
110+
errorMessage: `HTTP ${response.status()} fetching accessibility statement`,
111+
accessibilityStatementUrl: effectiveUrl,
112+
rawBedrockResponse: null,
113+
});
114+
}
115+
104116
const deeperLink = await findDeeperStatementLink(
105117
page,
106118
page.url(),
107119
"accessibility",
108120
);
109121
if (deeperLink) {
122+
let deeperResponse: import("playwright").Response | null = null;
110123
try {
111-
await page.goto(deeperLink.href, {
124+
deeperResponse = await page.goto(deeperLink.href, {
112125
waitUntil: "networkidle",
113126
timeout: config.playwrightTimeout,
114127
});
115128
} catch {
116129
// proceed with whatever loaded
117130
}
131+
if (deeperResponse && deeperResponse.status() >= 400) {
132+
try {
133+
await page.goto(effectiveUrl, {
134+
waitUntil: "networkidle",
135+
timeout: config.playwrightTimeout,
136+
});
137+
} catch {
138+
// fall back to whatever loaded
139+
}
140+
}
118141
}
119142

120143
const html = await page.content();
@@ -131,6 +154,9 @@ export async function scrapeAccessibility(
131154
}
132155

133156
const mainText = await extractMainText(page);
157+
console.log(
158+
`[accessibility] ${service.name}: extracted ${mainText.length} chars`,
159+
);
134160
const bedrockResult = await extractAccessibilityFromBedrock(
135161
mainText,
136162
config,

src/scraper/cookieScraper.ts

Lines changed: 28 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -77,29 +77,52 @@ export async function scrapeCookies(
7777
}
7878
});
7979

80+
let response: import("playwright").Response | null = null;
8081
try {
81-
await page.goto(effectiveUrl, {
82+
response = await page.goto(effectiveUrl, {
8283
waitUntil: "networkidle",
8384
timeout: config.playwrightTimeout,
8485
});
8586
} catch {
8687
// proceed with whatever loaded
8788
}
8889

90+
if (response && response.status() >= 400) {
91+
return insertCookieResult(pool, {
92+
...base,
93+
...empty,
94+
scrapeStatus: "scrape_error",
95+
errorMessage: `HTTP ${response.status()} fetching cookie policy`,
96+
cookiePolicyUrl: effectiveUrl,
97+
rawBedrockResponse: null,
98+
});
99+
}
100+
89101
const deeperLink = await findDeeperStatementLink(
90102
page,
91103
page.url(),
92104
"cookies",
93105
);
94106
if (deeperLink) {
107+
let deeperResponse: import("playwright").Response | null = null;
95108
try {
96-
await page.goto(deeperLink.href, {
109+
deeperResponse = await page.goto(deeperLink.href, {
97110
waitUntil: "networkidle",
98111
timeout: config.playwrightTimeout,
99112
});
100113
} catch {
101114
// proceed with whatever loaded
102115
}
116+
if (deeperResponse && deeperResponse.status() >= 400) {
117+
try {
118+
await page.goto(effectiveUrl, {
119+
waitUntil: "networkidle",
120+
timeout: config.playwrightTimeout,
121+
});
122+
} catch {
123+
// fall back to whatever loaded
124+
}
125+
}
103126
}
104127

105128
const html = await page.content();
@@ -116,6 +139,9 @@ export async function scrapeCookies(
116139
}
117140

118141
const fullText = await extractFullText(page);
142+
console.log(
143+
`[cookies] ${service.name}: extracted ${fullText.length} chars`,
144+
);
119145
const bedrockResult = await extractCookiesFromBedrock(
120146
fullText,
121147
setCookieHeaders,

src/scraper/privacyScraper.ts

Lines changed: 28 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -66,29 +66,52 @@ export async function scrapePrivacy(
6666
// session warm-up best-effort
6767
}
6868

69+
let response: import("playwright").Response | null = null;
6970
try {
70-
await page.goto(effectiveUrl, {
71+
response = await page.goto(effectiveUrl, {
7172
waitUntil: "networkidle",
7273
timeout: config.playwrightTimeout,
7374
});
7475
} catch {
7576
// proceed with whatever loaded
7677
}
7778

79+
if (response && response.status() >= 400) {
80+
return insertPrivacyResult(pool, {
81+
...base,
82+
...empty,
83+
scrapeStatus: "scrape_error",
84+
errorMessage: `HTTP ${response.status()} fetching privacy notice`,
85+
privacyPolicyUrl: effectiveUrl,
86+
rawBedrockResponse: null,
87+
});
88+
}
89+
7890
const deeperLink = await findDeeperStatementLink(
7991
page,
8092
page.url(),
8193
"privacy",
8294
);
8395
if (deeperLink) {
96+
let deeperResponse: import("playwright").Response | null = null;
8497
try {
85-
await page.goto(deeperLink.href, {
98+
deeperResponse = await page.goto(deeperLink.href, {
8699
waitUntil: "networkidle",
87100
timeout: config.playwrightTimeout,
88101
});
89102
} catch {
90103
// proceed with whatever loaded
91104
}
105+
if (deeperResponse && deeperResponse.status() >= 400) {
106+
try {
107+
await page.goto(effectiveUrl, {
108+
waitUntil: "networkidle",
109+
timeout: config.playwrightTimeout,
110+
});
111+
} catch {
112+
// fall back to whatever loaded
113+
}
114+
}
92115
}
93116

94117
const html = await page.content();
@@ -105,6 +128,9 @@ export async function scrapePrivacy(
105128
}
106129

107130
const mainText = await extractMainText(page);
131+
console.log(
132+
`[privacy] ${service.name}: extracted ${mainText.length} chars`,
133+
);
108134
const bedrockResult = await extractPrivacyFromBedrock(mainText, config);
109135

110136
if ("error" in bedrockResult) {

0 commit comments

Comments
 (0)