Skip to content

Commit 8d403f0

Browse files
committed
SITES-40741: Skip pages with status codes 4xx from the scraper
1 parent 2c3652c commit 8d403f0

File tree

2 files changed

+117
-0
lines changed

2 files changed

+117
-0
lines changed

src/metatags/handler.js

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,14 @@ export async function fetchAndProcessPageObject(s3Client, bucketName, url, key,
118118
return null;
119119
}
120120

121+
// Skip 4xx (client error, including 404) pages when statusCode is present (new scrapes); old scrapes have no statusCode
122+
if (object.statusCode !== undefined && object.statusCode !== null) {
123+
if (object.statusCode >= 400 && object.statusCode < 500) {
124+
log.info(`[metatags] Skipping page with HTTP ${object.statusCode} for ${url}`);
125+
return null;
126+
}
127+
}
128+
121129
// Check for error pages by content
122130
const { tags } = object.scrapeResult;
123131
const title = normalizeTagValue(tags.title);

test/metatags/metatags.test.js

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -928,6 +928,115 @@ describe('Meta Tags', () => {
928928
});
929929
expect(logStub.error).to.not.have.been.called;
930930
});
931+
932+
it('should skip page when statusCode is 404 (new scrape with HTTP status)', async () => {
933+
const mockScrapeResult = {
934+
finalUrl: 'http://example.com/missing',
935+
statusCode: 404,
936+
scrapeResult: {
937+
tags: {
938+
title: 'Page Not Found',
939+
description: 'The page you requested was not found',
940+
h1: ['Not Found'],
941+
},
942+
rawBody: '<html><body><h1>Not Found</h1><p>Content here to exceed 300 chars.</p></body></html>'.repeat(5),
943+
},
944+
};
945+
946+
s3ClientStub.send.resolves({
947+
Body: {
948+
transformToString: () => JSON.stringify(mockScrapeResult),
949+
},
950+
ContentType: 'application/json',
951+
});
952+
953+
const result = await fetchAndProcessPageObject(
954+
s3ClientStub,
955+
'test-bucket',
956+
'http://example.com/missing',
957+
'scrapes/site-id/missing/scrape.json',
958+
logStub,
959+
);
960+
961+
expect(result).to.be.null;
962+
expect(logStub.info).to.have.been.calledWith(
963+
'[metatags] Skipping page with HTTP 404 for http://example.com/missing',
964+
);
965+
});
966+
967+
it('should skip page when statusCode is 4xx (e.g. 403)', async () => {
968+
const mockScrapeResult = {
969+
finalUrl: 'http://example.com/forbidden',
970+
statusCode: 403,
971+
scrapeResult: {
972+
tags: {
973+
title: 'Access Denied',
974+
description: 'You do not have permission',
975+
h1: ['Forbidden'],
976+
},
977+
rawBody: 'A'.repeat(400),
978+
},
979+
};
980+
981+
s3ClientStub.send.resolves({
982+
Body: {
983+
transformToString: () => JSON.stringify(mockScrapeResult),
984+
},
985+
ContentType: 'application/json',
986+
});
987+
988+
const result = await fetchAndProcessPageObject(
989+
s3ClientStub,
990+
'test-bucket',
991+
'http://example.com/forbidden',
992+
'scrapes/site-id/forbidden/scrape.json',
993+
logStub,
994+
);
995+
996+
expect(result).to.be.null;
997+
expect(logStub.info).to.have.been.calledWith(
998+
'[metatags] Skipping page with HTTP 403 for http://example.com/forbidden',
999+
);
1000+
});
1001+
1002+
it('should process page when statusCode is 200 (new scrape)', async () => {
1003+
const mockScrapeResult = {
1004+
finalUrl: 'http://example.com/ok',
1005+
statusCode: 200,
1006+
scrapeResult: {
1007+
tags: {
1008+
title: 'OK Page',
1009+
description: 'A valid page',
1010+
h1: ['OK'],
1011+
},
1012+
rawBody: 'A'.repeat(300),
1013+
},
1014+
};
1015+
1016+
s3ClientStub.send.resolves({
1017+
Body: {
1018+
transformToString: () => JSON.stringify(mockScrapeResult),
1019+
},
1020+
ContentType: 'application/json',
1021+
});
1022+
1023+
const result = await fetchAndProcessPageObject(
1024+
s3ClientStub,
1025+
'test-bucket',
1026+
'http://example.com/ok',
1027+
'scrapes/site-id/ok/scrape.json',
1028+
logStub,
1029+
);
1030+
1031+
expect(result).to.deep.equal({
1032+
'/ok': {
1033+
title: 'OK Page',
1034+
description: 'A valid page',
1035+
h1: ['OK'],
1036+
s3key: 'scrapes/site-id/ok/scrape.json',
1037+
},
1038+
});
1039+
});
9311040
});
9321041

9331042
describe('opportunities handler method', () => {

0 commit comments

Comments
 (0)