Skip to content

Commit d4a6953

Browse files
committed
canonical: code review changes
1 parent 2b26dc7 commit d4a6953

File tree

2 files changed

+82
-58
lines changed

2 files changed

+82
-58
lines changed

src/canonical/constants.js

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,12 @@ export const CANONICAL_CHECKS = Object.freeze({
1717
explanation: 'The canonical tag is missing, which can lead to duplicate content issues and negatively affect SEO rankings.',
1818
suggestion: 'Add a canonical tag to the head section',
1919
},
20+
CANONICAL_TAG_NO_HREF: {
21+
check: 'canonical-tag-no-href',
22+
title: 'Canonical Tag Missing href',
23+
explanation: 'The canonical tag exists but has no href attribute, making it ineffective for SEO.',
24+
suggestion: 'Add an href attribute to the canonical tag pointing to the preferred URL of this page.',
25+
},
2026
CANONICAL_TAG_MULTIPLE: {
2127
check: 'canonical-tag-multiple',
2228
title: 'Multiple Canonical Tags',

src/canonical/handler.js

Lines changed: 76 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,9 @@
1010
* governing permissions and limitations under the License.
1111
*/
1212

13-
import { composeBaseURL, tracingFetch as fetch } from '@adobe/spacecat-shared-utils';
13+
import {
14+
composeBaseURL, hasText, isString, tracingFetch as fetch,
15+
} from '@adobe/spacecat-shared-utils';
1416
import { Audit } from '@adobe/spacecat-shared-data-access';
1517
import { retrievePageAuthentication } from '@adobe/spacecat-shared-ims-client';
1618

@@ -34,6 +36,13 @@ import { isAuthUrl } from '../support/utils.js';
3436
const auditType = Audit.AUDIT_TYPES.CANONICAL;
3537
const { AUDIT_STEP_DESTINATIONS } = Audit;
3638

39+
const REACHABILITY_CHECKS = [
40+
CANONICAL_CHECKS.CANONICAL_URL_STATUS_OK.check,
41+
CANONICAL_CHECKS.CANONICAL_URL_NO_REDIRECT.check,
42+
];
43+
44+
const EMPTY_CANONICAL_RESULT = Object.freeze({ url: null, checks: [] });
45+
3746
/**
3847
* Step 1: Import top pages (used in multi-step audit)
3948
*/
@@ -139,26 +148,24 @@ export function validateCanonicalFormat(canonicalUrl, baseUrl, log, isPreview =
139148
}
140149

141150
// Check if the canonical URL is fully uppercased
142-
if (canonicalUrl) {
143-
if (typeof canonicalUrl === 'string') {
144-
const isAllCaps = canonicalUrl === canonicalUrl.toUpperCase();
145-
if (isAllCaps) {
146-
checks.push({
147-
check: CANONICAL_CHECKS.CANONICAL_URL_LOWERCASED.check,
148-
success: false,
149-
explanation: CANONICAL_CHECKS.CANONICAL_URL_LOWERCASED.explanation,
150-
});
151-
} else {
152-
checks.push({
153-
check: CANONICAL_CHECKS.CANONICAL_URL_LOWERCASED.check,
154-
success: true,
155-
});
156-
}
151+
if (isString(canonicalUrl)) {
152+
const isAllCaps = canonicalUrl === canonicalUrl.toUpperCase();
153+
if (isAllCaps) {
154+
checks.push({
155+
check: CANONICAL_CHECKS.CANONICAL_URL_LOWERCASED.check,
156+
success: false,
157+
explanation: CANONICAL_CHECKS.CANONICAL_URL_LOWERCASED.explanation,
158+
});
157159
} else {
158-
log.error(`[canonical] Canonical URL is not a string: ${typeof canonicalUrl}`);
159-
// Skip adding check for invalid type - validation errors should only be logged
160-
return checks;
160+
checks.push({
161+
check: CANONICAL_CHECKS.CANONICAL_URL_LOWERCASED.check,
162+
success: true,
163+
});
161164
}
165+
} else if (canonicalUrl) {
166+
log.error(`[canonical] Canonical URL is not a string: ${typeof canonicalUrl}`);
167+
// Skip adding check for invalid type - validation errors should only be logged
168+
return checks;
162169
}
163170

164171
// Check if the canonical URL is absolute (case-insensitive protocol check)
@@ -174,18 +181,18 @@ export function validateCanonicalFormat(canonicalUrl, baseUrl, log, isPreview =
174181
check: CANONICAL_CHECKS.CANONICAL_URL_ABSOLUTE.check,
175182
success: true,
176183
});
177-
let url;
184+
let canonical;
178185

179186
try {
180-
url = new URL(canonicalUrl);
187+
canonical = new URL(canonicalUrl);
181188
} catch {
182189
log.error(`[canonical] Invalid canonical URL: ${canonicalUrl}`);
183190
// Skip adding check for invalid URL - validation errors should only be logged
184191
return checks;
185192
}
186193

187194
// Check if the canonical URL has the same protocol as the base URL
188-
if (!url.href.startsWith(base.protocol)) {
195+
if (canonical.protocol !== base.protocol) {
189196
checks.push({
190197
check: CANONICAL_CHECKS.CANONICAL_URL_SAME_PROTOCOL.check,
191198
success: false,
@@ -200,7 +207,7 @@ export function validateCanonicalFormat(canonicalUrl, baseUrl, log, isPreview =
200207

201208
// Check if the canonical URL has the same domain as the base URL
202209
if (!isPreview) {
203-
if (composeBaseURL(url.hostname) !== composeBaseURL(base.hostname)) {
210+
if (composeBaseURL(canonical.hostname) !== composeBaseURL(base.hostname)) {
204211
checks.push({
205212
check: CANONICAL_CHECKS.CANONICAL_URL_SAME_DOMAIN.check,
206213
success: false,
@@ -325,6 +332,24 @@ export async function validateCanonicalRecursively(
325332
return checks;
326333
}
327334

335+
/**
336+
* Checks whether two URLs are self-referencing by comparing their normalized pathnames.
337+
*
338+
* @param {string} url1 - First URL to compare.
339+
* @param {string} url2 - Second URL to compare.
340+
* @returns {boolean} True if both URLs resolve to the same pathname (case-insensitive).
341+
*/
342+
export function isSelfReferencing(url1, url2) {
343+
const normalizePath = (u) => {
344+
try {
345+
return new URL(u).pathname.toLowerCase();
346+
} catch {
347+
return u.toLowerCase();
348+
}
349+
};
350+
return normalizePath(url1) === normalizePath(url2);
351+
}
352+
328353
/**
329354
* Generates a suggestion for fixing a canonical issue based on the check type.
330355
*
@@ -391,19 +416,26 @@ export async function processScrapedContent(context) {
391416
return null;
392417
}
393418

394-
if (!scrapedObject?.scrapeResult?.canonical) {
395-
log.warn(`[canonical] No canonical metadata in S3 object: ${key}`);
396-
return null;
397-
}
398-
399-
const url = scrapedObject.url || scrapedObject.finalUrl;
419+
const { url } = scrapedObject;
400420
if (!url) {
401421
log.warn(`[canonical] No URL found in S3 object: ${key}`);
402422
return null;
403423
}
404424

405425
const finalUrl = scrapedObject.finalUrl || url;
406426

427+
if (!scrapedObject?.scrapeResult?.canonical) {
428+
log.warn(`[canonical] No canonical metadata in S3 object: ${key}`);
429+
return {
430+
url,
431+
checks: [{
432+
check: CANONICAL_CHECKS.CANONICAL_TAG_MISSING.check,
433+
success: false,
434+
explanation: CANONICAL_CHECKS.CANONICAL_TAG_MISSING.explanation,
435+
}],
436+
};
437+
}
438+
407439
// Filter out scraped pages that redirected to auth/login pages or PDFs
408440
// This prevents false positives when a legitimate page redirects to login
409441
if (isAuthUrl(finalUrl)) {
@@ -415,20 +447,25 @@ export async function processScrapedContent(context) {
415447
return null;
416448
}
417449

418-
const isPreview = isPreviewPage(baseURL);
419-
420450
// Use canonical metadata already extracted by the scraper (Puppeteer)
421451
const canonicalMetadata = scrapedObject.scrapeResult.canonical;
422452
const canonicalUrl = canonicalMetadata.href || null;
423453
const canonicalTagChecks = [];
424454

425455
// Check if canonical tag exists
426-
if (!canonicalMetadata.exists || !canonicalUrl) {
456+
if (!canonicalMetadata.exists) {
427457
canonicalTagChecks.push({
428458
check: CANONICAL_CHECKS.CANONICAL_TAG_MISSING.check,
429459
success: false,
430460
explanation: CANONICAL_CHECKS.CANONICAL_TAG_MISSING.explanation,
431461
});
462+
} else if (!canonicalUrl) {
463+
// Tag exists but href attribute is absent or empty
464+
canonicalTagChecks.push({
465+
check: CANONICAL_CHECKS.CANONICAL_TAG_NO_HREF.check,
466+
success: false,
467+
explanation: CANONICAL_CHECKS.CANONICAL_TAG_NO_HREF.explanation,
468+
});
432469
} else {
433470
// Canonical tag exists
434471
canonicalTagChecks.push({
@@ -465,7 +502,7 @@ export async function processScrapedContent(context) {
465502
}
466503

467504
// Check if canonical is nonempty
468-
if (!canonicalUrl || canonicalUrl.trim() === '') {
505+
if (!hasText(canonicalUrl)) {
469506
canonicalTagChecks.push({
470507
check: CANONICAL_CHECKS.CANONICAL_TAG_EMPTY.check,
471508
success: false,
@@ -478,21 +515,7 @@ export async function processScrapedContent(context) {
478515
});
479516
}
480517

481-
// Check if canonical is self-referenced (ignoring protocol, domain, query, hash, case)
482-
const normalizeUrl = (u) => {
483-
try {
484-
const urlObj = new URL(u);
485-
// Remove protocol, domain, query params, and hash; lowercase; keep only pathname
486-
return urlObj.pathname.toLowerCase();
487-
} catch {
488-
return u.toLowerCase();
489-
}
490-
};
491-
const normalizedCanonical = normalizeUrl(canonicalUrl);
492-
const normalizedFinal = normalizeUrl(finalUrl);
493-
494-
// Canonical should match the final URL path (what was actually served)
495-
const isSelfReferenced = normalizedCanonical === normalizedFinal;
518+
const isSelfReferenced = isSelfReferencing(canonicalUrl, finalUrl);
496519
if (isSelfReferenced) {
497520
canonicalTagChecks.push({
498521
check: CANONICAL_CHECKS.CANONICAL_SELF_REFERENCED.check,
@@ -508,6 +531,7 @@ export async function processScrapedContent(context) {
508531
}
509532

510533
const checks = [...canonicalTagChecks];
534+
const isPreview = isPreviewPage(baseURL);
511535

512536
if (canonicalUrl) {
513537
const urlFormatChecks = validateCanonicalFormat(canonicalUrl, baseURL, log, isPreview);
@@ -533,19 +557,13 @@ export async function processScrapedContent(context) {
533557
// invalid URL; accessibility check may still run and report fetch error
534558
}
535559

536-
// Skip accessibility only when self-referenced, same domain, and same protocol.
560+
// Skip HTTP reachability checks when self-referenced with matching domain and protocol —
561+
// the canonical URL is the current page, so no fetch is needed.
537562
const shouldCheckAccessibility = !isSelfReferenced
538563
|| canonicalDifferentDomain
539564
|| canonicalDifferentProtocol;
540565
if (!shouldCheckAccessibility) {
541-
checks.push({
542-
check: CANONICAL_CHECKS.CANONICAL_URL_STATUS_OK.check,
543-
success: true,
544-
});
545-
checks.push({
546-
check: CANONICAL_CHECKS.CANONICAL_URL_NO_REDIRECT.check,
547-
success: true,
548-
});
566+
checks.push(...REACHABILITY_CHECKS.map((check) => ({ check, success: true })));
549567
} else {
550568
const options = await getPreviewAuthOptions(isPreview, baseURL, site, context, log);
551569

@@ -557,7 +575,7 @@ export async function processScrapedContent(context) {
557575
return { url, checks };
558576
} catch (error) {
559577
log.error(`[canonical] Error processing scraped content from ${key}: ${error.message}`);
560-
return null;
578+
return EMPTY_CANONICAL_RESULT;
561579
}
562580
});
563581

0 commit comments

Comments
 (0)