1010 * governing permissions and limitations under the License.
1111 */
1212
13- import { composeBaseURL , tracingFetch as fetch } from '@adobe/spacecat-shared-utils' ;
13+ import {
14+ composeBaseURL , hasText , isString , tracingFetch as fetch ,
15+ } from '@adobe/spacecat-shared-utils' ;
1416import { Audit } from '@adobe/spacecat-shared-data-access' ;
1517import { retrievePageAuthentication } from '@adobe/spacecat-shared-ims-client' ;
1618
@@ -34,6 +36,13 @@ import { isAuthUrl } from '../support/utils.js';
3436const auditType = Audit . AUDIT_TYPES . CANONICAL ;
3537const { AUDIT_STEP_DESTINATIONS } = Audit ;
3638
39+ const REACHABILITY_CHECKS = [
40+ CANONICAL_CHECKS . CANONICAL_URL_STATUS_OK . check ,
41+ CANONICAL_CHECKS . CANONICAL_URL_NO_REDIRECT . check ,
42+ ] ;
43+
44+ const EMPTY_CANONICAL_RESULT = Object . freeze ( { url : null , checks : [ ] } ) ;
45+
3746/**
3847 * Step 1: Import top pages (used in multi-step audit)
3948 */
@@ -139,26 +148,24 @@ export function validateCanonicalFormat(canonicalUrl, baseUrl, log, isPreview =
139148 }
140149
141150 // Check if the canonical URL is fully uppercased
142- if ( canonicalUrl ) {
143- if ( typeof canonicalUrl === 'string' ) {
144- const isAllCaps = canonicalUrl === canonicalUrl . toUpperCase ( ) ;
145- if ( isAllCaps ) {
146- checks . push ( {
147- check : CANONICAL_CHECKS . CANONICAL_URL_LOWERCASED . check ,
148- success : false ,
149- explanation : CANONICAL_CHECKS . CANONICAL_URL_LOWERCASED . explanation ,
150- } ) ;
151- } else {
152- checks . push ( {
153- check : CANONICAL_CHECKS . CANONICAL_URL_LOWERCASED . check ,
154- success : true ,
155- } ) ;
156- }
151+ if ( isString ( canonicalUrl ) ) {
152+ const isAllCaps = canonicalUrl === canonicalUrl . toUpperCase ( ) ;
153+ if ( isAllCaps ) {
154+ checks . push ( {
155+ check : CANONICAL_CHECKS . CANONICAL_URL_LOWERCASED . check ,
156+ success : false ,
157+ explanation : CANONICAL_CHECKS . CANONICAL_URL_LOWERCASED . explanation ,
158+ } ) ;
157159 } else {
158- log . error ( `[canonical] Canonical URL is not a string: ${ typeof canonicalUrl } ` ) ;
159- // Skip adding check for invalid type - validation errors should only be logged
160- return checks ;
160+ checks . push ( {
161+ check : CANONICAL_CHECKS . CANONICAL_URL_LOWERCASED . check ,
162+ success : true ,
163+ } ) ;
161164 }
165+ } else if ( canonicalUrl ) {
166+ log . error ( `[canonical] Canonical URL is not a string: ${ typeof canonicalUrl } ` ) ;
167+ // Skip adding check for invalid type - validation errors should only be logged
168+ return checks ;
162169 }
163170
164171 // Check if the canonical URL is absolute (case-insensitive protocol check)
@@ -174,18 +181,18 @@ export function validateCanonicalFormat(canonicalUrl, baseUrl, log, isPreview =
174181 check : CANONICAL_CHECKS . CANONICAL_URL_ABSOLUTE . check ,
175182 success : true ,
176183 } ) ;
177- let url ;
184+ let canonical ;
178185
179186 try {
180- url = new URL ( canonicalUrl ) ;
187+ canonical = new URL ( canonicalUrl ) ;
181188 } catch {
182189 log . error ( `[canonical] Invalid canonical URL: ${ canonicalUrl } ` ) ;
183190 // Skip adding check for invalid URL - validation errors should only be logged
184191 return checks ;
185192 }
186193
187194 // Check if the canonical URL has the same protocol as the base URL
188- if ( ! url . href . startsWith ( base . protocol ) ) {
195+ if ( canonical . protocol !== base . protocol ) {
189196 checks . push ( {
190197 check : CANONICAL_CHECKS . CANONICAL_URL_SAME_PROTOCOL . check ,
191198 success : false ,
@@ -200,7 +207,7 @@ export function validateCanonicalFormat(canonicalUrl, baseUrl, log, isPreview =
200207
201208 // Check if the canonical URL has the same domain as the base URL
202209 if ( ! isPreview ) {
203- if ( composeBaseURL ( url . hostname ) !== composeBaseURL ( base . hostname ) ) {
210+ if ( composeBaseURL ( canonical . hostname ) !== composeBaseURL ( base . hostname ) ) {
204211 checks . push ( {
205212 check : CANONICAL_CHECKS . CANONICAL_URL_SAME_DOMAIN . check ,
206213 success : false ,
@@ -325,6 +332,24 @@ export async function validateCanonicalRecursively(
325332 return checks ;
326333}
327334
335+ /**
336+ * Checks whether two URLs are self-referencing by comparing their normalized pathnames.
337+ *
338+ * @param {string } url1 - First URL to compare.
339+ * @param {string } url2 - Second URL to compare.
340+ * @returns {boolean } True if both URLs resolve to the same pathname (case-insensitive).
341+ */
342+ export function isSelfReferencing ( url1 , url2 ) {
343+ const normalizePath = ( u ) => {
344+ try {
345+ return new URL ( u ) . pathname . toLowerCase ( ) ;
346+ } catch {
347+ return u . toLowerCase ( ) ;
348+ }
349+ } ;
350+ return normalizePath ( url1 ) === normalizePath ( url2 ) ;
351+ }
352+
328353/**
329354 * Generates a suggestion for fixing a canonical issue based on the check type.
330355 *
@@ -391,19 +416,26 @@ export async function processScrapedContent(context) {
391416 return null ;
392417 }
393418
394- if ( ! scrapedObject ?. scrapeResult ?. canonical ) {
395- log . warn ( `[canonical] No canonical metadata in S3 object: ${ key } ` ) ;
396- return null ;
397- }
398-
399- const url = scrapedObject . url || scrapedObject . finalUrl ;
419+ const { url } = scrapedObject ;
400420 if ( ! url ) {
401421 log . warn ( `[canonical] No URL found in S3 object: ${ key } ` ) ;
402422 return null ;
403423 }
404424
405425 const finalUrl = scrapedObject . finalUrl || url ;
406426
427+ if ( ! scrapedObject ?. scrapeResult ?. canonical ) {
428+ log . warn ( `[canonical] No canonical metadata in S3 object: ${ key } ` ) ;
429+ return {
430+ url,
431+ checks : [ {
432+ check : CANONICAL_CHECKS . CANONICAL_TAG_MISSING . check ,
433+ success : false ,
434+ explanation : CANONICAL_CHECKS . CANONICAL_TAG_MISSING . explanation ,
435+ } ] ,
436+ } ;
437+ }
438+
407439 // Filter out scraped pages that redirected to auth/login pages or PDFs
408440 // This prevents false positives when a legitimate page redirects to login
409441 if ( isAuthUrl ( finalUrl ) ) {
@@ -415,20 +447,25 @@ export async function processScrapedContent(context) {
415447 return null ;
416448 }
417449
418- const isPreview = isPreviewPage ( baseURL ) ;
419-
420450 // Use canonical metadata already extracted by the scraper (Puppeteer)
421451 const canonicalMetadata = scrapedObject . scrapeResult . canonical ;
422452 const canonicalUrl = canonicalMetadata . href || null ;
423453 const canonicalTagChecks = [ ] ;
424454
425455 // Check if canonical tag exists
426- if ( ! canonicalMetadata . exists || ! canonicalUrl ) {
456+ if ( ! canonicalMetadata . exists ) {
427457 canonicalTagChecks . push ( {
428458 check : CANONICAL_CHECKS . CANONICAL_TAG_MISSING . check ,
429459 success : false ,
430460 explanation : CANONICAL_CHECKS . CANONICAL_TAG_MISSING . explanation ,
431461 } ) ;
462+ } else if ( ! canonicalUrl ) {
463+ // Tag exists but href attribute is absent or empty
464+ canonicalTagChecks . push ( {
465+ check : CANONICAL_CHECKS . CANONICAL_TAG_NO_HREF . check ,
466+ success : false ,
467+ explanation : CANONICAL_CHECKS . CANONICAL_TAG_NO_HREF . explanation ,
468+ } ) ;
432469 } else {
433470 // Canonical tag exists
434471 canonicalTagChecks . push ( {
@@ -465,7 +502,7 @@ export async function processScrapedContent(context) {
465502 }
466503
467504 // Check if canonical is nonempty
468- if ( ! canonicalUrl || canonicalUrl . trim ( ) === '' ) {
505+ if ( ! hasText ( canonicalUrl ) ) {
469506 canonicalTagChecks . push ( {
470507 check : CANONICAL_CHECKS . CANONICAL_TAG_EMPTY . check ,
471508 success : false ,
@@ -478,21 +515,7 @@ export async function processScrapedContent(context) {
478515 } ) ;
479516 }
480517
481- // Check if canonical is self-referenced (ignoring protocol, domain, query, hash, case)
482- const normalizeUrl = ( u ) => {
483- try {
484- const urlObj = new URL ( u ) ;
485- // Remove protocol, domain, query params, and hash; lowercase; keep only pathname
486- return urlObj . pathname . toLowerCase ( ) ;
487- } catch {
488- return u . toLowerCase ( ) ;
489- }
490- } ;
491- const normalizedCanonical = normalizeUrl ( canonicalUrl ) ;
492- const normalizedFinal = normalizeUrl ( finalUrl ) ;
493-
494- // Canonical should match the final URL path (what was actually served)
495- const isSelfReferenced = normalizedCanonical === normalizedFinal ;
518+ const isSelfReferenced = isSelfReferencing ( canonicalUrl , finalUrl ) ;
496519 if ( isSelfReferenced ) {
497520 canonicalTagChecks . push ( {
498521 check : CANONICAL_CHECKS . CANONICAL_SELF_REFERENCED . check ,
@@ -508,6 +531,7 @@ export async function processScrapedContent(context) {
508531 }
509532
510533 const checks = [ ...canonicalTagChecks ] ;
534+ const isPreview = isPreviewPage ( baseURL ) ;
511535
512536 if ( canonicalUrl ) {
513537 const urlFormatChecks = validateCanonicalFormat ( canonicalUrl , baseURL , log , isPreview ) ;
@@ -533,19 +557,13 @@ export async function processScrapedContent(context) {
533557 // invalid URL; accessibility check may still run and report fetch error
534558 }
535559
536- // Skip accessibility only when self-referenced, same domain, and same protocol.
560+ // Skip HTTP reachability checks when self-referenced with matching domain and protocol —
561+ // the canonical URL is the current page, so no fetch is needed.
537562 const shouldCheckAccessibility = ! isSelfReferenced
538563 || canonicalDifferentDomain
539564 || canonicalDifferentProtocol ;
540565 if ( ! shouldCheckAccessibility ) {
541- checks . push ( {
542- check : CANONICAL_CHECKS . CANONICAL_URL_STATUS_OK . check ,
543- success : true ,
544- } ) ;
545- checks . push ( {
546- check : CANONICAL_CHECKS . CANONICAL_URL_NO_REDIRECT . check ,
547- success : true ,
548- } ) ;
566+ checks . push ( ...REACHABILITY_CHECKS . map ( ( check ) => ( { check, success : true } ) ) ) ;
549567 } else {
550568 const options = await getPreviewAuthOptions ( isPreview , baseURL , site , context , log ) ;
551569
@@ -557,7 +575,7 @@ export async function processScrapedContent(context) {
557575 return { url, checks } ;
558576 } catch ( error ) {
559577 log . error ( `[canonical] Error processing scraped content from ${ key } : ${ error . message } ` ) ;
560- return null ;
578+ return EMPTY_CANONICAL_RESULT ;
561579 }
562580 } ) ;
563581
0 commit comments