@@ -3,54 +3,75 @@ import trim from 'lodash/trim.js'
33import leven from 'leven'
44import parseTld from 'tld-extract'
55import urlIsAmp from './url-is-amp.js'
6+ import logger from './logger.js'
7+
8+ const debug = logger ( 'utils/canonicize.js' )
69
710// Look for the canonical link (also un-AMP-ifies the canonical link)
811// Not writing a separate metascraper-canonical library for this, as the "standard" way of determining
912// canonical link includes looking at the HTTP header: https://developers.google.com/search/docs/advanced/crawling/consolidate-duplicate-urls
1013export default async function canonicizeHook ( res ) {
11- if ( ! res . request . options . context . normalize ) return
14+ if ( ! res . request . options . context . normalize ) {
15+ debug ( 'got.context.normalize not set; not canonicizing...' )
16+ return res
17+ }
18+
1219 const { normalize } = res . request . options . context
1320
1421 // Normalize the "final" URL up front
1522 const normalizedUrl = await normalize ( res . url )
23+ debug ( 'Normalized res.url %s to %s' , res . url , normalizedUrl )
1624
1725 // Ripped from https://github.com/KilledMufasa/AmputatorBot/blob/master/helpers/canonical_methods.py
1826 const $ = cheerio . load ( res . body )
1927 const matches = [ ]
2028
2129 // 5.1: rel=canonical <link> tag
2230 $ ( 'link[rel=canonical]' ) . each ( function ( ) {
23- matches . push ( $ ( this ) . attr ( 'href' ) )
31+ const match = $ ( this ) . attr ( 'href' )
32+ matches . push ( match )
33+ debug ( 'Matched rel=canonical <link> tag: %s' , match )
2434 } )
2535
2636 // 5.2: rel=canonical HTTP header
2737 if ( 'link' in res . headers ) {
38+ debug ( '"Link" header exists, searching for rel=canonical...' )
39+
2840 // We're looking for something like:
2941 // Link: <https://example.com>; rel="canonical", ...
3042 res . headers . link . split ( ',' ) . forEach ( linkHeader => {
3143 const parts = linkHeader . split ( ';' )
32- if ( parts . length !== 2 ) return
44+ if ( parts . length !== 2 ) {
45+ debug ( 'Not enough parts exist in the header: %s' , linkHeader )
46+ return
47+ }
3348
3449 const [ linkStr , relStr ] = parts
50+ debug ( 'Extracted link fragment %s and rel fragment %s' , linkStr , relStr )
3551
3652 // rel="canonical", rel=canonical, rel canonical, etc.
3753 const relStrLower = relStr . toLowerCase ( )
3854 if ( relStrLower . includes ( 'rel' ) && relStrLower . includes ( 'canonical' ) ) {
3955 // <https://example.com>, https://example.com, etc.
4056 const url = trim ( linkStr . trim ( ) , [ '<' , '>' , ' ' ] )
4157 matches . push ( url )
58+ debug ( 'Found canonical in header: %s' , url )
4259 }
4360 } )
4461 }
4562
4663 // 5.3: AMP variant
4764 $ ( 'a.amp-canurl' ) . each ( function ( ) {
48- matches . push ( $ ( this ) . attr ( 'href' ) )
65+ const match = $ ( this ) . attr ( 'href' )
66+ matches . push ( match )
67+ debug ( 'Found non-AMP variant: %s' , match )
4968 } )
5069
5170 // 5.4: OpenGraph
5271 $ ( 'meta[property="og:url"]' ) . each ( function ( ) {
53- matches . push ( $ ( this ) . attr ( 'content' ) )
72+ const match = $ ( this ) . attr ( 'content' )
73+ matches . push ( match )
74+ debug ( 'Found OpenGraph og:url: %s' , match )
5475 } )
5576
5677 // 5.5: Sitemap (I'm not doing this shit)
@@ -60,6 +81,7 @@ export default async function canonicizeHook(res) {
6081
6182 const { hostname : domain } = new URL ( normalizedUrl )
6283 const { domain : baseDomain } = parseTld ( normalizedUrl )
84+ debug ( 'Finding the best match for host %s and TLD %s...' , domain , baseDomain )
6385
6486 let result = normalizedUrl
6587 let minDist = Number . POSITIVE_INFINITY
@@ -69,20 +91,34 @@ export default async function canonicizeHook(res) {
6991
7092 // turn relative to absolute URL
7193 if ( match . startsWith ( '/' ) ) link = `${ domain } ${ match } `
94+ debug ( 'Considering match %s...' , link )
7295
7396 // Skip invalid links
7497 try {
7598 link = await normalize ( link )
99+ debug ( 'Normalized match to %s' , link )
76100
77101 // Ensure that every match is a valid URL w/ a matching domain
78102 // In this case, we're only matching the "top-level" domain -
79103 // e.g. subdomain.(domain.com) - as a lot of sites host their shit on amp.(site.com)
80104 // so we want to include references to www.site.com (actually *prefer* those)
81- if ( parseTld ( link ) . domain !== baseDomain ) continue
105+ const { domain : matchDomain } = parseTld ( link )
106+ if ( matchDomain !== baseDomain ) {
107+ debug (
108+ 'The domain %s does not match the base domain %s' ,
109+ matchDomain ,
110+ baseDomain
111+ )
112+ continue
113+ }
82114
83115 // Then, ensure that links aren't AMP'd
84- if ( urlIsAmp ( link ) ) continue
116+ if ( urlIsAmp ( link ) ) {
117+ debug ( 'Link %s is AMP, skipping...' , link )
118+ continue
119+ }
85120 } catch ( err ) {
121+ debug ( 'Error %s while considering match %s' , err , match )
86122 continue
87123 }
88124
@@ -95,6 +131,7 @@ export default async function canonicizeHook(res) {
95131 }
96132
97133 res . url = result
134+ debug ( 'Found best match %s' , result )
98135
99136 return res
100137}
0 commit comments