Skip to content

Commit 59f690a

Browse files
committed
fix some bugs and add CLI
1 parent 9bf3882 commit 59f690a

File tree

9 files changed

+126
-26
lines changed

9 files changed

+126
-26
lines changed

bin/normalize.js

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
#!/usr/bin/env node
2+
/* eslint-disable no-console */
3+
4+
import gen from '../index.js'
5+
const normalize = gen()
6+
const [, , ...args] = process.argv
7+
8+
;(async () => {
9+
console.log(await normalize(args[0]))
10+
})()

index.js

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@ import QuickLRU from 'quick-lru'
22
import normalizeUrl from './utils/normalize-url.js'
33
import httpClientGen from './utils/http-client.js'
44
import dnsLookupGen from './utils/dns-lookup.js'
5+
import logger from './utils/logger.js'
6+
7+
const debug = logger('index.js')
58

69
export default (
710
normalizeUrlOptions = {
@@ -24,12 +27,16 @@ export default (
2427

2528
// Normalize URL so that we can search by URL.
2629
return async function normalizePlus(url = '') {
30+
debug('Normalizing URL %s', url)
31+
2732
// 1. "Base" normalization using normalize-url + stripping trackers
2833
// When an invalid link is passed, it will throw.
2934
const link = await normalize(url)
35+
debug('Normalization first pass: %s', url)
3036

3137
// 2. Follow redirects to deal with "intermediate" links (such as the links on Google search results)
3238
const res = await httpClient.get(link, { context: { normalize } })
39+
debug('Normalization second pass: %s', res.url)
3340

3441
// At this point, the link will be completely normalized based on canonical links (if one exists)
3542
return res.url

package-lock.json

Lines changed: 7 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
"repository": "JaneJeon/normalize-url-plus",
88
"author": "Jane Jeon <me@janejeon.dev>",
99
"license": "LGPL-3.0",
10+
"bin": "bin/normalize",
1011
"scripts": {
1112
"test": "NODE_OPTIONS=--experimental-vm-modules jest",
1213
"update-data": "curl -L -s https://rules2.clearurls.xyz/data.minify.json > data/data.minify.json",
@@ -16,6 +17,8 @@
1617
},
1718
"dependencies": {
1819
"cheerio": "^1.0.0-rc.10",
20+
"debug": "^4.3.2",
21+
"got": "^12.0.0-beta.4",
1922
"got-scraping": "^3.0.1",
2023
"got-ssrf": "^1.0.2",
2124
"ipaddr.js": "^2.0.1",

utils/canonicize.js

Lines changed: 44 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,54 +3,75 @@ import trim from 'lodash/trim.js'
33
import leven from 'leven'
44
import parseTld from 'tld-extract'
55
import urlIsAmp from './url-is-amp.js'
6+
import logger from './logger.js'
7+
8+
const debug = logger('utils/canonicize.js')
69

710
// Look for the canonical link (also un-AMP-ifies the canonical link)
811
// Not writing a separate metascraper-canonical library for this, as the "standard" way of determining
912
// canonical link includes looking at the HTTP header: https://developers.google.com/search/docs/advanced/crawling/consolidate-duplicate-urls
1013
export default async function canonicizeHook(res) {
11-
if (!res.request.options.context.normalize) return
14+
if (!res.request.options.context.normalize) {
15+
debug('got.context.normalize not set; not canonicizing...')
16+
return res
17+
}
18+
1219
const { normalize } = res.request.options.context
1320

1421
// Normalize the "final" URL up front
1522
const normalizedUrl = await normalize(res.url)
23+
debug('Normalized res.url %s to %s', res.url, normalizedUrl)
1624

1725
// Ripped from https://github.com/KilledMufasa/AmputatorBot/blob/master/helpers/canonical_methods.py
1826
const $ = cheerio.load(res.body)
1927
const matches = []
2028

2129
// 5.1: rel=canonical <link> tag
2230
$('link[rel=canonical]').each(function () {
23-
matches.push($(this).attr('href'))
31+
const match = $(this).attr('href')
32+
matches.push(match)
33+
debug('Matched rel=canonical <link> tag: %s', match)
2434
})
2535

2636
// 5.2: rel=canonical HTTP header
2737
if ('link' in res.headers) {
38+
debug('"Link" header exists, searching for rel=canonical...')
39+
2840
// We're looking for something like:
2941
// Link: <https://example.com>; rel="canonical", ...
3042
res.headers.link.split(',').forEach(linkHeader => {
3143
const parts = linkHeader.split(';')
32-
if (parts.length !== 2) return
44+
if (parts.length !== 2) {
45+
debug('Not enough parts exist in the header: %s', linkHeader)
46+
return
47+
}
3348

3449
const [linkStr, relStr] = parts
50+
debug('Extracted link fragment %s and rel fragment %s', linkStr, relStr)
3551

3652
// rel="canonical", rel=canonical, rel canonical, etc.
3753
const relStrLower = relStr.toLowerCase()
3854
if (relStrLower.includes('rel') && relStrLower.includes('canonical')) {
3955
// <https://example.com>, https://example.com, etc.
4056
const url = trim(linkStr.trim(), ['<', '>', ' '])
4157
matches.push(url)
58+
debug('Found canonical in header: %s', url)
4259
}
4360
})
4461
}
4562

4663
// 5.3: AMP variant
4764
$('a.amp-canurl').each(function () {
48-
matches.push($(this).attr('href'))
65+
const match = $(this).attr('href')
66+
matches.push(match)
67+
debug('Found non-AMP variant: %s', match)
4968
})
5069

5170
// 5.4: OpenGraph
5271
$('meta[property="og:url"]').each(function () {
53-
matches.push($(this).attr('content'))
72+
const match = $(this).attr('content')
73+
matches.push(match)
74+
debug('Found OpenGraph og:url: %s', match)
5475
})
5576

5677
// 5.5: Sitemap (I'm not doing this shit)
@@ -60,6 +81,7 @@ export default async function canonicizeHook(res) {
6081

6182
const { hostname: domain } = new URL(normalizedUrl)
6283
const { domain: baseDomain } = parseTld(normalizedUrl)
84+
debug('Finding the best match for host %s and TLD %s...', domain, baseDomain)
6385

6486
let result = normalizedUrl
6587
let minDist = Number.POSITIVE_INFINITY
@@ -69,20 +91,34 @@ export default async function canonicizeHook(res) {
6991

7092
// turn relative to absolute URL
7193
if (match.startsWith('/')) link = `${domain}${match}`
94+
debug('Considering match %s...', link)
7295

7396
// Skip invalid links
7497
try {
7598
link = await normalize(link)
99+
debug('Normalized match to %s', link)
76100

77101
// Ensure that every match is a valid URL w/ a matching domain
78102
// In this case, we're only matching the "top-level" domain -
79103
// e.g. subdomain.(domain.com) - as a lot of sites host their shit on amp.(site.com)
80104
// so we want to include references to www.site.com (actually *prefer* those)
81-
if (parseTld(link).domain !== baseDomain) continue
105+
const { domain: matchDomain } = parseTld(link)
106+
if (matchDomain !== baseDomain) {
107+
debug(
108+
'The domain %s does not match the base domain %s',
109+
matchDomain,
110+
baseDomain
111+
)
112+
continue
113+
}
82114

83115
// Then, ensure that links aren't AMP'd
84-
if (urlIsAmp(link)) continue
116+
if (urlIsAmp(link)) {
117+
debug('Link %s is AMP, skipping...', link)
118+
continue
119+
}
85120
} catch (err) {
121+
debug('Error %s while considering match %s', err, match)
86122
continue
87123
}
88124

@@ -95,6 +131,7 @@ export default async function canonicizeHook(res) {
95131
}
96132

97133
res.url = result
134+
debug('Found best match %s', result)
98135

99136
return res
100137
}

utils/dns-lookup.js

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,28 @@
11
import { URL } from 'url'
22
import { lookup as nativeLookup } from 'dns/promises'
3+
import logger from './logger.js'
4+
5+
const debug = logger('utils/dns-lookup.js')
36

47
export default dnsCache => {
5-
const lookup = dnsCache ? dnsCache.lookupAsync : nativeLookup
8+
let lookup
9+
if (dnsCache) {
10+
debug('Using dnsCache.lookupAsync for DNS lookups')
11+
lookup = dnsCache.lookupAsync
12+
} else {
13+
debug('dnsCache does not exist, falling back to dns/promises')
14+
lookup = nativeLookup
15+
}
616

717
return async url => {
18+
debug('Looking up DNS for %s', url)
819
try {
920
const { hostname } = new URL(url)
1021
await lookup(hostname)
22+
debug('Lookup of host %s successful', hostname)
1123
return true
1224
} catch (err) {
25+
debug('Lookup of %s unsuccessful', url)
1326
return false
1427
}
1528
}

utils/logger.js

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
import debug from 'debug'
2+
3+
const BASE = 'normalize'
4+
export default function (name) {
5+
const fullName = `${BASE}:${name}`
6+
return debug(fullName)
7+
}

utils/normalize-url.js

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
import normalizeUrl from 'normalize-url'
22
import stripTrackers from './strip-trackers.js'
33
import { URL } from 'url'
4+
import logger from './logger.js'
5+
6+
const debug = logger('utils/normalize-url.js')
47

58
export default function gen(normalizeUrlOptions, dnsLookup, httpClient) {
69
return async function normalize(originalUrl) {
@@ -9,42 +12,49 @@ export default function gen(normalizeUrlOptions, dnsLookup, httpClient) {
912
stripWWW: true,
1013
forceHttps: true
1114
}
12-
// always strip trackers for consistency (even if it means worse performance)!
13-
const cleanUrl = stripTrackers(originalUrl)
1415

1516
// Pass 1: try to force as much normalization as possible, knowing that this may break some links
16-
let url = normalizeUrl(cleanUrl, {
17+
let url = normalizeUrl(originalUrl, {
1718
...normalizeUrlOptions,
1819
...preferredOptions
1920
})
21+
debug('Normalization pass 1: %s', url)
2022

2123
// Check 1: if the www-stripped domain exists...
2224
if (originalUrl.includes('www')) {
25+
debug('Checking if www-stripped domain exists...')
2326
const { hostname } = new URL(url)
2427
try {
2528
await dnsLookup(hostname)
29+
debug('www-stripped domain exists!')
2630
} catch (err) {
2731
// Pass 2: we can't resolve the www-stripped host at the DNS level, so we enable it
2832
preferredOptions.stripWWW = false
2933
url = normalizeUrl(originalUrl, {
3034
...normalizeUrlOptions,
3135
...preferredOptions
3236
})
37+
debug('non-www domain does not exist, using %s', url)
3338
}
3439
}
3540

3641
// Check 2: if the site doesn't support HTTPS...
42+
debug('Checking if https version exists...')
3743
try {
3844
await httpClient.head(url)
45+
debug('https version exists!')
3946
} catch (err) {
4047
// Pass 3: we can't reach the URL via HTTP HEAD request, so try downgrading to http
4148
preferredOptions.forceHttps = false
42-
url = normalizeUrl(cleanUrl, {
49+
url = normalizeUrl(originalUrl, {
4350
...normalizeUrlOptions,
4451
...preferredOptions
4552
})
53+
debug('https version does not exist, using %s', url)
4654
}
4755

48-
return url
56+
// always strip trackers for consistency (even if it means worse performance)!
57+
debug('stripping trackers from %s', url)
58+
return stripTrackers(url)
4959
}
5060
}

0 commit comments

Comments
 (0)