Fix some bugs, add debug logging, and add a CLI

JaneJeon · JaneJeon · commit 59f690a9ce1f · 2021-08-31T21:40:21.000-04:00
diff --git a/bin/normalize.js b/bin/normalize.js
@@ -0,0 +1,17 @@
+#!/usr/bin/env node
+/* eslint-disable no-console */
+
+// CLI entry point: normalizes the URL passed as the first argument and prints it to stdout.
+import gen from '../index.js'
+const normalize = gen()
+const [, , ...args] = process.argv
+
+;(async () => {
+  try {
+    console.log(await normalize(args[0]))
+  } catch (err) {
+    // Surface the failure and exit non-zero instead of leaving a floating rejected promise
+    console.error(err)
+    process.exitCode = 1
+  }
+})()
diff --git a/index.js b/index.js
@@ -2,6 +2,9 @@ import QuickLRU from 'quick-lru'
 import normalizeUrl from './utils/normalize-url.js'
 import httpClientGen from './utils/http-client.js'
 import dnsLookupGen from './utils/dns-lookup.js'
+import logger from './utils/logger.js'
+
+const debug = logger('index.js')
 
 export default (
   normalizeUrlOptions = {
@@ -24,12 +27,16 @@ export default (
 
   // Normalize URL so that we can search by URL.
   return async function normalizePlus(url = '') {
+    debug('Normalizing URL %s', url)
+
     // 1. "Base" normalization using normalize-url + stripping trackers
     // When an invalid link is passed, it will throw.
     const link = await normalize(url)
+    debug('Normalization first pass: %s', link)
 
     // 2. Follow redirects to deal with "intermediate" links (such as the links on google search results)
     const res = await httpClient.get(link, { context: { normalize } })
+    debug('Normalization second pass: %s', res.url)
 
     // At this point, the link will be completely normalized based on canonical links (if one exists)
     return res.url
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -7,6 +7,7 @@
   "repository": "JaneJeon/normalize-url-plus",
   "author": "Jane Jeon <me@janejeon.dev>",
   "license": "LGPL-3.0",
+  "bin": "bin/normalize.js",
   "scripts": {
     "test": "NODE_OPTIONS=--experimental-vm-modules jest",
     "update-data": "curl -L -s https://rules2.clearurls.xyz/data.minify.json > data/data.minify.json",
@@ -16,6 +17,8 @@
   },
   "dependencies": {
     "cheerio": "^1.0.0-rc.10",
+    "debug": "^4.3.2",
+    "got": "^12.0.0-beta.4",
     "got-scraping": "^3.0.1",
     "got-ssrf": "^1.0.2",
     "ipaddr.js": "^2.0.1",
diff --git a/utils/canonicize.js b/utils/canonicize.js
@@ -3,54 +3,75 @@ import trim from 'lodash/trim.js'
 import leven from 'leven'
 import parseTld from 'tld-extract'
 import urlIsAmp from './url-is-amp.js'
+import logger from './logger.js'
+
+const debug = logger('utils/canonicize.js')
 
 // Look for the canonical link (also un-AMP-ifies the canonical link)
 // Not writing a separate metascraper-canonical library for this, as the "standard" way of determining
 // canonical link includes looking at the HTTP header: https://developers.google.com/search/docs/advanced/crawling/consolidate-duplicate-urls
 export default async function canonicizeHook(res) {
-  if (!res.request.options.context.normalize) return
+  if (!res.request.options.context.normalize) {
+    debug('got.context.normalize not set; not canonicizing...')
+    return res
+  }
+
   const { normalize } = res.request.options.context
 
   // Normalize the "final" URL up front
   const normalizedUrl = await normalize(res.url)
+  debug('Normalized res.url %s to %s', res.url, normalizedUrl)
 
   // Ripped from https://github.com/KilledMufasa/AmputatorBot/blob/master/helpers/canonical_methods.py
   const $ = cheerio.load(res.body)
   const matches = []
 
   // 5.1: rel=canonical <link> tag
   $('link[rel=canonical]').each(function () {
-    matches.push($(this).attr('href'))
+    const match = $(this).attr('href')
+    matches.push(match)
+    debug('Matched rel=canonical <link> tag: %s', match)
   })
 
   // 5.2: rel=canonical HTTP header
   if ('link' in res.headers) {
+    debug('"Link" header exists, searching for rel=canonical...')
+
     // We're looking for something like:
     // Link: <https://example.com>; rel="canonical", ...
     res.headers.link.split(',').forEach(linkHeader => {
       const parts = linkHeader.split(';')
-      if (parts.length !== 2) return
+      if (parts.length !== 2) {
+        debug('Not enough parts exist in the header: %s', linkHeader)
+        return
+      }
 
       const [linkStr, relStr] = parts
+      debug('Extracted link fragment %s and rel fragment %s', linkStr, relStr)
 
       // rel="canonical", rel=canonical, rel canonical, etc.
       const relStrLower = relStr.toLowerCase()
       if (relStrLower.includes('rel') && relStrLower.includes('canonical')) {
         // <https://example.com>, https://example.com, etc.
         const url = trim(linkStr.trim(), ['<', '>', ' '])
         matches.push(url)
+        debug('Found canonical in header: %s', url)
       }
     })
   }
 
   // 5.3: AMP variant
   $('a.amp-canurl').each(function () {
-    matches.push($(this).attr('href'))
+    const match = $(this).attr('href')
+    matches.push(match)
+    debug('Found non-AMP variant: %s', match)
   })
 
   // 5.4: OpenGraph
   $('meta[property="og:url"]').each(function () {
-    matches.push($(this).attr('content'))
+    const match = $(this).attr('content')
+    matches.push(match)
+    debug('Found OpenGraph og:url: %s', match)
   })
 
   // 5.5: Sitemap (I'm not doing this shit)
@@ -60,6 +81,7 @@ export default async function canonicizeHook(res) {
 
   const { hostname: domain } = new URL(normalizedUrl)
   const { domain: baseDomain } = parseTld(normalizedUrl)
+  debug('Finding the best match for host %s and TLD %s...', domain, baseDomain)
 
   let result = normalizedUrl
   let minDist = Number.POSITIVE_INFINITY
@@ -69,20 +91,34 @@ export default async function canonicizeHook(res) {
 
     // turn relative to absolute URL
     if (match.startsWith('/')) link = `${domain}${match}`
+    debug('Considering match %s...', link)
 
     // Skip invalid links
     try {
       link = await normalize(link)
+      debug('Normalized match to %s', link)
 
       // Ensure that every match is a valid URL w/ a matching domain
       // In this case, we're only matching the "top-level" domain -
       // e.g. subdomain.(domain.com) - as a lot of sites host their shit on amp.(site.com)
       // so we want to include references to www.site.com (actually *prefer* those)
-      if (parseTld(link).domain !== baseDomain) continue
+      const { domain: matchDomain } = parseTld(link)
+      if (matchDomain !== baseDomain) {
+        debug(
+          'The domain %s does not match the base domain %s',
+          matchDomain,
+          baseDomain
+        )
+        continue
+      }
 
       // Then, ensure that links aren't AMP'd
-      if (urlIsAmp(link)) continue
+      if (urlIsAmp(link)) {
+        debug('Link %s is AMP, skipping...', link)
+        continue
+      }
     } catch (err) {
+      debug('Error %s while considering match %s', err, match)
       continue
     }
 
@@ -95,6 +131,7 @@ export default async function canonicizeHook(res) {
   }
 
   res.url = result
+  debug('Found best match %s', result)
 
   return res
 }
diff --git a/utils/dns-lookup.js b/utils/dns-lookup.js
@@ -1,15 +1,28 @@
 import { URL } from 'url'
 import { lookup as nativeLookup } from 'dns/promises'
+import logger from './logger.js'
+
+const debug = logger('utils/dns-lookup.js')
 
 export default dnsCache => {
-  const lookup = dnsCache ? dnsCache.lookupAsync : nativeLookup
+  let lookup
+  if (dnsCache) {
+    debug('Using dnsCache.lookupAsync for DNS lookups')
+    lookup = dnsCache.lookupAsync
+  } else {
+    debug('dnsCache does not exist, falling back to dns/promises')
+    lookup = nativeLookup
+  }
 
   return async url => {
+    debug('Looking up DNS for %s', url)
     try {
       const { hostname } = new URL(url)
       await lookup(hostname)
+      debug('Lookup of host %s successful', hostname)
       return true
     } catch (err) {
+      debug('Lookup of %s unsuccessful: %s', url, err)
       return false
     }
   }
diff --git a/utils/logger.js b/utils/logger.js
@@ -0,0 +1,7 @@
+import debug from 'debug'
+
+const BASE = 'normalize'
+export default function (name) {
+  const fullName = `${BASE}:${name}`
+  return debug(fullName)
+}
diff --git a/utils/normalize-url.js b/utils/normalize-url.js
@@ -1,6 +1,9 @@
 import normalizeUrl from 'normalize-url'
 import stripTrackers from './strip-trackers.js'
 import { URL } from 'url'
+import logger from './logger.js'
+
+const debug = logger('utils/normalize-url.js')
 
 export default function gen(normalizeUrlOptions, dnsLookup, httpClient) {
   return async function normalize(originalUrl) {
@@ -9,42 +12,49 @@ export default function gen(normalizeUrlOptions, dnsLookup, httpClient) {
       stripWWW: true,
       forceHttps: true
     }
-    // always strip trackers for consistency (even if it means worse performance)!
-    const cleanUrl = stripTrackers(originalUrl)
 
     // Pass 1: try to force as much normalization as possible, knowing that this may break some links
-    let url = normalizeUrl(cleanUrl, {
+    let url = normalizeUrl(originalUrl, {
       ...normalizeUrlOptions,
       ...preferredOptions
     })
+    debug('Normalization pass 1: %s', url)
 
     // Check 1: if the www-stripped domain exists...
     if (originalUrl.includes('www')) {
+      debug('Checking if www-stripped domain exists...')
       const { hostname } = new URL(url)
       try {
         await dnsLookup(hostname)
+        debug('www-stripped domain exists!')
       } catch (err) {
         // Pass 2: we can't resolve the www-stripped host at the DNS level, so we enable it
         preferredOptions.stripWWW = false
         url = normalizeUrl(originalUrl, {
           ...normalizeUrlOptions,
           ...preferredOptions
         })
+        debug('non-www domain does not exist, using %s', url)
       }
     }
 
     // Check 2: if the site doesn't support HTTPS...
+    debug('Checking if https version exists...')
     try {
       await httpClient.head(url)
+      debug('https version exists!')
     } catch (err) {
       // Pass 3: we can't reach the URL via HTTP HEAD request, so try downgrading to http
       preferredOptions.forceHttps = false
-      url = normalizeUrl(cleanUrl, {
+      url = normalizeUrl(originalUrl, {
         ...normalizeUrlOptions,
         ...preferredOptions
       })
+      debug('https version does not exist, using %s', url)
     }
 
-    return url
+    // always strip trackers for consistency (even if it means worse performance)!
+    debug('stripping trackers from %s', url)
+    return stripTrackers(url)
   }
 }
diff --git a/utils/strip-trackers.js b/utils/strip-trackers.js