Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 4 additions & 8 deletions wat_extract_links.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,9 +344,9 @@ class ExtractHostLinksJob(ExtractLinksJob):
global_link_pattern = re.compile(r'^(?:[a-z][a-z0-9]{1,5}:)?//',
re.IGNORECASE|re.ASCII)

# match IP addresses
# - including IPs with leading `www.' (stripped)
ip_pattern = re.compile(r'^(?:www\.)?\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\Z')
# simple pattern to match common IPv4 and IPv6 addresses
# (a quick check to prevent IP addresses from being accepted as valid host names)
host_ip_pattern = re.compile(r'^(?:\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|[0-9a-f]{0,4}:[0-9a-f:]+)\Z')

# valid host names, relaxed allowing underscore, allowing also IDNAs
# https://en.wikipedia.org/wiki/Hostname#Restrictions_on_valid_hostnames
Expand All @@ -373,7 +373,7 @@ def get_surt_host(url):
host = host.strip().lower()
if len(host) < 1 or len(host) > 253:
return None
if ExtractHostLinksJob.ip_pattern.match(host):
if ExtractHostLinksJob.host_ip_pattern.match(host):
return None
parts = host.split('.')
if parts[-1] == '':
Expand All @@ -382,10 +382,6 @@ def get_surt_host(url):
if len(parts) <= 1:
# do not accept single-word hosts, must be at least `domain.tld'
return None
if len(parts) > 2 and parts[0] == 'www':
# strip leading 'www' to reduce number of "duplicate" hosts,
# but leave at least 2 trailing parts (www.com is a valid domain)
parts = parts[1:]
for (i, part) in enumerate(parts):
if len(part) > 63:
return None
Expand Down