Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 4 additions & 8 deletions wat_extract_links.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,9 +344,9 @@ class ExtractHostLinksJob(ExtractLinksJob):
global_link_pattern = re.compile(r'^(?:[a-z][a-z0-9]{1,5}:)?//',
re.IGNORECASE|re.ASCII)

# match IP addresses
# - including IPs with leading `www.' (stripped)
ip_pattern = re.compile(r'^(?:www\.)?\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\Z')
# simple pattern to match common IPv4 and IPv6 addresses
# (a quick check to prevent IP addresses from being accepted as valid host names)
host_ip_pattern = re.compile(r'^(?:\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|[0-9a-f]{0,4}:[0-9a-f:]+)\Z')

# valid host names, relaxed allowing underscore, allowing also IDNAs
# https://en.wikipedia.org/wiki/Hostname#Restrictions_on_valid_hostnames
Expand All @@ -373,7 +373,7 @@ def get_surt_host(url):
host = host.strip().lower()
if len(host) < 1 or len(host) > 253:
return None
if ExtractHostLinksJob.ip_pattern.match(host):
if ExtractHostLinksJob.host_ip_pattern.match(host):
return None
parts = host.split('.')
if parts[-1] == '':
Expand All @@ -382,10 +382,6 @@ def get_surt_host(url):
if len(parts) <= 1:
# do not accept single-word hosts, must be at least `domain.tld'
return None
if len(parts) > 2 and parts[0] == 'www':
# strip leading 'www' to reduce number of "duplicate" hosts,
# but leave at least 2 trailing parts (www.com is a valid domain)
parts = parts[1:]
for (i, part) in enumerate(parts):
if len(part) > 63:
return None
Expand Down