Skip to content

Commit 6d3c6d3

Browse files
move url cleaning to link.py
1 parent 80187a5 commit 6d3c6d3

File tree

2 files changed

+100
-101
lines changed

2 files changed

+100
-101
lines changed

src/pip/_internal/index/collector.py

+3-100
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,8 @@
88
import itertools
99
import logging
1010
import os
11-
import re
1211
import urllib.parse
1312
import urllib.request
14-
import xml.etree.ElementTree
1513
from optparse import Values
1614
from typing import (
1715
Callable,
@@ -29,19 +27,18 @@
2927
from pip._vendor.requests.exceptions import RetryError, SSLError
3028

3129
from pip._internal.exceptions import NetworkConnectionError
32-
from pip._internal.models.link import Link
30+
from pip._internal.models.link import HTMLElement, Link
3331
from pip._internal.models.search_scope import SearchScope
3432
from pip._internal.network.session import PipSession
3533
from pip._internal.network.utils import raise_for_status
3634
from pip._internal.utils.filetypes import is_archive_file
37-
from pip._internal.utils.misc import pairwise, redact_auth_from_url
35+
from pip._internal.utils.misc import redact_auth_from_url
3836
from pip._internal.vcs import vcs
3937

4038
from .sources import CandidatesFromPage, LinkSource, build_source
4139

4240
logger = logging.getLogger(__name__)
4341

44-
HTMLElement = xml.etree.ElementTree.Element
4542
ResponseHeaders = MutableMapping[str, str]
4643

4744

@@ -171,96 +168,6 @@ def _determine_base_url(document: HTMLElement, page_url: str) -> str:
171168
return page_url
172169

173170

174-
def _clean_url_path_part(part: str) -> str:
175-
"""
176-
Clean a "part" of a URL path (i.e. after splitting on "@" characters).
177-
"""
178-
# We unquote prior to quoting to make sure nothing is double quoted.
179-
return urllib.parse.quote(urllib.parse.unquote(part))
180-
181-
182-
def _clean_file_url_path(part: str) -> str:
183-
"""
184-
Clean the first part of a URL path that corresponds to a local
185-
filesystem path (i.e. the first part after splitting on "@" characters).
186-
"""
187-
# We unquote prior to quoting to make sure nothing is double quoted.
188-
# Also, on Windows the path part might contain a drive letter which
189-
# should not be quoted. On Linux where drive letters do not
190-
# exist, the colon should be quoted. We rely on urllib.request
191-
# to do the right thing here.
192-
return urllib.request.pathname2url(urllib.request.url2pathname(part))
193-
194-
195-
# percent-encoded: /
_reserved_chars_re = re.compile("(@|%2F)", re.IGNORECASE)


def _clean_url_path(path: str, is_local_path: bool) -> str:
    """
    Clean the path portion of a URL.
    """
    clean_func = _clean_file_url_path if is_local_path else _clean_url_path_part

    # Split on the reserved characters prior to cleaning so that
    # revision strings in VCS URLs are properly preserved.
    parts = _reserved_chars_re.split(path)

    cleaned_parts = []
    # re.split() with a capturing group alternates text and separator
    # pieces, so walk them two at a time as (text, separator) pairs,
    # treating the missing separator after the final piece as "".
    for index in range(0, len(parts), 2):
        cleaned_parts.append(clean_func(parts[index]))
        reserved = parts[index + 1] if index + 1 < len(parts) else ""
        # Normalize %xx escapes (e.g. %2f -> %2F)
        cleaned_parts.append(reserved.upper())

    return "".join(cleaned_parts)
219-
220-
221-
def _clean_link(url: str) -> str:
    """
    Make sure a link is fully quoted.
    For example, if ' ' occurs in the URL, it will be replaced with "%20",
    and without double-quoting other characters.
    """
    # Split the URL into parts according to the general structure
    # `scheme://netloc/path;parameters?query#fragment`.
    parsed = urllib.parse.urlparse(url)
    # An empty netloc means the URL refers to a local filesystem path.
    cleaned_path = _clean_url_path(parsed.path, is_local_path=not parsed.netloc)
    return urllib.parse.urlunparse(parsed._replace(path=cleaned_path))
234-
235-
236-
def _create_link_from_element(
    anchor: HTMLElement,
    page_url: str,
    base_url: str,
) -> Optional[Link]:
    """
    Convert an anchor element in a simple repository page to a Link.
    """
    href = anchor.get("href")
    if not href:
        # Anchors without an href cannot produce a usable link.
        return None

    return Link(
        _clean_link(urllib.parse.urljoin(base_url, href)),
        comes_from=page_url,
        requires_python=anchor.get("data-requires-python"),
        yanked_reason=anchor.get("data-yanked"),
        dist_info_metadata=anchor.get("data-dist-info-metadata"),
    )
262-
263-
264171
class CacheablePageContent:
265172
def __init__(self, page: "HTMLPage") -> None:
266173
assert page.cache_link_parsing
@@ -309,11 +216,7 @@ def parse_links(page: "HTMLPage") -> Iterable[Link]:
309216
url = page.url
310217
base_url = _determine_base_url(document, url)
311218
for anchor in document.findall(".//a"):
312-
link = _create_link_from_element(
313-
anchor,
314-
page_url=url,
315-
base_url=base_url,
316-
)
219+
link = Link.from_element(anchor, page_url=url, base_url=base_url)
317220
if link is None:
318221
continue
319222
yield link

src/pip/_internal/models/link.py

+97-1
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,17 @@
11
import functools
2+
import itertools
23
import logging
34
import os
45
import posixpath
56
import re
67
import urllib.parse
8+
import xml.etree.ElementTree
79
from typing import TYPE_CHECKING, Dict, List, NamedTuple, Optional, Tuple, Union
810

911
from pip._internal.utils.filetypes import WHEEL_EXTENSION
1012
from pip._internal.utils.hashes import Hashes
1113
from pip._internal.utils.misc import (
14+
pairwise,
1215
redact_auth_from_url,
1316
split_auth_from_netloc,
1417
splitext,
@@ -22,9 +25,74 @@
2225
logger = logging.getLogger(__name__)
2326

2427

28+
HTMLElement = xml.etree.ElementTree.Element
29+
30+
2531
_SUPPORTED_HASHES = ("sha1", "sha224", "sha384", "sha256", "sha512", "md5")
2632

2733

34+
def _clean_url_path_part(part: str) -> str:
35+
"""
36+
Clean a "part" of a URL path (i.e. after splitting on "@" characters).
37+
"""
38+
# We unquote prior to quoting to make sure nothing is double quoted.
39+
return urllib.parse.quote(urllib.parse.unquote(part))
40+
41+
42+
def _clean_file_url_path(part: str) -> str:
43+
"""
44+
Clean the first part of a URL path that corresponds to a local
45+
filesystem path (i.e. the first part after splitting on "@" characters).
46+
"""
47+
# We unquote prior to quoting to make sure nothing is double quoted.
48+
# Also, on Windows the path part might contain a drive letter which
49+
# should not be quoted. On Linux where drive letters do not
50+
# exist, the colon should be quoted. We rely on urllib.request
51+
# to do the right thing here.
52+
return urllib.request.pathname2url(urllib.request.url2pathname(part))
53+
54+
55+
# percent-encoded: /
_reserved_chars_re = re.compile("(@|%2F)", re.IGNORECASE)


def _clean_url_path(path: str, is_local_path: bool) -> str:
    """
    Clean the path portion of a URL.
    """
    if is_local_path:
        clean_func = _clean_file_url_path
    else:
        clean_func = _clean_url_path_part

    # Split on the reserved characters prior to cleaning so that
    # revision strings in VCS URLs are properly preserved.
    pieces = _reserved_chars_re.split(path)

    result = []
    # The split alternates text and captured separator pieces; consume
    # them in disjoint (text, separator) pairs, padding the final pair
    # with an empty separator.
    for i in range(0, len(pieces), 2):
        result.append(clean_func(pieces[i]))
        sep = pieces[i + 1] if i + 1 < len(pieces) else ""
        # Normalize %xx escapes (e.g. %2f -> %2F)
        result.append(sep.upper())

    return "".join(result)
79+
80+
81+
def _ensure_quoted_url(url: str) -> str:
    """
    Make sure a link is fully quoted.
    For example, if ' ' occurs in the URL, it will be replaced with "%20",
    and without double-quoting other characters.
    """
    # Split the URL into parts according to the general structure
    # `scheme://netloc/path;parameters?query#fragment`.
    parsed = urllib.parse.urlparse(url)
    # An empty netloc means the URL refers to a local filesystem path.
    is_local_path = not parsed.netloc
    quoted_path = _clean_url_path(parsed.path, is_local_path=is_local_path)
    return urllib.parse.urlunparse(parsed._replace(path=quoted_path))
94+
95+
2896
class Link(KeyBasedCompareMixin):
2997
"""Represents a parsed link from a Package Index's simple URL"""
3098

@@ -87,6 +155,35 @@ def __init__(
87155

88156
self.cache_link_parsing = cache_link_parsing
89157

158+
@classmethod
def from_element(
    cls,
    anchor: HTMLElement,
    page_url: str,
    base_url: str,
) -> Optional["Link"]:
    """
    Convert an anchor element in a simple repository page to a Link.

    :param anchor: the parsed ``<a>`` element of the page.
    :param page_url: the URL of the page the anchor was found on;
        recorded as the link's ``comes_from``.
    :param base_url: the base URL that a relative ``href`` is resolved
        against.
    :return: a Link, or None when the anchor has no usable ``href``.
    """
    href = anchor.get("href")
    if not href:
        return None

    url = _ensure_quoted_url(urllib.parse.urljoin(base_url, href))
    pyrequire = anchor.get("data-requires-python")
    yanked_reason = anchor.get("data-yanked")
    dist_info_metadata = anchor.get("data-dist-info-metadata")

    # Instantiate via cls (not Link) so that subclasses using this
    # alternate constructor get instances of themselves.
    return cls(
        url,
        comes_from=page_url,
        requires_python=pyrequire,
        yanked_reason=yanked_reason,
        dist_info_metadata=dist_info_metadata,
    )
186+
90187
def __str__(self) -> str:
91188
if self.requires_python:
92189
rp = f" (requires-python:{self.requires_python})"
@@ -169,7 +266,6 @@ def subdirectory_fragment(self) -> Optional[str]:
169266
return None
170267
return match.group(1)
171268

172-
# FIXME: retrieve all the `re.compile` anchor tags when the Link is constructed!!
173269
_hash_re = re.compile(
174270
r"({choices})=([a-f0-9]+)".format(choices="|".join(_SUPPORTED_HASHES))
175271
)

0 commit comments

Comments
 (0)