Skip to content

Commit 6d3c6d3

Browse files
move url cleaning to link.py
1 parent 80187a5 commit 6d3c6d3

File tree

2 files changed

+100
-101
lines changed

2 files changed

+100
-101
lines changed

src/pip/_internal/index/collector.py

+3-100
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,8 @@
88
import itertools
99
import logging
1010
import os
11-
import re
1211
import urllib.parse
1312
import urllib.request
14-
import xml.etree.ElementTree
1513
from optparse import Values
1614
from typing import (
1715
Callable,
@@ -29,19 +27,18 @@
2927
from pip._vendor.requests.exceptions import RetryError, SSLError
3028

3129
from pip._internal.exceptions import NetworkConnectionError
32-
from pip._internal.models.link import Link
30+
from pip._internal.models.link import HTMLElement, Link
3331
from pip._internal.models.search_scope import SearchScope
3432
from pip._internal.network.session import PipSession
3533
from pip._internal.network.utils import raise_for_status
3634
from pip._internal.utils.filetypes import is_archive_file
37-
from pip._internal.utils.misc import pairwise, redact_auth_from_url
35+
from pip._internal.utils.misc import redact_auth_from_url
3836
from pip._internal.vcs import vcs
3937

4038
from .sources import CandidatesFromPage, LinkSource, build_source
4139

4240
logger = logging.getLogger(__name__)
4341

44-
HTMLElement = xml.etree.ElementTree.Element
4542
ResponseHeaders = MutableMapping[str, str]
4643

4744

@@ -171,96 +168,6 @@ def _determine_base_url(document: HTMLElement, page_url: str) -> str:
171168
return page_url
172169

173170

174-
def _clean_url_path_part(part: str) -> str:
175-
"""
176-
Clean a "part" of a URL path (i.e. after splitting on "@" characters).
177-
"""
178-
# We unquote prior to quoting to make sure nothing is double quoted.
179-
return urllib.parse.quote(urllib.parse.unquote(part))
180-
181-
182-
def _clean_file_url_path(part: str) -> str:
183-
"""
184-
Clean the first part of a URL path that corresponds to a local
185-
filesystem path (i.e. the first part after splitting on "@" characters).
186-
"""
187-
# We unquote prior to quoting to make sure nothing is double quoted.
188-
# Also, on Windows the path part might contain a drive letter which
189-
# should not be quoted. On Linux where drive letters do not
190-
# exist, the colon should be quoted. We rely on urllib.request
191-
# to do the right thing here.
192-
return urllib.request.pathname2url(urllib.request.url2pathname(part))
193-
194-
195-
# percent-encoded: /
_reserved_chars_re = re.compile("(@|%2F)", re.IGNORECASE)


def _clean_url_path(path: str, is_local_path: bool) -> str:
    """
    Clean the path portion of a URL.
    """
    clean_func = _clean_file_url_path if is_local_path else _clean_url_path_part

    # Split on the reserved characters prior to cleaning so that
    # revision strings in VCS URLs are properly preserved.
    parts = _reserved_chars_re.split(path)

    cleaned_parts = []
    # re.split() with a capturing group alternates text and separator
    # pieces, so walk them two at a time as (text, separator) pairs,
    # treating the missing separator after the final piece as "".
    for index in range(0, len(parts), 2):
        cleaned_parts.append(clean_func(parts[index]))
        reserved = parts[index + 1] if index + 1 < len(parts) else ""
        # Normalize %xx escapes (e.g. %2f -> %2F)
        cleaned_parts.append(reserved.upper())

    return "".join(cleaned_parts)
219-
220-
221-
def _clean_link(url: str) -> str:
    """
    Make sure a link is fully quoted.
    For example, if ' ' occurs in the URL, it will be replaced with "%20",
    and without double-quoting other characters.
    """
    # Split the URL into parts according to the general structure
    # `scheme://netloc/path;parameters?query#fragment`.
    parsed = urllib.parse.urlparse(url)
    # An empty netloc means the URL refers to a local filesystem path.
    cleaned_path = _clean_url_path(parsed.path, is_local_path=not parsed.netloc)
    return urllib.parse.urlunparse(parsed._replace(path=cleaned_path))
234-
235-
236-
def _create_link_from_element(
    anchor: HTMLElement,
    page_url: str,
    base_url: str,
) -> Optional[Link]:
    """
    Convert an anchor element in a simple repository page to a Link.
    """
    href = anchor.get("href")
    if not href:
        # Anchors without an href cannot produce a usable link.
        return None

    return Link(
        _clean_link(urllib.parse.urljoin(base_url, href)),
        comes_from=page_url,
        requires_python=anchor.get("data-requires-python"),
        yanked_reason=anchor.get("data-yanked"),
        dist_info_metadata=anchor.get("data-dist-info-metadata"),
    )
262-
263-
264171
class CacheablePageContent:
265172
def __init__(self, page: "HTMLPage") -> None:
266173
assert page.cache_link_parsing
@@ -309,11 +216,7 @@ def parse_links(page: "HTMLPage") -> Iterable[Link]:
309216
url = page.url
310217
base_url = _determine_base_url(document, url)
311218
for anchor in document.findall(".//a"):
312-
link = _create_link_from_element(
313-
anchor,
314-
page_url=url,
315-
base_url=base_url,
316-
)
219+
link = Link.from_element(anchor, page_url=url, base_url=base_url)
317220
if link is None:
318221
continue
319222
yield link

src/pip/_internal/models/link.py

+97-1
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,17 @@
11
import functools
2+
import itertools
23
import logging
34
import os
45
import posixpath
56
import re
67
import urllib.parse
8+
import xml.etree.ElementTree
79
from typing import TYPE_CHECKING, Dict, List, NamedTuple, Optional, Tuple, Union
810

911
from pip._internal.utils.filetypes import WHEEL_EXTENSION
1012
from pip._internal.utils.hashes import Hashes
1113
from pip._internal.utils.misc import (
14+
pairwise,
1215
redact_auth_from_url,
1316
split_auth_from_netloc,
1417
splitext,
@@ -22,9 +25,74 @@
2225
logger = logging.getLogger(__name__)
2326

2427

28+
HTMLElement = xml.etree.ElementTree.Element
29+
30+
2531
_SUPPORTED_HASHES = ("sha1", "sha224", "sha384", "sha256", "sha512", "md5")
2632

2733

34+
def _clean_url_path_part(part: str) -> str:
35+
"""
36+
Clean a "part" of a URL path (i.e. after splitting on "@" characters).
37+
"""
38+
# We unquote prior to quoting to make sure nothing is double quoted.
39+
return urllib.parse.quote(urllib.parse.unquote(part))
40+
41+
42+
def _clean_file_url_path(part: str) -> str:
43+
"""
44+
Clean the first part of a URL path that corresponds to a local
45+
filesystem path (i.e. the first part after splitting on "@" characters).
46+
"""
47+
# We unquote prior to quoting to make sure nothing is double quoted.
48+
# Also, on Windows the path part might contain a drive letter which
49+
# should not be quoted. On Linux where drive letters do not
50+
# exist, the colon should be quoted. We rely on urllib.request
51+
# to do the right thing here.
52+
return urllib.request.pathname2url(urllib.request.url2pathname(part))
53+
54+
55+
# percent-encoded: /
_reserved_chars_re = re.compile("(@|%2F)", re.IGNORECASE)


def _clean_url_path(path: str, is_local_path: bool) -> str:
    """
    Clean the path portion of a URL.
    """
    if is_local_path:
        clean_func = _clean_file_url_path
    else:
        clean_func = _clean_url_path_part

    # Split on the reserved characters prior to cleaning so that
    # revision strings in VCS URLs are properly preserved.
    pieces = _reserved_chars_re.split(path)

    result = []
    # The split alternates text and captured separator pieces; consume
    # them in disjoint (text, separator) pairs, padding the final pair
    # with an empty separator.
    for i in range(0, len(pieces), 2):
        result.append(clean_func(pieces[i]))
        sep = pieces[i + 1] if i + 1 < len(pieces) else ""
        # Normalize %xx escapes (e.g. %2f -> %2F)
        result.append(sep.upper())

    return "".join(result)
79+
80+
81+
def _ensure_quoted_url(url: str) -> str:
    """
    Make sure a link is fully quoted.
    For example, if ' ' occurs in the URL, it will be replaced with "%20",
    and without double-quoting other characters.
    """
    # Split the URL into parts according to the general structure
    # `scheme://netloc/path;parameters?query#fragment`.
    parsed = urllib.parse.urlparse(url)
    # An empty netloc means the URL refers to a local filesystem path.
    is_local_path = not parsed.netloc
    quoted_path = _clean_url_path(parsed.path, is_local_path=is_local_path)
    return urllib.parse.urlunparse(parsed._replace(path=quoted_path))
94+
95+
2896
class Link(KeyBasedCompareMixin):
2997
"""Represents a parsed link from a Package Index's simple URL"""
3098

@@ -87,6 +155,35 @@ def __init__(
87155

88156
self.cache_link_parsing = cache_link_parsing
89157

158+
@classmethod
def from_element(
    cls,
    anchor: HTMLElement,
    page_url: str,
    base_url: str,
) -> Optional["Link"]:
    """
    Convert an anchor element in a simple repository page to a Link.

    :param anchor: the parsed ``<a>`` element of the page.
    :param page_url: the URL of the page the anchor was found on;
        recorded as the link's ``comes_from``.
    :param base_url: the base URL that a relative ``href`` is resolved
        against.
    :return: a Link, or None when the anchor has no usable ``href``.
    """
    href = anchor.get("href")
    if not href:
        return None

    url = _ensure_quoted_url(urllib.parse.urljoin(base_url, href))
    pyrequire = anchor.get("data-requires-python")
    yanked_reason = anchor.get("data-yanked")
    dist_info_metadata = anchor.get("data-dist-info-metadata")

    # Instantiate via cls (not Link) so that subclasses using this
    # alternate constructor get instances of themselves.
    return cls(
        url,
        comes_from=page_url,
        requires_python=pyrequire,
        yanked_reason=yanked_reason,
        dist_info_metadata=dist_info_metadata,
    )
186+
90187
def __str__(self) -> str:
91188
if self.requires_python:
92189
rp = f" (requires-python:{self.requires_python})"
@@ -169,7 +266,6 @@ def subdirectory_fragment(self) -> Optional[str]:
169266
return None
170267
return match.group(1)
171268

172-
# FIXME: retrieve all the `re.compile` anchor tags when the Link is constructed!!
173269
_hash_re = re.compile(
174270
r"({choices})=([a-f0-9]+)".format(choices="|".join(_SUPPORTED_HASHES))
175271
)

0 commit comments

Comments
 (0)