  8 |   8 | import itertools
  9 |   9 | import logging
 10 |  10 | import os
 11 |     | -import re
 12 |  11 | import urllib.parse
 13 |  12 | import urllib.request
 14 |     | -import xml.etree.ElementTree
 15 |  13 | from optparse import Values
 16 |  14 | from typing import (
 17 |  15 |     Callable,

 29 |  27 | from pip._vendor.requests.exceptions import RetryError, SSLError
 30 |  28 |
 31 |  29 | from pip._internal.exceptions import NetworkConnectionError
 32 |     | -from pip._internal.models.link import Link
    |  30 | +from pip._internal.models.link import HTMLElement, Link
 33 |  31 | from pip._internal.models.search_scope import SearchScope
 34 |  32 | from pip._internal.network.session import PipSession
 35 |  33 | from pip._internal.network.utils import raise_for_status
 36 |  34 | from pip._internal.utils.filetypes import is_archive_file
 37 |     | -from pip._internal.utils.misc import pairwise, redact_auth_from_url
    |  35 | +from pip._internal.utils.misc import redact_auth_from_url
 38 |  36 | from pip._internal.vcs import vcs
 39 |  37 |
 40 |  38 | from .sources import CandidatesFromPage, LinkSource, build_source
 41 |  39 |
 42 |  40 | logger = logging.getLogger(__name__)
 43 |  41 |
 44 |     | -HTMLElement = xml.etree.ElementTree.Element
 45 |  42 | ResponseHeaders = MutableMapping[str, str]
 46 |  43 |
 47 |  44 |
@@ -171,96 +168,6 @@ def _determine_base_url(document: HTMLElement, page_url: str) -> str:
171 | 168 |     return page_url
172 | 169 |
173 | 170 |
174 |     | -def _clean_url_path_part(part: str) -> str:
175 |     | -    """
176 |     | -    Clean a "part" of a URL path (i.e. after splitting on "@" characters).
177 |     | -    """
178 |     | -    # We unquote prior to quoting to make sure nothing is double quoted.
179 |     | -    return urllib.parse.quote(urllib.parse.unquote(part))
180 |     | -
181 |     | -
182 |     | -def _clean_file_url_path(part: str) -> str:
183 |     | -    """
184 |     | -    Clean the first part of a URL path that corresponds to a local
185 |     | -    filesystem path (i.e. the first part after splitting on "@" characters).
186 |     | -    """
187 |     | -    # We unquote prior to quoting to make sure nothing is double quoted.
188 |     | -    # Also, on Windows the path part might contain a drive letter which
189 |     | -    # should not be quoted. On Linux where drive letters do not
190 |     | -    # exist, the colon should be quoted. We rely on urllib.request
191 |     | -    # to do the right thing here.
192 |     | -    return urllib.request.pathname2url(urllib.request.url2pathname(part))
193 |     | -
194 |     | -
195 |     | -# percent-encoded: /
196 |     | -_reserved_chars_re = re.compile("(@|%2F)", re.IGNORECASE)
197 |     | -
198 |     | -
199 |     | -def _clean_url_path(path: str, is_local_path: bool) -> str:
200 |     | -    """
201 |     | -    Clean the path portion of a URL.
202 |     | -    """
203 |     | -    if is_local_path:
204 |     | -        clean_func = _clean_file_url_path
205 |     | -    else:
206 |     | -        clean_func = _clean_url_path_part
207 |     | -
208 |     | -    # Split on the reserved characters prior to cleaning so that
209 |     | -    # revision strings in VCS URLs are properly preserved.
210 |     | -    parts = _reserved_chars_re.split(path)
211 |     | -
212 |     | -    cleaned_parts = []
213 |     | -    for to_clean, reserved in pairwise(itertools.chain(parts, [""])):
214 |     | -        cleaned_parts.append(clean_func(to_clean))
215 |     | -        # Normalize %xx escapes (e.g. %2f -> %2F)
216 |     | -        cleaned_parts.append(reserved.upper())
217 |     | -
218 |     | -    return "".join(cleaned_parts)
219 |     | -
220 |     | -
221 |     | -def _clean_link(url: str) -> str:
222 |     | -    """
223 |     | -    Make sure a link is fully quoted.
224 |     | -    For example, if ' ' occurs in the URL, it will be replaced with "%20",
225 |     | -    and without double-quoting other characters.
226 |     | -    """
227 |     | -    # Split the URL into parts according to the general structure
228 |     | -    # `scheme://netloc/path;parameters?query#fragment`.
229 |     | -    result = urllib.parse.urlparse(url)
230 |     | -    # If the netloc is empty, then the URL refers to a local filesystem path.
231 |     | -    is_local_path = not result.netloc
232 |     | -    path = _clean_url_path(result.path, is_local_path=is_local_path)
233 |     | -    return urllib.parse.urlunparse(result._replace(path=path))
234 |     | -
235 |     | -
236 |     | -def _create_link_from_element(
237 |     | -    anchor: HTMLElement,
238 |     | -    page_url: str,
239 |     | -    base_url: str,
240 |     | -) -> Optional[Link]:
241 |     | -    """
242 |     | -    Convert an anchor element in a simple repository page to a Link.
243 |     | -    """
244 |     | -    href = anchor.get("href")
245 |     | -    if not href:
246 |     | -        return None
247 |     | -
248 |     | -    url = _clean_link(urllib.parse.urljoin(base_url, href))
249 |     | -    pyrequire = anchor.get("data-requires-python")
250 |     | -    yanked_reason = anchor.get("data-yanked")
251 |     | -    dist_info_metadata = anchor.get("data-dist-info-metadata")
252 |     | -
253 |     | -    link = Link(
254 |     | -        url,
255 |     | -        comes_from=page_url,
256 |     | -        requires_python=pyrequire,
257 |     | -        yanked_reason=yanked_reason,
258 |     | -        dist_info_metadata=dist_info_metadata,
259 |     | -    )
260 |     | -
261 |     | -    return link
262 |     | -
263 |     | -
264 | 171 | class CacheablePageContent:
265 | 172 |     def __init__(self, page: "HTMLPage") -> None:
266 | 173 |         assert page.cache_link_parsing
@@ -309,11 +216,7 @@ def parse_links(page: "HTMLPage") -> Iterable[Link]:
309 | 216 |     url = page.url
310 | 217 |     base_url = _determine_base_url(document, url)
311 | 218 |     for anchor in document.findall(".//a"):
312 |     | -        link = _create_link_from_element(
313 |     | -            anchor,
314 |     | -            page_url=url,
315 |     | -            base_url=base_url,
316 |     | -        )
    | 219 | +        link = Link.from_element(anchor, page_url=url, base_url=base_url)
317 | 220 |         if link is None:
318 | 221 |             continue
319 | 222 |         yield link
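
Taken together, the two hunks move the anchor-to-`Link` conversion, along with the URL-cleaning helpers it depends on, out of the collector and into `pip._internal.models.link`; the updated `HTMLElement` import and the new `Link.from_element` call site both point there. Below is a minimal sketch of what the relocated classmethod presumably looks like, reconstructed from the removed `_create_link_from_element` helper. The `Link` dataclass is a simplified stand-in, not pip's real class, and plain `urljoin` stands in for the removed `_clean_link` pipeline:

```python
import urllib.parse
import xml.etree.ElementTree
from dataclasses import dataclass
from typing import Optional

HTMLElement = xml.etree.ElementTree.Element


@dataclass
class Link:
    """Simplified stand-in for pip._internal.models.link.Link."""

    url: str
    comes_from: Optional[str] = None
    requires_python: Optional[str] = None
    yanked_reason: Optional[str] = None
    dist_info_metadata: Optional[str] = None

    @classmethod
    def from_element(
        cls, anchor: HTMLElement, page_url: str, base_url: str
    ) -> Optional["Link"]:
        """Convert an anchor element in a simple repository page to a Link."""
        href = anchor.get("href")
        if not href:
            return None
        # The real classmethod presumably routes the joined URL through the
        # relocated _clean_link() helper; plain urljoin stands in here.
        url = urllib.parse.urljoin(base_url, href)
        return cls(
            url,
            comes_from=page_url,
            requires_python=anchor.get("data-requires-python"),
            yanked_reason=anchor.get("data-yanked"),
            dist_info_metadata=anchor.get("data-dist-info-metadata"),
        )


anchor = xml.etree.ElementTree.fromstring(
    '<a href="pip-22.0.tar.gz" data-requires-python="&gt;=3.7">pip-22.0</a>'
)
link = Link.from_element(
    anchor,
    page_url="https://pypi.org/simple/pip/",
    base_url="https://pypi.org/simple/pip/",
)
assert link is not None
print(link.url)              # https://pypi.org/simple/pip/pip-22.0.tar.gz
print(link.requires_python)  # >=3.7
```

Putting the conversion on the model keeps `collector.py` focused on fetching and parsing pages, and lets any parser construct `Link` objects without importing collector internals.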
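One subtlety in the removed helpers is easy to miss in review: `_clean_url_path` splits the path on `@` and `%2F` before re-quoting, so revision strings in VCS URLs (`...@v1.0`) survive the round trip. It also relies on pip's `pairwise` utility, which pairs disjoint elements, `(s0, s1), (s2, s3), ...`, not overlapping ones like `itertools.pairwise`. Here is a self-contained repro of the remote-URL branch, with the removed code copied verbatim and a `zip_longest`-based stand-in assumed for `pip._internal.utils.misc.pairwise`:

```python
import itertools
import re
import urllib.parse
from itertools import zip_longest

# Verbatim from the removed code: matches "@" or a percent-encoded "/".
_reserved_chars_re = re.compile("(@|%2F)", re.IGNORECASE)


def pairwise(iterable):
    # Stand-in assumed consistent with pip._internal.utils.misc.pairwise:
    # pairs *disjoint* elements, s -> (s0, s1), (s2, s3), ...
    iterable = iter(iterable)
    return zip_longest(iterable, iterable)


def _clean_url_path_part(part: str) -> str:
    # Unquote before quoting so nothing is double-quoted.
    return urllib.parse.quote(urllib.parse.unquote(part))


def _clean_url_path(path: str) -> str:
    # The is_local_path branch is omitted; this is the remote-URL path.
    # Split on the reserved characters first so that revision strings in
    # VCS URLs are preserved rather than mangled by re-quoting.
    parts = _reserved_chars_re.split(path)
    cleaned_parts = []
    for to_clean, reserved in pairwise(itertools.chain(parts, [""])):
        cleaned_parts.append(_clean_url_path_part(to_clean))
        # Normalize %xx escapes (e.g. %2f -> %2F).
        cleaned_parts.append(reserved.upper())
    return "".join(cleaned_parts)


print(_clean_url_path("/repo name@v1.0"))  # /repo%20name@v1.0
print(_clean_url_path("/a%2fb"))           # /a%2Fb
```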