@@ -8,10 +8,8 @@
 import itertools
 import logging
 import os
-import re
 import urllib.parse
 import urllib.request
-import xml.etree.ElementTree
 from html.parser import HTMLParser
 from optparse import Values
 from typing import (
@@ -33,12 +31,12 @@
 from pip._vendor.requests.exceptions import RetryError, SSLError

 from pip._internal.exceptions import NetworkConnectionError
-from pip._internal.models.link import Link
+from pip._internal.models.link import HTMLElement, Link
 from pip._internal.models.search_scope import SearchScope
 from pip._internal.network.session import PipSession
 from pip._internal.network.utils import raise_for_status
 from pip._internal.utils.filetypes import is_archive_file
-from pip._internal.utils.misc import pairwise, redact_auth_from_url
+from pip._internal.utils.misc import redact_auth_from_url
 from pip._internal.vcs import vcs

 from .sources import CandidatesFromPage, LinkSource, build_source
@@ -50,7 +48,6 @@

 logger = logging.getLogger(__name__)

-HTMLElement = xml.etree.ElementTree.Element
 ResponseHeaders = MutableMapping[str, str]


@@ -182,94 +179,6 @@ def _determine_base_url(document: HTMLElement, page_url: str) -> str:
     return page_url


-def _clean_url_path_part(part: str) -> str:
-    """
-    Clean a "part" of a URL path (i.e. after splitting on "@" characters).
-    """
-    # We unquote prior to quoting to make sure nothing is double quoted.
-    return urllib.parse.quote(urllib.parse.unquote(part))
-
-
-def _clean_file_url_path(part: str) -> str:
-    """
-    Clean the first part of a URL path that corresponds to a local
-    filesystem path (i.e. the first part after splitting on "@" characters).
-    """
-    # We unquote prior to quoting to make sure nothing is double quoted.
-    # Also, on Windows the path part might contain a drive letter which
-    # should not be quoted. On Linux where drive letters do not
-    # exist, the colon should be quoted. We rely on urllib.request
-    # to do the right thing here.
-    return urllib.request.pathname2url(urllib.request.url2pathname(part))
-
-
-# percent-encoded: /
-_reserved_chars_re = re.compile("(@|%2F)", re.IGNORECASE)
-
-
-def _clean_url_path(path: str, is_local_path: bool) -> str:
-    """
-    Clean the path portion of a URL.
-    """
-    if is_local_path:
-        clean_func = _clean_file_url_path
-    else:
-        clean_func = _clean_url_path_part
-
-    # Split on the reserved characters prior to cleaning so that
-    # revision strings in VCS URLs are properly preserved.
-    parts = _reserved_chars_re.split(path)
-
-    cleaned_parts = []
-    for to_clean, reserved in pairwise(itertools.chain(parts, [""])):
-        cleaned_parts.append(clean_func(to_clean))
-        # Normalize %xx escapes (e.g. %2f -> %2F)
-        cleaned_parts.append(reserved.upper())
-
-    return "".join(cleaned_parts)
-
-
-def _clean_link(url: str) -> str:
-    """
-    Make sure a link is fully quoted.
-    For example, if ' ' occurs in the URL, it will be replaced with "%20",
-    and without double-quoting other characters.
-    """
-    # Split the URL into parts according to the general structure
-    # `scheme://netloc/path;parameters?query#fragment`.
-    result = urllib.parse.urlparse(url)
-    # If the netloc is empty, then the URL refers to a local filesystem path.
-    is_local_path = not result.netloc
-    path = _clean_url_path(result.path, is_local_path=is_local_path)
-    return urllib.parse.urlunparse(result._replace(path=path))
-
-
-def _create_link_from_element(
-    element_attribs: Dict[str, Optional[str]],
-    page_url: str,
-    base_url: str,
-) -> Optional[Link]:
-    """
-    Convert an anchor element's attributes in a simple repository page to a Link.
-    """
-    href = element_attribs.get("href")
-    if not href:
-        return None
-
-    url = _clean_link(urllib.parse.urljoin(base_url, href))
-    pyrequire = element_attribs.get("data-requires-python")
-    yanked_reason = element_attribs.get("data-yanked")
-
-    link = Link(
-        url,
-        comes_from=page_url,
-        requires_python=pyrequire,
-        yanked_reason=yanked_reason,
-    )
-
-    return link
-
-
 class CacheablePageContent:
     def __init__(self, page: "HTMLPage") -> None:
         assert page.cache_link_parsing
@@ -326,7 +235,7 @@ def _parse_links_html5lib(page: "HTMLPage") -> Iterable[Link]:
     url = page.url
     base_url = _determine_base_url(document, url)
     for anchor in document.findall(".//a"):
-        link = _create_link_from_element(
+        link = Link.from_element(
             anchor.attrib,
             page_url=url,
             base_url=base_url,
@@ -353,11 +262,7 @@ def parse_links(page: "HTMLPage", use_deprecated_html5lib: bool) -> Iterable[Link]:
     url = page.url
     base_url = parser.base_url or url
     for anchor in parser.anchors:
-        link = _create_link_from_element(
-            anchor,
-            page_url=url,
-            base_url=base_url,
-        )
+        link = Link.from_element(anchor, page_url=url, base_url=base_url)
         if link is None:
             continue
         yield link
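
For reference, here is a minimal, self-contained sketch of the `Link.from_element` classmethod that the call sites above switch to. Its body mirrors the `_create_link_from_element` helper deleted in this diff; the `Link` constructor and the `_clean_link` stand-in below are simplified stubs for illustration, not pip's actual implementations in `pip/_internal/models/link.py`.

import urllib.parse
from typing import Dict, Optional


def _clean_link(url: str) -> str:
    # Stand-in for the removed helper: re-quote the URL path without
    # double-quoting characters that are already percent-encoded.
    parts = urllib.parse.urlparse(url)
    path = urllib.parse.quote(urllib.parse.unquote(parts.path))
    return urllib.parse.urlunparse(parts._replace(path=path))


class Link:
    # Simplified stub of pip's Link: just stores the attributes it is given.
    def __init__(
        self,
        url: str,
        comes_from: Optional[str] = None,
        requires_python: Optional[str] = None,
        yanked_reason: Optional[str] = None,
    ) -> None:
        self.url = url
        self.comes_from = comes_from
        self.requires_python = requires_python
        self.yanked_reason = yanked_reason

    @classmethod
    def from_element(
        cls,
        anchor_attribs: Dict[str, Optional[str]],
        page_url: str,
        base_url: str,
    ) -> Optional["Link"]:
        """Convert an anchor element on a simple repository page to a Link."""
        href = anchor_attribs.get("href")
        if not href:
            return None

        url = _clean_link(urllib.parse.urljoin(base_url, href))
        pyrequire = anchor_attribs.get("data-requires-python")
        yanked_reason = anchor_attribs.get("data-yanked")

        return cls(
            url,
            comes_from=page_url,
            requires_python=pyrequire,
            yanked_reason=yanked_reason,
        )


# Usage, matching the parse_links() call site in the diff:
link = Link.from_element(
    {"href": "pip 22.0.tar.gz", "data-requires-python": ">=3.7"},
    page_url="https://pypi.org/simple/pip/",
    base_url="https://pypi.org/simple/pip/",
)
assert link is not None and link.url.endswith("pip%2022.0.tar.gz")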