Skip to content

Carry anchor text with link for parsing #6329

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions news/6272.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Parse text in HTML anchors when collecting links to conform with PEP 503.
64 changes: 35 additions & 29 deletions src/pip/_internal/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
from pip._internal.utils.logging import indent_log
from pip._internal.utils.misc import (
ARCHIVE_EXTENSIONS, SUPPORTED_EXTENSIONS, WHEEL_EXTENSION, normalize_path,
redact_password_from_url,
splitext, redact_password_from_url,
)
from pip._internal.utils.packaging import check_requires_python
from pip._internal.utils.typing import MYPY_CHECK_RUNNING
Expand All @@ -54,6 +54,7 @@
SecureOrigin = Tuple[str, str, Optional[str]]
BuildTag = Tuple[Any, ...] # either empty tuple or Tuple[int, str]
CandidateSortingKey = Tuple[int, _BaseVersion, BuildTag, Optional[int]]
Anchor = Tuple[str, Link]

__all__ = ['FormatControl', 'PackageFinder']

Expand Down Expand Up @@ -568,18 +569,14 @@ def find_all_candidates(self, project_name):
This checks index_urls and find_links.
All versions found are returned as an InstallationCandidate list.

See _link_package_versions for details on which files are accepted
See _anchor_package_versions for details on which files are accepted
"""
index_locations = self._get_index_urls_locations(project_name)
index_file_loc, index_url_loc = self._sort_locations(index_locations)
fl_file_loc, fl_url_loc = self._sort_locations(
self.find_links, expand_dir=True,
)

file_locations = (Link(url) for url in itertools.chain(
index_file_loc, fl_file_loc,
))

# We trust every url that the user has given us whether it was given
# via --index-url or --find-links.
# We want to filter out anything which does not have a secure origin.
Expand All @@ -600,9 +597,11 @@ def find_all_candidates(self, project_name):
canonical_name = canonicalize_name(project_name)
formats = self.format_control.get_allowed_formats(canonical_name)
search = Search(project_name, canonical_name, formats)

find_links_locations = (Link(url, '-f') for url in self.find_links)
find_links_versions = self._package_versions(
# We trust every directly linked archive in find_links
(Link(url, '-f') for url in self.find_links),
((link.filename, link) for link in find_links_locations),
search
)

Expand All @@ -611,10 +610,15 @@ def find_all_candidates(self, project_name):
logger.debug('Analyzing links from page %s', page.url)
with indent_log():
page_versions.extend(
self._package_versions(page.iter_links(), search)
self._package_versions(page.iter_anchors(), search)
)

file_versions = self._package_versions(file_locations, search)
file_locations = (
Link(url) for url in itertools.chain(index_file_loc, fl_file_loc)
)
file_versions = self._package_versions(
((link.filename, link) for link in file_locations), search,
)
if file_versions:
file_versions.sort(reverse=True)
logger.debug(
Expand Down Expand Up @@ -748,32 +752,32 @@ def _get_pages(self, locations, project_name):

_py_version_re = re.compile(r'-py([123]\.?[0-9]?)$')

def _sort_links(self, links):
# type: (Iterable[Link]) -> List[Link]
def _sort_anchors(self, anchors):
# type: (Iterable[Anchor]) -> List[Anchor]
"""
Returns elements of links in order, non-egg links first, egg links
second, while eliminating duplicates
"""
eggs, no_eggs = [], []
seen = set() # type: Set[Link]
for link in links:
for text, link in anchors:
if link not in seen:
seen.add(link)
if link.egg_fragment:
eggs.append(link)
eggs.append((text, link))
else:
no_eggs.append(link)
no_eggs.append((text, link))
return no_eggs + eggs

def _package_versions(
self,
links, # type: Iterable[Link]
search # type: Search
anchors, # type: Iterable[Anchor]
search, # type: Search
):
# type: (...) -> List[Optional[InstallationCandidate]]
result = []
for link in self._sort_links(links):
v = self._link_package_versions(link, search)
for anchor in self._sort_anchors(anchors):
v = self._anchor_package_versions(anchor, search)
if v is not None:
result.append(v)
return result
Expand All @@ -784,15 +788,16 @@ def _log_skipped_link(self, link, reason):
logger.debug('Skipping link %s; %s', link, reason)
self.logged_links.add(link)

def _link_package_versions(self, link, search):
# type: (Link, Search) -> Optional[InstallationCandidate]
def _anchor_package_versions(self, anchor, search):
# type: (Anchor, Search) -> Optional[InstallationCandidate]
"""Return an InstallationCandidate or None"""
text, link = anchor
version = None
if link.egg_fragment:
egg_info = link.egg_fragment
ext = link.ext
_, ext = splitext(text)
else:
egg_info, ext = link.splitext()
egg_info, ext = splitext(text)
if not ext:
self._log_skipped_link(link, 'not a file')
return None
Expand All @@ -806,12 +811,12 @@ def _link_package_versions(self, link, search):
link, 'No binaries permitted for %s' % search.supplied,
)
return None
if "macosx10" in link.path and ext == '.zip':
if "macosx10" in text and ext == '.zip':
self._log_skipped_link(link, 'macosx10 one')
return None
if ext == WHEEL_EXTENSION:
try:
wheel = Wheel(link.filename)
wheel = Wheel(text)
except InvalidWheelFilename:
self._log_skipped_link(link, 'invalid wheel filename')
return None
Expand Down Expand Up @@ -853,7 +858,7 @@ def _link_package_versions(self, link, search):
support_this_python = check_requires_python(link.requires_python)
except specifiers.InvalidSpecifier:
logger.debug("Package %s has an invalid Requires-Python entry: %s",
link.filename, link.requires_python)
text, link.requires_python)
support_this_python = True

if not support_this_python:
Expand Down Expand Up @@ -962,9 +967,9 @@ def __init__(self, content, url, headers=None):
def __str__(self):
return redact_password_from_url(self.url)

def iter_links(self):
# type: () -> Iterable[Link]
"""Yields all links in the page"""
def iter_anchors(self):
# type: () -> Iterable[Anchor]
"""Yields all anchor information in the page"""
document = html5lib.parse(
self.content,
transport_encoding=_get_encoding_from_headers(self.headers),
Expand All @@ -977,7 +982,8 @@ def iter_links(self):
url = _clean_link(urllib_parse.urljoin(base_url, href))
pyrequire = anchor.get('data-requires-python')
pyrequire = unescape(pyrequire) if pyrequire else None
yield Link(url, self.url, requires_python=pyrequire)
link = Link(url, self.url, requires_python=pyrequire)
yield (anchor.text, link)


Search = namedtuple('Search', 'supplied canonical formats')
Expand Down
24 changes: 12 additions & 12 deletions tests/unit/test_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -478,42 +478,42 @@ def setup(self):
)

@pytest.mark.parametrize(
'url',
'name, url',
[
'http:/yo/pytest-1.0.tar.gz',
'http:/yo/pytest-1.0-py2.py3-none-any.whl',
('pytest-1.0.tar.gz', 'does-not-matter-tar-gz-url'),
('pytest-1.0-py2.py3-none-any.whl', 'does-not-matter-whl-url'),
],
)
def test_link_package_versions_match(self, url):
def test_anchor_package_versions_match(self, name, url):
"""Test that 'pytest' archives match for 'pytest'"""
link = Link(url)
search = Search(
supplied=self.search_name,
canonical=self.canonical_name,
formats=['source', 'binary'],
)
result = self.finder._link_package_versions(link, search)
result = self.finder._anchor_package_versions((name, link), search)
expected = InstallationCandidate(self.search_name, self.version, link)
assert result == expected, result

@pytest.mark.parametrize(
'url',
'name',
[
# TODO: Uncomment this test case when #1217 is fixed.
# 'http:/yo/pytest-xdist-1.0.tar.gz',
'http:/yo/pytest2-1.0.tar.gz',
'http:/yo/pytest_xdist-1.0-py2.py3-none-any.whl',
# 'pytest-xdist-1.0.tar.gz',
'pytest2-1.0.tar.gz',
'pytest_xdist-1.0-py2.py3-none-any.whl',
],
)
def est_link_package_versions_substring_fails(self, url):
def test_anchor_package_versions_substring_fails(self, name):
"""Test that 'pytest<something>' archives won't match for 'pytest'."""
link = Link(url)
anchor = (name, Link('does-not-matter'))
search = Search(
supplied=self.search_name,
canonical=self.canonical_name,
formats=['source', 'binary'],
)
result = self.finder._link_package_versions(link, search)
result = self.finder._anchor_package_versions(anchor, search)
assert result is None, result


Expand Down