Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 23 additions & 1 deletion src/fundus/publishers/uk/nature.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@

class NatureParser(ParserProxy):
class V1(BaseParser):
_summary_selector = CSSSelector("div.c-article-abstract p, p.c-article-abstract")
VALID_UNTIL = datetime.date(2026, 2, 1) # This date is the best guess
_summary_selector: XPath = CSSSelector("div.c-article-abstract p, p.c-article-abstract")

_paragraph_selector = XPath(
"//div[@data-test='access-teaser']//p"
Expand Down Expand Up @@ -85,3 +86,24 @@ def images(self) -> List[Image]:
author_selector=self._author_pattern,
lower_boundary_selector=self._lower_boundary_selector,
)

class V1_1(V1):
_paragraph_selector = XPath(
"//div[@data-test='main-content' or @class='main-content']//p"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this article [1], the title of the related article is extracted as a paragraph. In general, the titles of related articles are being extracted.

This article [2] couldn't be parsed.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for catching that, it should be fixed now 👍

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@addie9800 Unfortunately, the plaintext of [1] is now cut off.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@MaxDall Unfortunately, I cannot reproduce that. The last extracted sentence in my test is “It’s not crazy. It’s really exciting.”, as expected. What exactly do you mean by "cut off"?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@addie9800 Hmm, that's weird. For me, the last paragraph is the one before the “Reinventing MRI” subheadline, ending with “It’s a tiny change on a tiny scale.”

"["
" not(ancestor::*[@data-label='Related' or contains(@class, 'recommended')])"
" and not(contains(@class, 'recommended__title'))"
" and not(ancestor::figure)"
" and not(ancestor::figcaption)"
" and not(ancestor::a)"
"]"
)
_summary_selector = XPath("//div[@class='c-article-teaser-text']")
_subheadline_selector = XPath(
"//div[@data-test='main-content' or @class='main-content']"
"//h2"
"[not(ancestor::article[contains(@class, 'recommended')])]"
)

_lower_boundary_selector = XPath("(//aside)[2]")
_paywall_selector = XPath("//div[contains(@class, 'buybox')]")
Loading