66 changes: 34 additions & 32 deletions src/paperoni/discovery/openalex.py
@@ -119,6 +119,12 @@ def _get_link(link_type: str, link_value: str) -> Link:
return Link(type=link_type, link=relevant_part)


+def _links(**data):
+    for link_type, link_value in data.items():
+        if link_value is not None:
+            yield _get_link(link_type, link_value)


class OpenAlexQueryManager:
def __init__(self, *, mailto=None, work_types=DEFAULT_WORK_TYPES):
self.mailto = mailto
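
To make the intent of the new `_links` helper concrete, here is a minimal sketch of how it behaves at the call sites further down in this diff. The sample values are invented, and the exact `Link` contents depend on `_get_link`'s normalization:

    # Hypothetical inputs for illustration only.
    author = {"id": "https://openalex.org/A123", "orcid": None}

    # None-valued entries are skipped, so call sites need no per-field `if` guards:
    links = list(_links(openalex=author["id"], orcid=author["orcid"]))
    # -> [Link(type="openalex", link=<normalized id>)]; the orcid entry is dropped.

    # Keys that are not valid Python identifiers still work via ** unpacking:
    links = list(_links(**{"open-access": None}))  # -> []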
@@ -201,8 +207,9 @@ async def _evaluate(self, path: str, **params):
    def _try_wrapping_paper(self, data: dict) -> PaperInfo:
        try:
            return self._wrap_paper(data)
-        except Exception as exc:
-            raise Exception(pprint.pformat(data)) from exc
+        except Exception:
+            pprint.pformat(data)
+            raise

    def _wrap_paper(self, data: dict) -> PaperInfo:
        # Assert consistency in locations
@@ -212,15 +219,15 @@ def _wrap_paper(self, data: dict) -> PaperInfo:

        locations = data["locations"]

-        if locations:
-            # data["primary_location"] apparently has more keys than
-            # locations[0]. In particular, it has the additional "version" and
-            # "is_accepted" keys.
-            for key in locations[0].keys():
-                assert locations[0][key] == data["primary_location"][key]
-        else:
-            assert data["primary_location"] is None
-            assert data["best_oa_location"] is None
+        # if locations:
+        #     # data["primary_location"] apparently has more keys than
+        #     # locations[0]. In particular, it has the additional "version" and
+        #     # "is_accepted" keys.
+        #     for key in locations[0].keys():
+        #         assert locations[0][key] == data["primary_location"][key]
+        # else:
+        #     assert data["primary_location"] is None
+        #     assert data["best_oa_location"] is None

        # # Assert consistency in paper ids
        # if data.get("doi"):
@@ -230,19 +237,16 @@ def _wrap_paper(self, data: dict) -> PaperInfo:
        # We collect them here so that they can also be added to paper "links" field.
        links_from_locations = []
        for location in locations:
-            if location["landing_page_url"]:
-                links_from_locations.append(
-                    _get_link(
-                        "url",
-                        location["landing_page_url"],
-                    )
-                )
-            if location["pdf_url"]:
-                links_from_locations.append(_get_link("pdf", location["pdf_url"]))
+            links_from_locations.extend(
+                _links(
+                    url=location["landing_page_url"],
+                    pdf=location["pdf_url"],
+                )
+            )

        def venue_name(loc):
            vn = candidate.get("raw_source_name", None)
-            if vn is None and loc["source"]:
+            if (not vn) and loc["source"]:
                vn = loc["source"]["display_name"]
            if vn and (vn.startswith("http") or "CiteSeer" in vn):
                return None
@@ -257,13 +261,16 @@ def venue_name(loc):
break

        # We will use work publication date with primary location to set release
+        if not data["publication_date"]:
+            return None

        publication_date = date.fromisoformat(data["publication_date"])

        # We will save open access url in paper links
        oa_url = data["open_access"]["oa_url"]

        links = {_get_link(typ, ref) for typ, ref in data["ids"].items()}
-        links.update([_get_link("open-access", oa_url)] if oa_url is not None else [])
+        links.update(_links(**{"open-access": oa_url}))
        links.update(links_from_locations)
        links = list(links)
        links.sort(key=lambda l: (l.type, l.link))
@@ -277,16 +284,11 @@ def venue_name(loc):
                author=Author(
                    name=authorship["author"]["display_name"],
                    aliases=[],
-                    links=[_get_link("openalex", authorship["author"]["id"])]
-                    + (
-                        [
-                            _get_link(
-                                "orcid",
-                                authorship["author"]["orcid"],
-                            )
-                        ]
-                        if authorship["author"]["orcid"] is not None
-                        else []
-                    ),
+                    links=list(
+                        _links(
+                            openalex=authorship["author"]["id"],
+                            orcid=authorship["author"]["orcid"],
+                        )
+                    ),
                ),
                affiliations=[
@@ -412,7 +414,7 @@ async def query(
        # [alias: -v]
        verbose: bool = False,
        # Data version
-        data_version: Literal["1", "2"] = "1",
+        data_version: Literal["1", "2"] = "2",
        # A list of focuses
        focuses: Focuses = None,
    ):
115 changes: 114 additions & 1 deletion src/paperoni/refinement/formats.py
@@ -1,5 +1,5 @@
import re
-from datetime import date
+from datetime import date, datetime

from ..config import config
from ..model import (
@@ -337,6 +337,119 @@ def find_affiliations(author):
)


+def papers_from_arxiv(soup):
+    for entry in soup.find_all("entry"):
+        paper = paper_from_arxiv(entry)
+        if paper is not None:
+            yield paper
+
+
+def paper_from_arxiv(entry):
+    # Extract title
+    entry_title = entry.find("title")
+    if not entry_title or not entry_title.text.strip():
+        return None
+
+    entry_title_text = entry_title.text.strip()
+
+    # Extract arxiv ID from the entry ID
+    entry_id = entry.find("id")
+    if not entry_id or not entry_id.text:
+        return None
+
+    arxiv_id_result = url_to_id(entry_id.text)
+    if not arxiv_id_result or arxiv_id_result[0] != "arxiv":
+        return None
+
+    arxiv_id = arxiv_id_result[1]
+
+    # Extract abstract
+    summary = entry.find("summary")
+    abstract = summary.text.strip() if summary and summary.text else None
+
+    # Extract authors
+    authors = []
+    for author_elem in entry.find_all("author"):
+        name_elem = author_elem.find("name")
+        if name_elem and name_elem.text:
+            author_name = name_elem.text.strip()
+            authors.append(
+                PaperAuthor(
+                    display_name=author_name,
+                    author=Author(name=author_name, aliases=[], links=[]),
+                    affiliations=[],
+                )
+            )
+
+    if not authors:
+        return None
+
+    # Extract published date
+    published = entry.find("published")
+    date_obj = None
+    date_precision = DatePrecision.year
+    if published and published.text:
+        try:
+            # Parse ISO format date: 2021-04-17T23:46:57Z
+            dt = datetime.fromisoformat(published.text.replace("Z", "+00:00"))
+            date_obj = dt.date()
+            date_precision = DatePrecision.day
+        except (ValueError, AttributeError):
+            pass
+
+    if not date_obj:
+        date_obj = date(2000, 1, 1)
+        date_precision = DatePrecision.year
+
+    # Extract topics from categories
+    topics = []
+    for category in entry.find_all("category"):
+        term = category.get("term")
+        if term:
+            topics.append(Topic(name=term))
+
+    # Extract links
+    links = [Link(type="arxiv", link=arxiv_id)]
+    for link_elem in entry.find_all("link"):
+        href = link_elem.get("href")
+        rel = link_elem.get("rel")
+        link_type = link_elem.get("type")
+        if href:
+            if rel == "alternate":
+                links.append(Link(type="html", link=href))
+            elif rel == "related" and link_type == "application/pdf":
+                links.append(Link(type="pdf", link=href))
+
+    # Create release (ArXiv is a preprint venue)
+    releases = [
+        Release(
+            venue=Venue(
+                name="arXiv",
+                date=date_obj,
+                date_precision=date_precision,
+                type=VenueType.preprint,
+                series="arXiv",
+                aliases=[],
+                links=[],
+                open=True,
+                peer_reviewed=False,
+                publisher="Cornell University",
+            ),
+            status="preprint",
+            pages=None,
+        )
+    ]
+
+    return Paper(
+        title=entry_title_text,
+        abstract=abstract,
+        authors=authors,
+        releases=releases,
+        topics=topics,
+        links=links,
+    )


async def institution_from_ror(ror_id):
"""
Given a ROR ID (e.g., '025wfj672'), fetch institution info from ROR API and return an Institution object.
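To show how the new parser is driven, a rough usage sketch, assuming BeautifulSoup with an XML parser (lxml) and a deliberately minimal, invented Atom entry; real arXiv feeds also carry `<category>` and `<link>` elements, handled above, and this assumes `url_to_id` recognizes the `arxiv.org/abs/...` URL form:

    from bs4 import BeautifulSoup

    # Invented minimal entry for illustration; not a verbatim arXiv response.
    feed = '''
    <feed>
      <entry>
        <id>http://arxiv.org/abs/2104.08663v1</id>
        <title>An Example Title</title>
        <summary>An example abstract.</summary>
        <author><name>Ada Lovelace</name></author>
        <published>2021-04-17T23:46:57Z</published>
      </entry>
    </feed>
    '''
    soup = BeautifulSoup(feed, "xml")
    papers = list(papers_from_arxiv(soup))  # empty if any required field is missing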
31 changes: 29 additions & 2 deletions src/paperoni/refinement/title.py
@@ -6,7 +6,7 @@
from ..discovery.openalex import OpenAlexQueryManager
from ..get import ERRORS
from .fetch import register_fetch
-from .formats import paper_from_crossref
+from .formats import paper_from_crossref, papers_from_arxiv


@register_fetch
@@ -53,9 +53,36 @@ async def openalex_title(typ: Literal["title"], link: str):

    async for paper in qm.works(
        filter=f"display_name.search:{title.strip().replace(',', '')}",
-        data_version="1",
+        data_version="2",
        limit=1,
    ):
        paper = paper.paper
        if paper.title == title:
            return paper


+@register_fetch
+async def arxiv_title(type: Literal["title"], link: str):
+    """Fetch from ArXiv by title search."""
+
+    title = link
+
+    try:
+        soup = await config.fetch.read(
+            "https://export.arxiv.org/api/query",
+            params={
+                "search_query": f'title:"{title}"',
+                "start": "0",
+                "max_results": "5",
+            },
+            format="xml",
+        )
+    except ERRORS as exc:  # pragma: no cover
+        if exc.response.status_code == 404:
+            return None
+        else:
+            raise
+
+    for paper in papers_from_arxiv(soup):
+        if paper.title == title:
+            return paper
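
For reference, the same query can be issued directly against the arXiv export API; a sketch using httpx as a stand-in (the `config.fetch.read` call above is paperoni's own fetch wrapper; any HTTP client would do):

    import httpx

    # title:"..." restricts the Atom search to the title field.
    resp = httpx.get(
        "https://export.arxiv.org/api/query",
        params={
            "search_query": 'title:"Zero-data learning of new tasks"',
            "start": "0",
            "max_results": "5",
        },
    )
    resp.raise_for_status()
    atom_xml = resp.text  # an Atom feed, parseable with BeautifulSoup(atom_xml, "xml")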
5 changes: 5 additions & 0 deletions tests/discovery/test_openalex.py
@@ -17,6 +17,11 @@
"GraphMix: Improved Training of GNNs for Semi-Supervised Learning",
"HiFormer: Hierarchical Multi-scale Representations Using Transformers for Medical Image Segmentation",
"Large-Scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation",
"Domain adaptation for large-scale sentiment classification: A deep learning approach",
"EmoNets: Multimodal deep learning approaches for emotion recognition in video",
"What regularized auto-encoders learn from the data-generating distribution",
"Word Representations: A Simple and General Method for Semi-Supervised Learning",
"Zero-data learning of new tasks",
]

