Skip to content

Commit fbc56d3

Browse files
committed
Add refinement by title
1 parent 4e8d06e commit fbc56d3

15 files changed

+511
-114
lines changed

src/paperoni/__main__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ def run(self):
117117
if link.startswith("http"):
118118
type, link = url_to_id(link)
119119
else:
120-
type, link = link.split(":")
120+
type, link = link.split(":", 1)
121121
results.extend(fetch_all(type, link))
122122
if self.merge:
123123
results = [merge_all(results)]
Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
1-
from . import dblp, doi, pubmed
1+
from . import dblp, doi, pubmed, title
22
from .fetch import fetch_all, register_fetch
33

44
__all__ = [
55
"dblp",
66
"doi",
77
"pubmed",
8+
"title",
89
"fetch_all",
910
"register_fetch",
1011
]

src/paperoni/refinement/dblp.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ def dblp(type: Literal["dblp"], link: str):
2525
extra_links = []
2626
if ee and ee.text.startswith("https://doi.org/"):
2727
doi = ee.text.replace("https://doi.org/", "")
28-
extra_links = [Link(type="doi", link=doi)]
28+
extra_links = [Link(type="doi", link=doi.lower())]
2929
elif ee:
3030
extra_links = [Link(type="html", link=ee.text)]
3131
return Paper(

src/paperoni/refinement/doi.py

Lines changed: 9 additions & 105 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import re
21
from datetime import date
32
from types import SimpleNamespace
43
from typing import Literal
@@ -20,9 +19,8 @@
2019
Venue,
2120
VenueType,
2221
)
23-
from ..utils import url_to_id
2422
from .fetch import register_fetch
25-
from .formats import institution_from_ror, paper_from_jats
23+
from .formats import paper_from_crossref, paper_from_jats
2624

2725

2826
@register_fetch
@@ -48,106 +46,7 @@ def crossref(type: Literal["doi"], link: str):
4846
raise Exception("Request failed", data)
4947

5048
data = SimpleNamespace(**data["message"])
51-
52-
releases = []
53-
if getattr(data, "event", None) or getattr(data, "container-title", None):
54-
date_parts = None
55-
56-
if evt := getattr(data, "event", None):
57-
venue_name = evt["name"]
58-
venue_type = VenueType.conference
59-
if "start" in evt:
60-
date_parts = evt["start"]["date-parts"][0]
61-
62-
if venue := getattr(data, "container-title", None):
63-
venue_name = venue[0]
64-
if data.type == "journal-article":
65-
venue_type = VenueType.journal
66-
else:
67-
venue_type = VenueType.conference
68-
69-
if not date_parts:
70-
for field in (
71-
"published-online",
72-
"published-print",
73-
"published",
74-
"issued",
75-
"created",
76-
):
77-
if dateholder := getattr(data, field, None):
78-
date_parts = dateholder["date-parts"][0]
79-
break
80-
81-
precision = [
82-
DatePrecision.year,
83-
DatePrecision.month,
84-
DatePrecision.day,
85-
][len(date_parts) - 1]
86-
date_parts += [1] * (3 - len(date_parts))
87-
release = Release(
88-
venue=Venue(
89-
aliases=[],
90-
name=venue_name,
91-
type=venue_type,
92-
series=venue_name,
93-
links=[],
94-
open=False,
95-
peer_reviewed=False,
96-
publisher=None,
97-
date_precision=precision,
98-
date=date(*date_parts),
99-
),
100-
status="published",
101-
pages=None,
102-
)
103-
releases = [release]
104-
105-
required_keys = {"given", "family", "affiliation"}
106-
107-
def extract_affiliation(aff):
108-
if "id" in aff and isinstance(aff["id"], list):
109-
for id_entry in aff["id"]:
110-
if (
111-
isinstance(id_entry, dict)
112-
and id_entry.get("id-type") == "ROR"
113-
and "id" in id_entry
114-
):
115-
ror_url = id_entry["id"]
116-
_, ror_id = url_to_id(ror_url)
117-
return institution_from_ror(ror_id)
118-
119-
else:
120-
return Institution(
121-
name=aff["name"],
122-
category=InstitutionCategory.unknown,
123-
aliases=[],
124-
)
125-
126-
abstract = getattr(data, "abstract", None)
127-
if abstract:
128-
abstract = re.sub(r"<jats:title>.*</jats:title>", "", abstract)
129-
abstract = re.sub(r"</?jats:[^>]+>", "", abstract)
130-
131-
return Paper(
132-
title=data.title[0],
133-
authors=[
134-
PaperAuthor(
135-
display_name=(dn := f"{author['given']} {author['family']}"),
136-
author=Author(name=dn),
137-
affiliations=[
138-
extract_affiliation(aff)
139-
for aff in author["affiliation"]
140-
if "name" in aff
141-
],
142-
)
143-
for author in data.author
144-
if not (required_keys - author.keys())
145-
],
146-
abstract=abstract,
147-
links=[Link(type="doi", link=doi)],
148-
topics=[],
149-
releases=releases,
150-
)
49+
return paper_from_crossref(data)
15150

15251

15352
@register_fetch
@@ -301,8 +200,13 @@ def datacite(type: Literal["doi"], link: str):
301200
identifier = related_identifier["relatedIdentifier"]
302201
# Available identifier types:
303202
# ARK arXiv bibcode DOI EAN13 EISSN Handle IGSN ISBN ISSN ISTC LISSN LSID PMID PURL UPC URL URN w3id
304-
if identifier_type in {"ARK", "arXiv", "DOI", "PURL", "URL", "w3id"}:
305-
links.append(Link(type=f"{relation_type}.{identifier_type}", link=identifier))
203+
allowed_types = {"ARK", "arXiv", "DOI", "PURL", "URL", "w3id"}
204+
if relation_type == "IsVersionOf" and identifier_type in allowed_types:
205+
identifier_type = identifier_type.lower()
206+
normalized_identifier = (
207+
identifier.lower() if identifier_type == "doi" else identifier
208+
)
209+
links.append(Link(type=f"{identifier_type}", link=normalized_identifier))
306210

307211
return Paper(
308212
title=raw_paper.titles[0]["title"],

src/paperoni/refinement/formats.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,117 @@
77
DatePrecision,
88
Institution,
99
InstitutionCategory,
10+
Link,
1011
Paper,
1112
PaperAuthor,
1213
Release,
1314
Topic,
1415
Venue,
1516
VenueType,
1617
)
18+
from ..utils import url_to_id
19+
20+
21+
def paper_from_crossref(data):
    """Build a ``Paper`` from a Crossref "work" message.

    Parameters
    ----------
    data:
        ``SimpleNamespace`` wrapping the ``message`` dict returned by the
        Crossref API, so raw JSON keys are read via attribute access
        (hyphenated keys such as ``container-title`` via ``getattr``).

    Returns
    -------
    Paper
        With the first title, authors restricted to entries carrying
        ``given``/``family``/``affiliation`` keys, a JATS-tag-stripped
        abstract, a lowercased DOI link, and at most one ``Release``
        inferred from the event/container metadata.
    """
    releases = []
    if getattr(data, "event", None) or getattr(data, "container-title", None):
        date_parts = None

        if evt := getattr(data, "event", None):
            venue_name = evt["name"]
            venue_type = VenueType.conference
            if "start" in evt:
                date_parts = evt["start"]["date-parts"][0]

        if venue := getattr(data, "container-title", None):
            venue_name = venue[0]
            if data.type == "journal-article":
                venue_type = VenueType.journal
            else:
                venue_type = VenueType.conference

        if not date_parts:
            # Fall back to the first populated publication-date field, in
            # decreasing order of preference.
            for field in (
                "published-online",
                "published-print",
                "published",
                "issued",
                "created",
            ):
                if dateholder := getattr(data, field, None):
                    date_parts = dateholder["date-parts"][0]
                    break

        # Only emit a release when a date was actually found; a record with
        # venue info but no date field previously raised TypeError on
        # len(None).
        if date_parts:
            precision = [
                DatePrecision.year,
                DatePrecision.month,
                DatePrecision.day,
            ][len(date_parts) - 1]
            # Pad missing month/day with 1 so date(*parts) is constructible.
            date_parts += [1] * (3 - len(date_parts))
            release = Release(
                venue=Venue(
                    aliases=[],
                    name=venue_name,
                    type=venue_type,
                    series=venue_name,
                    links=[],
                    open=False,
                    peer_reviewed=False,
                    publisher=None,
                    date_precision=precision,
                    date=date(*date_parts),
                ),
                status="published",
                pages=None,
            )
            releases = [release]

    # Authors missing any of these keys are skipped entirely below.
    required_keys = {"given", "family", "affiliation"}

    def extract_affiliation(aff):
        # Prefer a ROR-resolved institution when the affiliation carries a
        # ROR identifier.
        if "id" in aff and isinstance(aff["id"], list):
            for id_entry in aff["id"]:
                if (
                    isinstance(id_entry, dict)
                    and id_entry.get("id-type") == "ROR"
                    and "id" in id_entry
                ):
                    ror_url = id_entry["id"]
                    _, ror_id = url_to_id(ror_url)
                    return institution_from_ror(ror_id)
        # Fall back to a name-only institution. This also covers the case
        # where an "id" list exists but contains no ROR entry, which
        # previously fell off the end and returned None.
        return Institution(
            name=aff["name"],
            category=InstitutionCategory.unknown,
            aliases=[],
        )

    abstract = getattr(data, "abstract", None)
    if abstract:
        # Crossref abstracts are JATS XML; drop the title element and strip
        # the remaining jats:* tags.
        abstract = re.sub(r"<jats:title>.*</jats:title>", "", abstract)
        abstract = re.sub(r"</?jats:[^>]+>", "", abstract)

    return Paper(
        title=data.title[0],
        authors=[
            PaperAuthor(
                display_name=(dn := f"{author['given']} {author['family']}"),
                author=Author(name=dn),
                affiliations=[
                    extract_affiliation(aff)
                    for aff in author["affiliation"]
                    if "name" in aff
                ],
            )
            # "author" may be absent on some Crossref records; default to
            # an empty list rather than raising AttributeError.
            for author in getattr(data, "author", [])
            if not (required_keys - author.keys())
        ],
        abstract=abstract,
        links=[Link(type="doi", link=data.DOI.lower())],
        topics=[],
        releases=releases,
    )
17121

18122

19123
def extract_date(txt: str) -> dict | None:

src/paperoni/refinement/title.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
from types import SimpleNamespace
2+
from typing import Literal
3+
from urllib.parse import quote
4+
5+
from requests import HTTPError
6+
7+
from ..config import config
8+
from ..discovery.openalex import OpenAlexQueryManager
9+
from .fetch import register_fetch
10+
from .formats import paper_from_crossref
11+
12+
13+
@register_fetch
def crossref_title(type: Literal["title"], link: str):
    """Fetch a paper from Crossref by exact title match.

    Queries the Crossref ``query.title`` fuzzy search for the single best
    hit, then accepts it only if its title matches the input exactly.

    Returns the ``Paper`` on an exact match, or ``None`` when the title is
    not found (404, empty result set, or an inexact best hit).
    """
    # Normalize once so the query and the exact-match check below agree;
    # previously only the query used the stripped title, so any
    # leading/trailing whitespace in the input made the match always fail.
    title = link.strip()

    try:
        data = config.fetch.read(
            f"https://api.crossref.org/works?query.title={quote(title)}&rows=1",
            format="json",
        )
    except HTTPError as exc:  # pragma: no cover
        if exc.response.status_code == 404:
            return None
        else:
            raise

    if data["status"] != "ok":  # pragma: no cover
        raise Exception("Request failed", data)

    items = data.get("message", {}).get("items", [])
    if not items:
        return None

    work_data = SimpleNamespace(**items[0])
    paper = paper_from_crossref(work_data)
    # query.title is a fuzzy search; keep only an exact title match.
    if paper.title != title:
        return None
    return paper
45+
46+
47+
@register_fetch
def openalex_title(type: Literal["title"], link: str):
    """Fetch a paper from OpenAlex by exact title match.

    Runs a ``display_name.search`` query for the single best hit and
    accepts it only if its title matches the input exactly.

    Returns the ``Paper`` on an exact match, or ``None`` when nothing is
    found or the best hit's title differs.
    """
    # Normalize once so the search filter and the exact-match check agree;
    # previously the filter used the stripped title while the comparison
    # used the raw input, so padded titles could never match.
    title = link.strip()

    qm = OpenAlexQueryManager(mailto=config.mailto)

    papers = list(qm.works(filter=f"display_name.search:{title}", limit=1))

    if not papers:
        return None

    paper = papers[0].paper
    # display_name.search is a fuzzy search; require an exact title match.
    if paper.title != title:
        return None
    return paper

0 commit comments

Comments
 (0)