Skip to content

Commit fbc56d3

Browse files
committed
Add refinement by title
1 parent 4e8d06e commit fbc56d3

15 files changed

+511
-114
lines changed

src/paperoni/__main__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ def run(self):
117117
if link.startswith("http"):
118118
type, link = url_to_id(link)
119119
else:
120-
type, link = link.split(":")
120+
type, link = link.split(":", 1)
121121
results.extend(fetch_all(type, link))
122122
if self.merge:
123123
results = [merge_all(results)]
Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
1-
from . import dblp, doi, pubmed
1+
from . import dblp, doi, pubmed, title
22
from .fetch import fetch_all, register_fetch
33

44
__all__ = [
55
"dblp",
66
"doi",
77
"pubmed",
8+
"title",
89
"fetch_all",
910
"register_fetch",
1011
]

src/paperoni/refinement/dblp.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ def dblp(type: Literal["dblp"], link: str):
2525
extra_links = []
2626
if ee and ee.text.startswith("https://doi.org/"):
2727
doi = ee.text.replace("https://doi.org/", "")
28-
extra_links = [Link(type="doi", link=doi)]
28+
extra_links = [Link(type="doi", link=doi.lower())]
2929
elif ee:
3030
extra_links = [Link(type="html", link=ee.text)]
3131
return Paper(

src/paperoni/refinement/doi.py

Lines changed: 9 additions & 105 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import re
21
from datetime import date
32
from types import SimpleNamespace
43
from typing import Literal
@@ -20,9 +19,8 @@
2019
Venue,
2120
VenueType,
2221
)
23-
from ..utils import url_to_id
2422
from .fetch import register_fetch
25-
from .formats import institution_from_ror, paper_from_jats
23+
from .formats import paper_from_crossref, paper_from_jats
2624

2725

2826
@register_fetch
@@ -48,106 +46,7 @@ def crossref(type: Literal["doi"], link: str):
4846
raise Exception("Request failed", data)
4947

5048
data = SimpleNamespace(**data["message"])
51-
52-
releases = []
53-
if getattr(data, "event", None) or getattr(data, "container-title", None):
54-
date_parts = None
55-
56-
if evt := getattr(data, "event", None):
57-
venue_name = evt["name"]
58-
venue_type = VenueType.conference
59-
if "start" in evt:
60-
date_parts = evt["start"]["date-parts"][0]
61-
62-
if venue := getattr(data, "container-title", None):
63-
venue_name = venue[0]
64-
if data.type == "journal-article":
65-
venue_type = VenueType.journal
66-
else:
67-
venue_type = VenueType.conference
68-
69-
if not date_parts:
70-
for field in (
71-
"published-online",
72-
"published-print",
73-
"published",
74-
"issued",
75-
"created",
76-
):
77-
if dateholder := getattr(data, field, None):
78-
date_parts = dateholder["date-parts"][0]
79-
break
80-
81-
precision = [
82-
DatePrecision.year,
83-
DatePrecision.month,
84-
DatePrecision.day,
85-
][len(date_parts) - 1]
86-
date_parts += [1] * (3 - len(date_parts))
87-
release = Release(
88-
venue=Venue(
89-
aliases=[],
90-
name=venue_name,
91-
type=venue_type,
92-
series=venue_name,
93-
links=[],
94-
open=False,
95-
peer_reviewed=False,
96-
publisher=None,
97-
date_precision=precision,
98-
date=date(*date_parts),
99-
),
100-
status="published",
101-
pages=None,
102-
)
103-
releases = [release]
104-
105-
required_keys = {"given", "family", "affiliation"}
106-
107-
def extract_affiliation(aff):
108-
if "id" in aff and isinstance(aff["id"], list):
109-
for id_entry in aff["id"]:
110-
if (
111-
isinstance(id_entry, dict)
112-
and id_entry.get("id-type") == "ROR"
113-
and "id" in id_entry
114-
):
115-
ror_url = id_entry["id"]
116-
_, ror_id = url_to_id(ror_url)
117-
return institution_from_ror(ror_id)
118-
119-
else:
120-
return Institution(
121-
name=aff["name"],
122-
category=InstitutionCategory.unknown,
123-
aliases=[],
124-
)
125-
126-
abstract = getattr(data, "abstract", None)
127-
if abstract:
128-
abstract = re.sub(r"<jats:title>.*</jats:title>", "", abstract)
129-
abstract = re.sub(r"</?jats:[^>]+>", "", abstract)
130-
131-
return Paper(
132-
title=data.title[0],
133-
authors=[
134-
PaperAuthor(
135-
display_name=(dn := f"{author['given']} {author['family']}"),
136-
author=Author(name=dn),
137-
affiliations=[
138-
extract_affiliation(aff)
139-
for aff in author["affiliation"]
140-
if "name" in aff
141-
],
142-
)
143-
for author in data.author
144-
if not (required_keys - author.keys())
145-
],
146-
abstract=abstract,
147-
links=[Link(type="doi", link=doi)],
148-
topics=[],
149-
releases=releases,
150-
)
49+
return paper_from_crossref(data)
15150

15251

15352
@register_fetch
@@ -301,8 +200,13 @@ def datacite(type: Literal["doi"], link: str):
301200
identifier = related_identifier["relatedIdentifier"]
302201
# Available identifier types:
303202
# ARK arXiv bibcode DOI EAN13 EISSN Handle IGSN ISBN ISSN ISTC LISSN LSID PMID PURL UPC URL URN w3id
304-
if identifier_type in {"ARK", "arXiv", "DOI", "PURL", "URL", "w3id"}:
305-
links.append(Link(type=f"{relation_type}.{identifier_type}", link=identifier))
203+
allowed_types = {"ARK", "arXiv", "DOI", "PURL", "URL", "w3id"}
204+
if relation_type == "IsVersionOf" and identifier_type in allowed_types:
205+
identifier_type = identifier_type.lower()
206+
normalized_identifier = (
207+
identifier.lower() if identifier_type == "doi" else identifier
208+
)
209+
links.append(Link(type=f"{identifier_type}", link=normalized_identifier))
306210

307211
return Paper(
308212
title=raw_paper.titles[0]["title"],

src/paperoni/refinement/formats.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,117 @@
77
DatePrecision,
88
Institution,
99
InstitutionCategory,
10+
Link,
1011
Paper,
1112
PaperAuthor,
1213
Release,
1314
Topic,
1415
Venue,
1516
VenueType,
1617
)
18+
from ..utils import url_to_id
19+
20+
21+
def paper_from_crossref(data):
    """Build a ``Paper`` from a Crossref "work" message.

    Parameters
    ----------
    data:
        ``SimpleNamespace`` wrapping the ``message`` dict returned by the
        Crossref API, so raw JSON keys are read via attribute access
        (hyphenated keys such as ``container-title`` via ``getattr``).

    Returns
    -------
    Paper
        With the first title, authors restricted to entries carrying
        ``given``/``family``/``affiliation`` keys, a JATS-tag-stripped
        abstract, a lowercased DOI link, and at most one ``Release``
        inferred from the event/container metadata.
    """
    releases = []
    if getattr(data, "event", None) or getattr(data, "container-title", None):
        date_parts = None

        if evt := getattr(data, "event", None):
            venue_name = evt["name"]
            venue_type = VenueType.conference
            if "start" in evt:
                date_parts = evt["start"]["date-parts"][0]

        if venue := getattr(data, "container-title", None):
            venue_name = venue[0]
            if data.type == "journal-article":
                venue_type = VenueType.journal
            else:
                venue_type = VenueType.conference

        if not date_parts:
            # Fall back to the first populated publication-date field, in
            # decreasing order of preference.
            for field in (
                "published-online",
                "published-print",
                "published",
                "issued",
                "created",
            ):
                if dateholder := getattr(data, field, None):
                    date_parts = dateholder["date-parts"][0]
                    break

        # Only emit a release when a date was actually found; a record with
        # venue info but no date field previously raised TypeError on
        # len(None).
        if date_parts:
            precision = [
                DatePrecision.year,
                DatePrecision.month,
                DatePrecision.day,
            ][len(date_parts) - 1]
            # Pad missing month/day with 1 so date(*parts) is constructible.
            date_parts += [1] * (3 - len(date_parts))
            release = Release(
                venue=Venue(
                    aliases=[],
                    name=venue_name,
                    type=venue_type,
                    series=venue_name,
                    links=[],
                    open=False,
                    peer_reviewed=False,
                    publisher=None,
                    date_precision=precision,
                    date=date(*date_parts),
                ),
                status="published",
                pages=None,
            )
            releases = [release]

    # Authors missing any of these keys are skipped entirely below.
    required_keys = {"given", "family", "affiliation"}

    def extract_affiliation(aff):
        # Prefer a ROR-resolved institution when the affiliation carries a
        # ROR identifier.
        if "id" in aff and isinstance(aff["id"], list):
            for id_entry in aff["id"]:
                if (
                    isinstance(id_entry, dict)
                    and id_entry.get("id-type") == "ROR"
                    and "id" in id_entry
                ):
                    ror_url = id_entry["id"]
                    _, ror_id = url_to_id(ror_url)
                    return institution_from_ror(ror_id)
        # Fall back to a name-only institution. This also covers the case
        # where an "id" list exists but contains no ROR entry, which
        # previously fell off the end and returned None.
        return Institution(
            name=aff["name"],
            category=InstitutionCategory.unknown,
            aliases=[],
        )

    abstract = getattr(data, "abstract", None)
    if abstract:
        # Crossref abstracts are JATS XML; drop the title element and strip
        # the remaining jats:* tags.
        abstract = re.sub(r"<jats:title>.*</jats:title>", "", abstract)
        abstract = re.sub(r"</?jats:[^>]+>", "", abstract)

    return Paper(
        title=data.title[0],
        authors=[
            PaperAuthor(
                display_name=(dn := f"{author['given']} {author['family']}"),
                author=Author(name=dn),
                affiliations=[
                    extract_affiliation(aff)
                    for aff in author["affiliation"]
                    if "name" in aff
                ],
            )
            # "author" may be absent on some Crossref records; default to
            # an empty list rather than raising AttributeError.
            for author in getattr(data, "author", [])
            if not (required_keys - author.keys())
        ],
        abstract=abstract,
        links=[Link(type="doi", link=data.DOI.lower())],
        topics=[],
        releases=releases,
    )
17121

18122

19123
def extract_date(txt: str) -> dict | None:

src/paperoni/refinement/title.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
from types import SimpleNamespace
2+
from typing import Literal
3+
from urllib.parse import quote
4+
5+
from requests import HTTPError
6+
7+
from ..config import config
8+
from ..discovery.openalex import OpenAlexQueryManager
9+
from .fetch import register_fetch
10+
from .formats import paper_from_crossref
11+
12+
13+
@register_fetch
def crossref_title(type: Literal["title"], link: str):
    """Fetch a paper from Crossref by exact title match.

    Queries the Crossref ``query.title`` fuzzy search for the single best
    hit, then accepts it only if its title matches the input exactly.

    Returns the ``Paper`` on an exact match, or ``None`` when the title is
    not found (404, empty result set, or an inexact best hit).
    """
    # Normalize once so the query and the exact-match check below agree;
    # previously only the query used the stripped title, so any
    # leading/trailing whitespace in the input made the match always fail.
    title = link.strip()

    try:
        data = config.fetch.read(
            f"https://api.crossref.org/works?query.title={quote(title)}&rows=1",
            format="json",
        )
    except HTTPError as exc:  # pragma: no cover
        if exc.response.status_code == 404:
            return None
        else:
            raise

    if data["status"] != "ok":  # pragma: no cover
        raise Exception("Request failed", data)

    items = data.get("message", {}).get("items", [])
    if not items:
        return None

    work_data = SimpleNamespace(**items[0])
    paper = paper_from_crossref(work_data)
    # query.title is a fuzzy search; keep only an exact title match.
    if paper.title != title:
        return None
    return paper
45+
46+
47+
@register_fetch
def openalex_title(type: Literal["title"], link: str):
    """Fetch a paper from OpenAlex by exact title match.

    Runs a ``display_name.search`` query for the single best hit and
    accepts it only if its title matches the input exactly.

    Returns the ``Paper`` on an exact match, or ``None`` when nothing is
    found or the best hit's title differs.
    """
    # Normalize once so the search filter and the exact-match check agree;
    # previously the filter used the stripped title while the comparison
    # used the raw input, so padded titles could never match.
    title = link.strip()

    qm = OpenAlexQueryManager(mailto=config.mailto)

    papers = list(qm.works(filter=f"display_name.search:{title}", limit=1))

    if not papers:
        return None

    paper = papers[0].paper
    # display_name.search is a fuzzy search; require an exact title match.
    if paper.title != title:
        return None
    return paper

0 commit comments

Comments
 (0)