Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions config/basic.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ paperoni:
scrape:
urls:
- https://dadelani.github.io/publications
v2:
$class: paperoni.discovery.paperoni_v2:PaperoniV2
data: ${paperoni.data_path}/paperoniv2.json
refine:
prompt:
$class: paperoni.prompt:GenAIPrompt
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ pmlr = "paperoni.discovery.pmlr:PMLR"
jmlr = "paperoni.discovery.jmlr:JMLR"
synth = "paperoni.discovery.synth:Synth"
scrape = "paperoni.discovery.scrape:Scrape"
v2 = "paperoni.discovery.paperoni_v2:PaperoniV2"

[project.entry-points."outsight.fixtures"]
dash = "paperoni.dash:Dash"
Expand Down
55 changes: 54 additions & 1 deletion src/paperoni/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
from .collection.remotecoll import RemoteCollection
from .config import config
from .dash import History
from .discovery.paperoni_v2 import PaperoniV2
from .display import display, print_field, terminal_width
from .fulltext.locate import URL, locate_all
from .fulltext.pdf import PDF, CachePolicies, get_pdf
Expand Down Expand Up @@ -693,8 +694,60 @@ async def run(self, coll: "Coll"):
elif not len(coll.collection) and not len(await coll.collection.exclusions()):
logging.warning("Collection is not empty. Use --force to drop it.")

@dataclass
class Validate(Productor):
    """Validate the papers in the collection using the paperoni v2 database."""

    # The paperoni v2 database
    # [positional]
    # [metavar v2]
    command: Annotated[
        Any,
        FromEntryPoint(
            "paperoni.discovery",
            wrap=lambda cls: Auto[cls.query] if cls is PaperoniV2 else None,
        ),
    ]

    async def run(self, coll: "Coll"):
        """Propagate v2 'validated' flags onto matching papers in *coll*.

        Iterates the v2 export, and for each v2 paper flagged as validated,
        looks up the corresponding paper in the collection and marks it
        validated there too.
        """
        # NOTE(review): these lists are currently only used for counting;
        # they could also be saved to analyse which papers were not found
        # in the v3 database.
        validated = []
        not_found = []
        total = 0
        # Initial zero-progress notifications so the gauges appear immediately.
        send(progress=("Validated papers", len(validated), total))
        send(progress=("Papers not found", len(not_found), total))

        async for paper_v2 in self.iterate(embed=True):
            # iterate(embed=True) embeds the raw v2 record under info["v2"];
            # pop it so it is not persisted with the edited paper.
            paper_v2_json = paper_v2.info.pop("v2")
            total += 1

            # Only v2 papers explicitly flagged as validated are propagated.
            if "validated" not in paper_v2.flags:
                continue

            if paper := await coll.collection.find_paper(paper_v2):
                paper.flags.add("validated")
                await coll.collection.edit_paper(paper)
                validated.append(paper_v2_json)

            else:
                not_found.append(paper_v2_json)

            # NOTE(review): progress updates emitted per matched paper —
            # confirm this placement (inside the loop) matches the original.
            send(
                progress=(
                    "Found papers",
                    len(validated),
                    len(validated) + len(not_found),
                )
            )
            send(
                progress=(
                    "Validated papers",
                    len(validated) + len(not_found),
                    total,
                )
            )

# Command to execute
command: TaggedUnion[Search, Import, Export, Drop]
command: TaggedUnion[Search, Import, Export, Drop, Validate]

# Collection dir
# [alias: -c]
Expand Down
4 changes: 2 additions & 2 deletions src/paperoni/collection/abc.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from datetime import date
from typing import AsyncIterable, Iterable
from typing import AsyncGenerator, Iterable

from ..model.classes import Paper

Expand Down Expand Up @@ -62,7 +62,7 @@ async def search(
include_flags: list[str] = None,
# Flags that must be False
exclude_flags: list[str] = None,
) -> AsyncIterable[Paper]:
) -> AsyncGenerator[Paper, None]:
raise NotImplementedError()

def __len__(self) -> int:
Expand Down
6 changes: 3 additions & 3 deletions src/paperoni/collection/remotecoll.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
from dataclasses import dataclass, field
from datetime import date
from typing import Iterable
from typing import AsyncGenerator, Iterable

from fastapi import HTTPException
from serieux import deserialize
Expand Down Expand Up @@ -81,7 +81,7 @@ async def search(
include_flags: list[str] = None,
# Flags that must be False
exclude_flags: list[str] = None,
):
) -> AsyncGenerator[Paper, None]:
params = {}
if paper_id:
params["paper_id"] = paper_id
Expand All @@ -106,7 +106,7 @@ async def search(
while True:
query_params = params.copy()
query_params["offset"] = offset
resp = await self.fetch.read(
resp: dict = await self.fetch.read(
url,
format="json",
cache_into=None,
Expand Down
9 changes: 7 additions & 2 deletions src/paperoni/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import gifnoc
from easy_oauth import OAuthManager
from rapporteur.report import Reporter
from serieux import JSON, TaggedSubclass
from serieux import TaggedSubclass
from serieux.features.encrypt import Secret

from .collection.abc import PaperCollection
Expand Down Expand Up @@ -52,16 +52,21 @@ def __post_init__(self):
self.process_pool = ProcessPoolExecutor(**self.process_pool_executor)


@dataclass
class AutoValidate:
    """Configuration for automatic validation of discovered papers."""

    # Score cutoff for auto-validation — presumably compared against
    # Paper.score (e.g. PaperoniV2 assigns 10.0 to validated papers);
    # confirm against the consuming code.
    score_threshold: float = 10.0


@dataclass
class PaperoniConfig:
cache_path: Path = None
data_path: Path = None
mailto: str = ""
api_keys: Keys[str, Secret[str]] = field(default_factory=Keys)
discovery: JSON = None
fetch: TaggedSubclass[Fetcher] = field(default_factory=RequestsFetcher)
focuses: Focuses = field(default_factory=Focuses)
autofocus: AutoFocus[str, AutoFocus.Author] = field(default_factory=AutoFocus)
autovalidate: AutoValidate = field(default_factory=AutoValidate)
refine: Refine = None
work_file: Path = None
collection: TaggedSubclass[PaperCollection] = None
Expand Down
142 changes: 142 additions & 0 deletions src/paperoni/discovery/paperoni_v2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
import datetime
import json
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, AsyncGenerator

import gifnoc

from ..discovery.base import Discoverer
from ..model import (
Author,
DatePrecision,
Institution,
InstitutionCategory,
Link,
PaperAuthor,
Release,
Topic,
Venue,
VenueType,
)
from ..model.classes import Paper
from ..utils import url_to_id


def _is_validated(paper: dict) -> bool | None:
for flag in paper["flags"]:
if flag["name"] == "validation":
if flag["value"] == 1:
return True
elif flag["value"] == 0:
return False
else:
return None


def _parse_topic(topic: dict[str, str]) -> Topic:
    """Convert a v2 topic record into a v3 Topic (only the name carries over)."""
    name = topic["name"]
    return Topic(name=name)


def _parse_link(link: dict[str, str]) -> Link:
    """Convert a v2 link record into a v3 Link.

    If the record's URL maps to a known identifier scheme (via url_to_id),
    prefer that (type, id) pair; otherwise fall back to the record's own
    type (truncated at the first ".") and link fields.
    """
    # Fix: do not shadow the `link` parameter with the unpacked identifier;
    # use a distinct local name for clarity.
    type_and_id = url_to_id(link.get("url", ""))
    if type_and_id:
        typ, ident = type_and_id
        return Link(type=typ, link=ident)

    # v2 link types may be dotted sub-types; keep only the root component.
    return Link(type=link["type"].split(".", 1)[0], link=link["link"])


def _parse_institution(institution: dict[str, str]) -> Institution:
    """Convert a v2 institution record into a v3 Institution."""
    category = InstitutionCategory(institution["category"])
    return Institution(name=institution["name"], category=category)


def _parse_author(author: dict[str, str]) -> PaperAuthor:
    """Convert a v2 paper-author record into a v3 PaperAuthor."""
    v2_author = author["author"]
    # Keep a back-reference to the v2 author id as an extra link.
    links = [_parse_link(link) for link in v2_author["links"]]
    links.append(Link(type="paperoni_v2", link=v2_author["author_id"]))
    return PaperAuthor(
        display_name=v2_author["name"],
        author=Author(
            name=v2_author["name"],
            aliases=[],
            links=links,
        ),
        affiliations=[_parse_institution(a) for a in author["affiliations"]],
    )


def _parse_release(release: dict[str, Any]) -> Release:
    """Convert a v2 release record into a v3 Release."""
    venue = release["venue"]
    v2_date = venue["date"]
    # Prefer the explicit timestamp when present; otherwise let
    # DatePrecision.assimilate_date derive date fields from the text form.
    if v2_date.get("timestamp", None) is not None:
        base = {
            "date": datetime.datetime.fromtimestamp(v2_date["timestamp"]).date(),
        }
    else:
        base = DatePrecision.assimilate_date(v2_date["text"])
    # The explicit v2 precision always overrides whatever was derived above.
    date_kwargs = {
        **base,
        "date_precision": DatePrecision(v2_date["precision"]),
    }
    return Release(
        venue=Venue(
            type=VenueType(venue["type"]),
            name=venue["name"],
            series=venue["series"],
            **date_kwargs,
            volume=venue["volume"],
            publisher=venue["publisher"],
            links=[_parse_link(link) for link in venue["links"]],
            peer_reviewed=release["peer_reviewed"],
        ),
        status=release["status"],
        pages=release["pages"],
    )


# TODO: complete the conversion of the paperoni v2 database to the paperoni model
# - [ ] Complete the flags information
@dataclass
class PaperoniV2(Discoverer):
    """Discoverer that replays papers from a paperoni v2 JSON export file."""

    # The paperoni v2 JSON export file
    # [positional]
    # [metavar JSON]
    json: Path = field(default_factory=lambda: paperoni_v2_json)

    async def query(
        self,
        # Embed the paperoni v2 paper's JSON in the Paper info dictionary
        embed: bool = False,
    ) -> AsyncGenerator[Paper, None]:
        """Query the paperoni v2 database"""
        with self.json.open() as f:
            papers = json.load(f)

        for paper in papers:
            # Fix: evaluate the tri-state validation status once per paper
            # (it was previously computed three times per paper).
            validated = _is_validated(paper)
            flags: set[str] = set()
            if validated:
                flags.add("validated")
            elif validated is False:
                flags.add("~validated")
            yield Paper(
                title=paper["title"],
                abstract=paper["abstract"],
                authors=[_parse_author(a) for a in paper["authors"]],
                releases=[_parse_release(r) for r in paper["releases"]],
                topics=[_parse_topic(t) for t in paper["topics"]],
                links=[_parse_link(link) for link in paper["links"]],
                flags=flags,
                key=f"paperoni_v2:{paper['paper_id']}",
                info={"discovered_by": {"paperoni_v2": paper["paper_id"]}}
                | ({"v2": paper} if embed else {}),
                # 10.0 matches AutoValidate.score_threshold's default —
                # presumably so validated v2 papers pass auto-validation;
                # confirm against the consuming code.
                score=10.0 if validated else 0.0,
                version=datetime.datetime.now(),
            )


# Config-registered default path for the v2 JSON export, bound to the
# "paperoni.discovery.v2.data" key (None when the key is unset).
paperoni_v2_json: Path | None = gifnoc.define(
    "paperoni.discovery.v2.data", Path | None, defaults=None
)
1 change: 1 addition & 0 deletions src/paperoni/discovery/scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ class Scrape(Discoverer):
force: bool = False

async def query(self):
"""Query the pages to scrape"""
for link in self.links:
prompt_result = await prompt_html(
system_prompt=llm_config.system_prompt,
Expand Down
3 changes: 3 additions & 0 deletions tests/config/test-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ paperoni:
scrape:
urls:
- https://dadelani.github.io/publications
v2:
$class: paperoni.discovery.paperoni_v2:PaperoniV2
data: ../data/paperoniv2.json
refine:
prompt:
$class: GenAIPrompt
Expand Down
Loading
Loading