mila-iqia · breuleux · Aug 18, 2025 · Aug 6, 2025 · Aug 6, 2025 · Aug 6, 2025
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
@@ -11,7 +11,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ['3.12']
+        python-version: ['3.13']
     steps:
     - name: Check out the code
       uses: actions/checkout@v3
@@ -33,8 +33,6 @@ jobs:
     strategy:
       matrix:
         settings:
-        - python: '3.12'
-          coverage: false
         - python: '3.13'
           coverage: false
     steps:

diff --git a/pyproject.toml b/pyproject.toml
@@ -13,8 +13,8 @@ readme = "README.md"
 license = "MIT"
 requires-python = ">=3.12"
 dependencies = [
-    "ovld>=0.5.9",
-    "serieux>=0.2.0",
+    "ovld>=0.5.10",
+    "serieux>=0.2.2",
     "gifnoc>=0.6.0",
     "backoff>=2.2.1",
     "beautifulsoup4>=4.13.4",
@@ -23,7 +23,6 @@ dependencies = [
     "openreview-py>=1.50.0",
     "requests-cache>=1.2.1",
     "lxml>=6.0.0",
-    "wrapt>=1.17.2",
     "unidecode>=1.4.0",
     "fake-useragent>=2.2.0",
     "cloudscraper>=1.2.71",
@@ -52,6 +51,7 @@ openalex = "paperoni.discovery.openalex:OpenAlex"
 miniconf = "paperoni.discovery.miniconf:MiniConf"
 pmlr = "paperoni.discovery.pmlr:PMLR"
 jmlr = "paperoni.discovery.jmlr:JMLR"
+synth = "paperoni.discovery.synth:Synth"
 
 [tool.ruff]
 line-length = 90

diff --git a/src/paperoni/__main__.py b/src/paperoni/__main__.py
@@ -6,15 +6,17 @@
 
 import yaml
 from gifnoc import add_overlay, cli
-from serieux import Auto, Registered, TaggedUnion, serialize, singleton
+from serieux import Auto, Registered, TaggedUnion, deserialize, dump, serialize, singleton
 from serieux.features.tagset import FromEntryPoint
 
+from paperoni.model.focus import Focuses, Scored, Top
+
 from .config import config
 from .display import display, terminal_width
 from .fulltext.locate import locate_all
 from .fulltext.pdf import CachePolicies, get_pdf
 from .model import PaperInfo
-from .model.merge import merge_all
+from .model.merge import PaperWorkingSet, merge_all
 from .refinement import fetch_all
 from .utils import url_to_id
 
@@ -68,32 +70,30 @@ def run(self):
         self.format(papers)
 
 
-def locate(
-    # Reference to locate
-    # [positional]
-    ref: str,
-):
-    for url in locate_all(ref):
-        print(f"\033[36m[{url.info}]\033[0m {url.url}")
-
-
-def download(
-    # Reference to locate
-    # [positional]
-    # [nargs: +]
-    ref: list[str],
-    # Cache policy
-    # [alias: -p]
-    cache_policy: Literal["use", "use_best", "no_download", "force"] = "use",
-):
-    p = get_pdf(ref, cache_policy=getattr(CachePolicies, cache_policy.upper()))
-    print("Downloaded into:", p.pdf_path.resolve())
-
-
 @dataclass
 class Fulltext:
     """Download and process fulltext."""
 
+    def locate(
+        # Reference to locate
+        # [positional]
+        ref: str,
+    ):
+        # NOTE: no ``self`` -- this function is referenced as ``Auto[locate]`` by the
+        # ``run`` field below rather than being called as a bound method.
+        # Print every URL yielded by locate_all, prefixed by its ``info`` tag wrapped
+        # in the ANSI cyan escape sequence (\033[36m ... \033[0m).
+        for url in locate_all(ref):
+            print(f"\033[36m[{url.info}]\033[0m {url.url}")
+
+    def download(
+        # Reference to locate
+        # [positional]
+        # [nargs: +]
+        ref: list[str],
+        # Cache policy
+        # [alias: -p]
+        cache_policy: Literal["use", "use_best", "no_download", "force"] = "use",
+    ):
+        # NOTE: no ``self`` here either; referenced as ``Auto[download]`` below.
+        # The CLI string is upper-cased and looked up as an attribute of CachePolicies
+        # (e.g. "use_best" -> CachePolicies.USE_BEST) before being handed to get_pdf.
+        p = get_pdf(ref, cache_policy=getattr(CachePolicies, cache_policy.upper()))
+        print("Downloaded into:", p.pdf_path.resolve())
+
     run: TaggedUnion[Auto[locate], Auto[download]]
 
 
@@ -124,11 +124,88 @@ def run(self):
         self.format(results)
 
 
+@dataclass
+class Work:
+    """Discover and work on prospective papers."""
+
+    @dataclass
+    class Get:
+        """Get articles from various sources."""
+
+        command: Annotated[
+            Any, FromEntryPoint("paperoni.discovery", wrap=lambda cls: Auto[cls.query])
+        ]
+
+        def run(self, work):
+            """Score discovered papers and merge them into the working set file.
+
+            ``work`` is the enclosing ``Work`` instance; its ``focuses``,
+            ``exclusions``, ``state`` and ``n`` fields are read here.
+            """
+            focuses = deserialize(Focuses, work.focuses)
+            ex = work.exclusions and deserialize(set[str], work.exclusions)
+            if work.state.exists():
+                # NOTE(review): an existing state file is loaded as-is, so ``work.n``
+                # is only consulted when a fresh working set is created -- confirm.
+                top = deserialize(Top[Scored[PaperWorkingSet]], work.state)
+            else:
+                top = Top(work.n)
+            for paper in self.command():
+                # Skip papers whose key appears in the exclusion list, if one was given.
+                if ex and paper.key in ex:
+                    continue
+                scored = Scored(focuses.score(paper), PaperWorkingSet.make(paper))
+                top.add(scored)
+            dump(Top[Scored[PaperWorkingSet]], top, dest=work.state)
+
+    @dataclass
+    class View:
+        """View the articles in the workset."""
+
+        # Output format
+        format: Formatter = TerminalFormatter
+
+        def run(self, work):
+            """Display the ``current`` paper of every entry in the working set.
+
+            A state file that does not exist yet is treated as an empty working
+            set, mirroring ``Get.run``, instead of failing on the missing file.
+            """
+            if work.state.exists():
+                top = deserialize(Top[Scored[PaperWorkingSet]], work.state)
+            else:
+                top = Top(work.n)
+            papers = [ws.value.current for ws in top]
+            self.format(papers)
+
+    # Command
+    command: TaggedUnion[Get, View]
+
+    # File containing the working set
+    # [alias: -w]
+    state: Path
+
+    # List of focuses
+    # [alias: -f]
+    focuses: Path
+
+    # Exclusion list
+    # [alias: -x]
+    exclusions: Path = None
+
+    # Number of papers to keep in the working set
+    n: int = 10
+
+    def run(self):
+        """Run the selected subcommand, passing this instance as its context."""
+        self.command.run(self)
+
+
 @dataclass
 class PaperoniInterface:
     """Paper database"""
 
-    command: TaggedUnion[Discover, Refine, Fulltext]
+    command: TaggedUnion[Discover, Refine, Fulltext, Work]
 
     def run(self):
         self.command.run()

diff --git a/src/paperoni/discovery/jmlr.py b/src/paperoni/discovery/jmlr.py
@@ -15,6 +15,7 @@
     Venue,
     VenueType,
 )
+from ..model.focus import Focuses
 from ..utils import asciiify
 from .base import Discoverer
 
@@ -32,6 +33,8 @@ def query(
         name: str = None,
         # Whether to cache the download
         cache: bool = True,
+        # A list of focuses
+        focuses: Focuses = None,
     ):
         """Query Journal of Machine Learning Research."""
         name = name and asciiify(name).lower()

diff --git a/src/paperoni/discovery/miniconf.py b/src/paperoni/discovery/miniconf.py
@@ -1,6 +1,6 @@
 import math
 import re
-from datetime import datetime
+from datetime import date, datetime
 from enum import Enum
 
 from ..config import config
@@ -18,6 +18,7 @@
     Venue,
     VenueType,
 )
+from ..model.focus import Focuses
 from .base import Discoverer
 
 conference_urls = {
@@ -178,6 +179,8 @@ def query(
         cache: bool = True,
         # Whether to raise an error if a paper cannot be converted
         error_policy: ErrorPolicy = ErrorPolicy.LOG,
+        # A list of focuses
+        focuses: Focuses = None,
     ):
         """Query conference papers as JSON"""
         # Get the base URL for the conference, defaulting to conference.cc if not found
@@ -228,7 +231,7 @@ def query(
 
         # If no valid starttime found, use a default date
         if conference_date is None:
-            conference_date = datetime(year, 1, 1)
+            conference_date = date(year, 1, 1)
 
         def matches(paper):
             if not affiliation and not author:

diff --git a/src/paperoni/discovery/openalex.py b/src/paperoni/discovery/openalex.py
@@ -1,6 +1,7 @@
 import pprint
+import sys
 from dataclasses import dataclass, field
-from datetime import datetime
+from datetime import date, datetime
 from typing import Dict, List, Optional
 
 from ..config import config
@@ -17,7 +18,9 @@
     Topic,
     Venue,
     VenueType,
+    rescore,
 )
+from ..model.focus import Focus, Focuses
 from ..utils import QueryError, link_generators as LINK_GENERATORS
 from .base import Discoverer
 
@@ -186,7 +189,7 @@ def _wrap_paper(self, data: dict) -> PaperInfo:
             release_locations = [data["primary_location"]]
 
         # We will use work publication date with primary location to set release
-        publication_date = datetime.fromisoformat(data["publication_date"])
+        publication_date = date.fromisoformat(data["publication_date"])
 
         # We will save open access url in paper links
         oa_url = data["open_access"]["oa_url"]
@@ -216,7 +219,7 @@ def _wrap_paper(self, data: dict) -> PaperInfo:
                         Institution(
                             name=author_inst["display_name"],
                             category=INSTITUTION_CATEGORY_MAPPING.get(
-                                author_inst["type"], "wat"
+                                author_inst["type"], InstitutionCategory.unknown
                             ),
                             aliases=[],
                         )
@@ -325,8 +328,64 @@ def query(
         # If specified, display debug info
         # [alias: -v]
         verbose: bool = False,
+        # A list of focuses
+        focuses: Focuses = None,
     ):
         """Query OpenAlex for works."""
+        if focuses:
+            if limit is not None:
+                print(
+                    "The 'limit' parameter is ignored when 'focuses' are provided.",
+                    file=sys.stderr,
+                )
+            if any((page is not None, per_page is not None)):
+                print(
+                    "The 'page' and 'per_page' parameters will effectively be multiplied by the number of 'focuses' provided.",
+                    file=sys.stderr,
+                )
+
+            for focus in focuses.focuses:
+                match focus:
+                    case Focus(drive_discovery=False):
+                        continue
+                    case Focus(type="author", name=name, score=score):
+                        yield from rescore(
+                            self.query(
+                                author=name,
+                                institution=institution,
+                                title=title,
+                                page=page,
+                                per_page=per_page,
+                                verbose=verbose,
+                            ),
+                            score,
+                        )
+                    case Focus(type="author_openalex", name=aid, score=score):
+                        yield from rescore(
+                            self.query(
+                                author_id=aid,
+                                institution=institution,
+                                title=title,
+                                page=page,
+                                per_page=per_page,
+                                verbose=verbose,
+                            ),
+                            score,
+                        )
+                    case Focus(type="institution", name=name, score=score):
+                        yield from rescore(
+                            self.query(
+                                author=author,
+                                institution=name,
+                                title=title,
+                                page=page,
+                                per_page=per_page,
+                                verbose=verbose,
+                            ),
+                            score,
+                        )
+            return
+
         if verbose and self.mailto:
             print("[openalex: using polite pool]")
         qm = OpenAlexQueryManager(mailto=self.mailto)