Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.12']
python-version: ['3.13']
steps:
- name: Check out the code
uses: actions/checkout@v3
Expand All @@ -33,8 +33,6 @@ jobs:
strategy:
matrix:
settings:
- python: '3.12'
coverage: false
- python: '3.13'
coverage: false
steps:
Expand Down
6 changes: 3 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ readme = "README.md"
license = "MIT"
requires-python = ">=3.12"
dependencies = [
"ovld>=0.5.9",
"serieux>=0.2.0",
"ovld>=0.5.10",
"serieux>=0.2.2",
"gifnoc>=0.6.0",
"backoff>=2.2.1",
"beautifulsoup4>=4.13.4",
Expand All @@ -23,7 +23,6 @@ dependencies = [
"openreview-py>=1.50.0",
"requests-cache>=1.2.1",
"lxml>=6.0.0",
"wrapt>=1.17.2",
"unidecode>=1.4.0",
"fake-useragent>=2.2.0",
"cloudscraper>=1.2.71",
Expand Down Expand Up @@ -52,6 +51,7 @@ openalex = "paperoni.discovery.openalex:OpenAlex"
miniconf = "paperoni.discovery.miniconf:MiniConf"
pmlr = "paperoni.discovery.pmlr:PMLR"
jmlr = "paperoni.discovery.jmlr:JMLR"
synth = "paperoni.discovery.synth:Synth"

[tool.ruff]
line-length = 90
Expand Down
110 changes: 85 additions & 25 deletions src/paperoni/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,17 @@

import yaml
from gifnoc import add_overlay, cli
from serieux import Auto, Registered, TaggedUnion, serialize, singleton
from serieux import Auto, Registered, TaggedUnion, deserialize, dump, serialize, singleton
from serieux.features.tagset import FromEntryPoint

from paperoni.model.focus import Focuses, Scored, Top

from .config import config
from .display import display, terminal_width
from .fulltext.locate import locate_all
from .fulltext.pdf import CachePolicies, get_pdf
from .model import PaperInfo
from .model.merge import merge_all
from .model.merge import PaperWorkingSet, merge_all
from .refinement import fetch_all
from .utils import url_to_id

Expand Down Expand Up @@ -68,32 +70,30 @@ def run(self):
self.format(papers)


def locate(
    # Reference to locate
    # [positional]
    ref: str,
):
    # Print every candidate location for `ref`, one per line: the location's
    # `info` tag in cyan (ANSI escape 36) followed by its URL.
    # NOTE(review): the comments in the signature are kept verbatim because
    # they appear to be read by the CLI layer -- confirm before editing them.
    for candidate in locate_all(ref):
        tag, target = candidate.info, candidate.url
        print(f"\033[36m[{tag}]\033[0m {target}")


def download(
    # Reference to locate
    # [positional]
    # [nargs: +]
    ref: list[str],
    # Cache policy
    # [alias: -p]
    cache_policy: Literal["use", "use_best", "no_download", "force"] = "use",
):
    # Map the lowercase CLI choice onto the upper-cased attribute of
    # CachePolicies (e.g. "use_best" -> CachePolicies.USE_BEST).
    policy = getattr(CachePolicies, cache_policy.upper())
    # Fetch the PDF for the given reference(s) and report where it landed.
    fetched = get_pdf(ref, cache_policy=policy)
    print("Downloaded into:", fetched.pdf_path.resolve())



@dataclass
class Fulltext:
    """Download and process fulltext."""

    def locate(
        # Reference to locate
        # [positional]
        ref: str,
    ):
        # One line per candidate location: its `info` tag in cyan (ANSI
        # escape 36), then the URL.
        for candidate in locate_all(ref):
            tag, target = candidate.info, candidate.url
            print(f"\033[36m[{tag}]\033[0m {target}")

    def download(
        # Reference to locate
        # [positional]
        # [nargs: +]
        ref: list[str],
        # Cache policy
        # [alias: -p]
        cache_policy: Literal["use", "use_best", "no_download", "force"] = "use",
    ):
        # The CLI choice is lowercase; the matching CachePolicies attribute
        # is its upper-cased name.
        policy = getattr(CachePolicies, cache_policy.upper())
        fetched = get_pdf(ref, cache_policy=policy)
        print("Downloaded into:", fetched.pdf_path.resolve())

    run: TaggedUnion[Auto[locate], Auto[download]]



Expand Down Expand Up @@ -124,11 +124,71 @@ def run(self):
self.format(results)


@dataclass
class Work:
    """Discover and work on prospective papers."""

    @dataclass
    class Get:
        """Get articles from various sources."""

        command: Annotated[
            Any, FromEntryPoint("paperoni.discovery", wrap=lambda cls: Auto[cls.query])
        ]

        def run(self, work):
            # `work` is the enclosing Work instance; it supplies the focuses
            # file, the optional exclusions file, the state file and `n`.
            focuses = deserialize(Focuses, work.focuses)

            # Exclusions are optional: leave the falsy value untouched,
            # otherwise load the file as a set of paper keys.
            excluded = work.exclusions
            if excluded:
                excluded = deserialize(set[str], excluded)

            # Resume from the saved working set when the state file exists;
            # otherwise start a fresh one built from `work.n`.
            workset_type = Top[Scored[PaperWorkingSet]]
            if not work.state.exists():
                workset = Top(work.n)
            else:
                workset = deserialize(workset_type, work.state)

            for paper in self.command():
                if not excluded or paper.key not in excluded:
                    entry = Scored(focuses.score(paper), PaperWorkingSet.make(paper))
                    workset.add(entry)

            # Persist the updated working set back to the state file.
            dump(workset_type, workset, dest=work.state)

    @dataclass
    class View:
        """View the articles in the workset."""

        # Output format
        format: Formatter = TerminalFormatter

        def run(self, work):
            # Load the saved working set and hand the current version of
            # each paper to the selected formatter.
            workset = deserialize(Top[Scored[PaperWorkingSet]], work.state)
            papers = [entry.value.current for entry in workset]
            self.format(papers)

    # Command
    command: TaggedUnion[Get, View]

    # File containing the working set
    # [alias: -w]
    state: Path

    # List of focuses
    # [alias: -f]
    focuses: Path

    # Exclusion list
    # [alias: -x]
    exclusions: Path = None

    # Number of papers to keep in the working set
    n: int = 10

    def run(self):
        # Subcommands receive this Work instance so they can read the
        # shared options (state, focuses, exclusions, n).
        subcommand = self.command
        subcommand.run(self)



@dataclass
class PaperoniInterface:
    """Paper database"""

    command: TaggedUnion[Discover, Refine, Fulltext, Work]

    def run(self):
        # Delegate to whichever subcommand (Discover, Refine, Fulltext or
        # Work) was selected for `command`. The earlier duplicate annotation
        # of `command` without Work was dead -- this one overrode it.
        self.command.run()
Expand Down
3 changes: 3 additions & 0 deletions src/paperoni/discovery/jmlr.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
Venue,
VenueType,
)
from ..model.focus import Focuses
from ..utils import asciiify
from .base import Discoverer

Expand All @@ -32,6 +33,8 @@ def query(
name: str = None,
# Whether to cache the download
cache: bool = True,
# A list of focuses
focuses: Focuses = None,
):
"""Query Journal of Machine Learning Research."""
name = name and asciiify(name).lower()
Expand Down
7 changes: 5 additions & 2 deletions src/paperoni/discovery/miniconf.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import math
import re
from datetime import datetime
from datetime import date, datetime
from enum import Enum

from ..config import config
Expand All @@ -18,6 +18,7 @@
Venue,
VenueType,
)
from ..model.focus import Focuses
from .base import Discoverer

conference_urls = {
Expand Down Expand Up @@ -178,6 +179,8 @@ def query(
cache: bool = True,
# Whether to raise an error if a paper cannot be converted
error_policy: ErrorPolicy = ErrorPolicy.LOG,
# A list of focuses
focuses: Focuses = None,
):
"""Query conference papers as JSON"""
# Get the base URL for the conference, defaulting to conference.cc if not found
Expand Down Expand Up @@ -228,7 +231,7 @@ def query(

# If no valid starttime found, use a default date
if conference_date is None:
conference_date = datetime(year, 1, 1)
conference_date = date(year, 1, 1)

def matches(paper):
if not affiliation and not author:
Expand Down
65 changes: 62 additions & 3 deletions src/paperoni/discovery/openalex.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pprint
import sys
from dataclasses import dataclass, field
from datetime import datetime
from datetime import date, datetime
from typing import Dict, List, Optional

from ..config import config
Expand All @@ -17,7 +18,9 @@
Topic,
Venue,
VenueType,
rescore,
)
from ..model.focus import Focus, Focuses
from ..utils import QueryError, link_generators as LINK_GENERATORS
from .base import Discoverer

Expand Down Expand Up @@ -186,7 +189,7 @@ def _wrap_paper(self, data: dict) -> PaperInfo:
release_locations = [data["primary_location"]]

# We will use work publication date with primary location to set release
publication_date = datetime.fromisoformat(data["publication_date"])
publication_date = date.fromisoformat(data["publication_date"])

# We will save open access url in paper links
oa_url = data["open_access"]["oa_url"]
Expand Down Expand Up @@ -216,7 +219,7 @@ def _wrap_paper(self, data: dict) -> PaperInfo:
Institution(
name=author_inst["display_name"],
category=INSTITUTION_CATEGORY_MAPPING.get(
author_inst["type"], "wat"
author_inst["type"], InstitutionCategory.unknown
),
aliases=[],
)
Expand Down Expand Up @@ -325,8 +328,64 @@ def query(
# If specified, display debug info
# [alias: -v]
verbose: bool = False,
# A list of focuses
focuses: Focuses = None,
):
"""Query OpenAlex for works."""
if focuses:
if limit is not None:
print(
"The 'limit' parameter is ignored when 'focuses' are provided.",
file=sys.stderr,
)
if any((page is not None, per_page is not None)):
print(
"The 'page' and 'per_page' parameters will effectively be multiplied by the number of 'focuses' provided.",
file=sys.stderr,
)

for focus in focuses.focuses:
match focus:
case Focus(drive_discovery=False):
continue
case Focus(type="author", name=name, score=score):
yield from rescore(
self.query(
author=name,
institution=institution,
title=title,
page=page,
per_page=per_page,
verbose=verbose,
),
score,
)
case Focus(type="author_openalex", name=aid, score=score):
yield from rescore(
self.query(
author_id=aid,
institution=institution,
title=title,
page=page,
per_page=per_page,
verbose=verbose,
),
score,
)
case Focus(type="institution", name=name, score=score):
yield from rescore(
self.query(
author=author,
institution=name,
title=title,
page=page,
per_page=per_page,
verbose=verbose,
),
score,
)
return

if verbose and self.mailto:
print("[openalex: using polite pool]")
qm = OpenAlexQueryManager(mailto=self.mailto)
Expand Down
Loading