Merged
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -14,7 +14,7 @@ license = "MIT"
 requires-python = ">=3.12"
 dependencies = [
     "ovld>=0.5.13",
-    "serieux>=0.3.4",
+    "serieux>=0.3.8",
     "gifnoc>=0.6.2",
     "beautifulsoup4>=4.13.4",
     "blessed>=1.21.0",
16 changes: 5 additions & 11 deletions src/paperoni/__main__.py
@@ -39,7 +39,7 @@
 from .client.utils import login
 from .collection.abc import PaperCollection
 from .collection.filecoll import FileCollection
-from .collection.finder import Finder
+from .collection.finder import find_equivalent, paper_index
 from .collection.remotecoll import RemoteCollection
 from .config import config
 from .dash import History
@@ -275,20 +275,14 @@ class Get(Productor):
 
     async def run(self, work: "Work"):
         ex = work.collection and (await work.collection.exclusions())
-
-        find = Finder(
-            title_finder=lambda scored: scored.value.current.title,
-            links_finder=lambda scored: scored.value.current.links,
-            authors_finder=lambda scored: scored.value.current.authors,
-            id_finder=lambda scored: getattr(scored.value.current, "id", None),
-        )
-        find.add(list(work.top))
+        index = paper_index()
+        index.index_all(list(work.top))
 
         async for paper in self.iterate(focuses=work.focuses):
             if ex and paper.key in ex:
                 continue
 
-            if found := find.find(paper):
+            if found := find_equivalent(paper, index):
                 found.value.add(paper)
                 new_score = work.focuses.score(found.value.current)
                 if new_score != found.score:
@@ -326,7 +320,7 @@ async def run(self, work: "Work"):
 
             if work.top.add(scored):
                 send(workset_added=1)
-                find.add([scored])
+                index.index(scored)
             work.save()
 
     @dataclass
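Note on the call-site migration in `Get.run`: the per-call `Finder` configured with field-extraction lambdas is replaced by a shared paper index plus a module-level lookup helper. A minimal sketch of the correspondence, using only names that appear in this diff:

```python
# Before: a Finder parameterized by per-field lambdas.
find = Finder(title_finder=..., links_finder=..., authors_finder=..., id_finder=...)
find.add(list(work.top))               # bulk insert
found = find.find(paper)               # fuzzy lookup
find.add([scored])                     # incremental insert

# After: a generic Index built from shared paper indexers.
index = paper_index()                  # Index(indexers=paper_indexers)
index.index_all(list(work.top))        # bulk insert
found = find_equivalent(paper, index)  # fuzzy lookup
index.index(scored)                    # incremental insert
```
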
23 changes: 6 additions & 17 deletions src/paperoni/collection/filecoll.py
@@ -1,30 +1,19 @@
 from dataclasses import dataclass, field
 from pathlib import Path
 
-from serieux import dump, load
+from serieux import deserialize
+from serieux.features.filebacked import FileProxy
 
 from ..utils import deprox
-from .memcoll import MemCollection
+from .memcoll import MemCollection, PaperIndex
 
 
 @dataclass(kw_only=True)
 class FileCollection(MemCollection):
     file: Path = field(compare=False)
 
     def __post_init__(self):
         super().__post_init__()
 
         self.file = deprox(self.file)
 
-        if not self.file.exists():
-            self.file.parent.mkdir(exist_ok=True, parents=True)
-            self._commit()
-
-        self.__dict__.update(vars(load(MemCollection, self.file)))
+        ann = FileProxy(default_factory=PaperIndex, refresh=True)
+        self._index = deserialize(PaperIndex @ ann, str(self.file))
 
     def _commit(self) -> None:
-        dump(type(self), self, dest=self.file)
-
-    @classmethod
-    def serieux_serialize(cls, obj, ctx, call_next):
-        return call_next(MemCollection, obj, ctx)
+        self._index.save()
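For context on the new `__post_init__`, the file-backed pattern appears to work as follows; a minimal sketch, assuming (as this diff suggests) that a `FileProxy` annotation makes serieux tie the deserialized object to its backing file, with `default_factory` covering a missing file and `save()` writing state back:

```python
from serieux import deserialize
from serieux.features.filebacked import FileProxy

from .memcoll import PaperIndex

# Annotate PaperIndex with its file-backing behaviour, then
# deserialize it straight from a path.
ann = FileProxy(default_factory=PaperIndex, refresh=True)
index = deserialize(PaperIndex @ ann, "collection.yaml")  # hypothetical path

# ... mutate the index in memory ...
index.save()  # persist, which is all _commit() does now
```

This drops the hand-rolled exists/mkdir/load/dump bookkeeping that the old `__post_init__` and `_commit` carried.
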
151 changes: 107 additions & 44 deletions src/paperoni/collection/finder.py
@@ -1,56 +1,119 @@
 import logging
-from dataclasses import dataclass, field
-from typing import Callable
+from dataclasses import dataclass
+from typing import Any, Callable, Iterable
 
+from ovld import ovld, recurse
+
 from ..model import PaperWorkingSet, Scored
 from ..model.classes import Paper
-from ..utils import normalize_title, quick_author_similarity
+from ..utils import normalize_name, normalize_title, quick_author_similarity
 
 
 @dataclass
-class Finder[T]:
-    title_finder: Callable = lambda p: p.title
-    links_finder: Callable = lambda p: p.links
-    authors_finder: Callable = lambda p: p.authors
-    id_finder: Callable = lambda p: getattr(p, "id", None)
-    by_title: dict[str, T] = field(default_factory=dict)
-    by_link: dict[str, T] = field(default_factory=dict)
-    by_id: dict[str, T] = field(default_factory=dict)
-
-    def add(self, entries: list[T]):
+class Index[T]:
+    indexers: dict[str, Callable]
+    indexes: dict[str, dict[Any, T]] = None
+
+    def __post_init__(self):
+        self.indexes = {name: {} for name in self.indexers}
+
+    def index_all(self, entries: Iterable[T]):
         for entry in entries:
-            for lnk in self.links_finder(entry):
-                self.by_link[lnk] = entry
-            self.by_title[normalize_title(self.title_finder(entry))] = entry
-            if (i := self.id_finder(entry)) is not None:
-                self.by_id[i] = entry
-
-    def find(self, p: Paper):
-        for lnk in p.links:
-            if result := self.by_link.get(lnk, None):
-                return result
-        same_title = self.by_title.get(normalize_title(p.title), None)
-        if same_title:
-            au1 = {a.display_name for a in p.authors}
-            au2 = {a.display_name for a in self.authors_finder(same_title)}
-            sim = quick_author_similarity(au1, au2)
-            if sim >= 0.8:
-                return same_title
-            else:
-                logging.warning(
-                    f"Title match but low author similarity ({sim:.2f}) for paper '{p.title}': {au1} vs {au2}"
-                )
-        return None
+            self.index(entry)
+
+    def index(self, entry: T):
+        for name, fn in self.indexers.items():
+            idx = self.indexes[name]
+            for value in fn(entry):
+                idx[value] = entry
Comment on lines +24 to +28

Member:
Would the following be useful to avoid unexpected replacement of an entry?

Suggested change:

    def index(self, entry: T):
        for name, fn in self.indexers.items():
            idx = self.indexes[name]
            for value in fn(entry):
                assert value not in idx or idx[value] == entry
                idx[value] = entry

Member (Author):
Collisions aren't necessarily a big deal here, they might happen for titles, but that's not really going to break anything.

     def remove(self, entry: T):
-        for lnk in self.links_finder(entry):
-            self.by_link.pop(lnk, None)
-        title_key = normalize_title(self.title_finder(entry))
-        self.by_title.pop(title_key, None)
-        if i := self.id_finder(entry):
-            self.by_id.pop(i, None)
+        for name, fn in self.indexers.items():
+            idx = self.indexes[name]
+            for value in fn(entry):
+                idx.pop(value)
 
     def replace(self, entry: T):
-        entry_id = self.id_finder(entry)
-        if entry_id and (old_entry := self.by_id.get(entry_id)):
+        old_entry = self.equiv("id", entry)
+        if old_entry is not None:
             self.remove(old_entry)
-        self.add([entry])
+        self.index(entry)
+
+    def find(self, index: str, value: Any):
+        return self.indexes[index].get(value, None)
+
+    def equiv(self, index: str, model: T):
+        idx = self.indexes[index]
+        for value in self.indexers[index](model):
+            if result := idx.get(value, None):
+                return result
+        else:
+            return None
+
+
+def find_equivalent(p: Paper, idx: Index):
+    if result := idx.equiv("links", p):
+        return result
+    if same_title := idx.equiv("title", p):
+        au1 = list(extract_authors(p))
+        au2 = list(extract_authors(same_title))
+        sim = quick_author_similarity(au1, au2)
+        if sim >= 0.8:
+            return same_title
+        else:
+            logging.warning(
+                f"Title match but low author similarity ({sim:.2f}) for paper '{p.title}': {au1} vs {au2}"
+            )
+    return None
+
+
+@ovld
+def to_paper(p: PaperWorkingSet):
+    yield from recurse(p.current)
+
+
+@ovld
+def to_paper(p: Scored):
+    yield from recurse(p.value)
+
+
+@to_paper.variant
+def extract_title(p: Paper):
+    yield normalize_title(p.title)
+
+
+@to_paper.variant
+def extract_id(p: Paper):
+    yield p.id
+
+
+@to_paper.variant
+def extract_authors(p: Paper):
+    for a in p.authors:
+        yield normalize_name(a.display_name)
+
+
+@to_paper.variant
+def extract_latest(p: Paper):
+    if p.releases:
+        d = max(release.venue.date for release in p.releases)
+        yield f"{d}::{p.id}"
+    else:
+        yield f"0::{p.id}"
+
+
+@to_paper.variant
+def extract_links(p: Paper):
+    yield from p.links
+
+
+paper_indexers = {
+    "id": extract_id,
+    "title": extract_title,
+    "links": extract_links,
+    "latest": extract_latest,
+}
+
+
+def paper_index():
+    return Index(indexers=paper_indexers)
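Taken together, the pieces above compose like this; a minimal usage sketch, where `papers` stands in for any iterable of `Scored` entries and `incoming` for a freshly fetched `Paper` (both hypothetical names):

```python
index = paper_index()    # Index wired with the id/title/links/latest indexers
index.index_all(papers)  # bulk-index existing entries

# Exact lookup against a single index:
hit = index.find("id", some_paper_id)  # some_paper_id is a placeholder

# Fuzzy equivalence, as find_equivalent implements: links first, then
# normalized title gated by author similarity (threshold 0.8).
if found := find_equivalent(incoming, index):
    found.value.add(incoming)
```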