Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions config/basic.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ paperoni:
scrape:
urls:
- https://dadelani.github.io/publications
v2:
$class: paperoni.discovery.paperoni_v2:PaperoniV2
data: ${paperoni.data_path}/paperoniv2.json
refine:
prompt:
$class: paperoni.prompt:GenAIPrompt
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ pmlr = "paperoni.discovery.pmlr:PMLR"
jmlr = "paperoni.discovery.jmlr:JMLR"
synth = "paperoni.discovery.synth:Synth"
scrape = "paperoni.discovery.scrape:Scrape"
v2 = "paperoni.discovery.paperoni_v2:PaperoniV2"

[project.entry-points."outsight.fixtures"]
dash = "paperoni.dash:Dash"
Expand Down
55 changes: 54 additions & 1 deletion src/paperoni/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
from .collection.remotecoll import RemoteCollection
from .config import config
from .dash import History
from .discovery.paperoni_v2 import PaperoniV2
from .display import display, print_field, terminal_width
from .fulltext.locate import URL, locate_all
from .fulltext.pdf import PDF, CachePolicies, get_pdf
Expand Down Expand Up @@ -693,8 +694,60 @@ async def run(self, coll: "Coll"):
elif not len(coll.collection) and not len(await coll.collection.exclusions()):
logging.warning("Collection is not empty. Use --force to drop it.")

@dataclass
class Validate(Productor):
    """Validate the papers in the collection using the paperoni v2 database."""

    # The paperoni v2 database
    # [positional]
    # [metavar v2]
    command: Annotated[
        Any,
        FromEntryPoint(
            "paperoni.discovery",
            wrap=lambda cls: Auto[cls.query] if cls is PaperoniV2 else None,
        ),
    ]

    async def run(self, coll: "Coll"):
        """Propagate v2 'validated' flags onto matching papers in *coll*.

        Iterates the v2 export, and for each v2 paper flagged as validated,
        looks up the corresponding paper in the collection and marks it
        validated there too.
        """
        # NOTE(review): these lists are currently only used for counting;
        # they could also be saved to analyse which papers were not found
        # in the v3 database.
        validated = []
        not_found = []
        total = 0
        # Initial zero-progress notifications so the gauges appear immediately.
        send(progress=("Validated papers", len(validated), total))
        send(progress=("Papers not found", len(not_found), total))

        async for paper_v2 in self.iterate(embed=True):
            # iterate(embed=True) embeds the raw v2 record under info["v2"];
            # pop it so it is not persisted with the edited paper.
            paper_v2_json = paper_v2.info.pop("v2")
            total += 1

            # Only v2 papers explicitly flagged as validated are propagated.
            if "validated" not in paper_v2.flags:
                continue

            if paper := await coll.collection.find_paper(paper_v2):
                paper.flags.add("validated")
                await coll.collection.edit_paper(paper)
                validated.append(paper_v2_json)

            else:
                not_found.append(paper_v2_json)

            # NOTE(review): progress updates emitted per matched paper —
            # confirm this placement (inside the loop) matches the original.
            send(
                progress=(
                    "Found papers",
                    len(validated),
                    len(validated) + len(not_found),
                )
            )
            send(
                progress=(
                    "Validated papers",
                    len(validated) + len(not_found),
                    total,
                )
            )

# Command to execute
command: TaggedUnion[Search, Import, Export, Drop]
command: TaggedUnion[Search, Import, Export, Drop, Validate]

# Collection dir
# [alias: -c]
Expand Down
4 changes: 2 additions & 2 deletions src/paperoni/collection/abc.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from datetime import date
from typing import AsyncIterable, Iterable
from typing import AsyncGenerator, Iterable

from ..model.classes import Paper

Expand Down Expand Up @@ -62,7 +62,7 @@ async def search(
include_flags: list[str] = None,
# Flags that must be False
exclude_flags: list[str] = None,
) -> AsyncIterable[Paper]:
) -> AsyncGenerator[Paper, None]:
raise NotImplementedError()

def __len__(self) -> int:
Expand Down
6 changes: 3 additions & 3 deletions src/paperoni/collection/remotecoll.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
from dataclasses import dataclass, field
from datetime import date
from typing import Iterable
from typing import AsyncGenerator, Iterable

from fastapi import HTTPException
from serieux import deserialize
Expand Down Expand Up @@ -81,7 +81,7 @@ async def search(
include_flags: list[str] = None,
# Flags that must be False
exclude_flags: list[str] = None,
):
) -> AsyncGenerator[Paper, None]:
params = {}
if paper_id:
params["paper_id"] = paper_id
Expand All @@ -106,7 +106,7 @@ async def search(
while True:
query_params = params.copy()
query_params["offset"] = offset
resp = await self.fetch.read(
resp: dict = await self.fetch.read(
url,
format="json",
cache_into=None,
Expand Down
9 changes: 7 additions & 2 deletions src/paperoni/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import gifnoc
from easy_oauth import OAuthManager
from rapporteur.report import Reporter
from serieux import JSON, TaggedSubclass
from serieux import TaggedSubclass
from serieux.features.encrypt import Secret

from .collection.abc import PaperCollection
Expand Down Expand Up @@ -52,16 +52,21 @@ def __post_init__(self):
self.process_pool = ProcessPoolExecutor(**self.process_pool_executor)


@dataclass
class AutoValidate:
    """Configuration for automatic validation of discovered papers."""

    # Score cutoff for auto-validation — presumably compared against
    # Paper.score (e.g. PaperoniV2 assigns 10.0 to validated papers);
    # confirm against the consuming code.
    score_threshold: float = 10.0


@dataclass
class PaperoniConfig:
cache_path: Path = None
data_path: Path = None
mailto: str = ""
api_keys: Keys[str, Secret[str]] = field(default_factory=Keys)
discovery: JSON = None
fetch: TaggedSubclass[Fetcher] = field(default_factory=RequestsFetcher)
focuses: Focuses = field(default_factory=Focuses)
autofocus: AutoFocus[str, AutoFocus.Author] = field(default_factory=AutoFocus)
autovalidate: AutoValidate = field(default_factory=AutoValidate)
refine: Refine = None
work_file: Path = None
collection: TaggedSubclass[PaperCollection] = None
Expand Down
142 changes: 142 additions & 0 deletions src/paperoni/discovery/paperoni_v2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
import datetime
import json
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, AsyncGenerator

import gifnoc

from ..discovery.base import Discoverer
from ..model import (
Author,
DatePrecision,
Institution,
InstitutionCategory,
Link,
PaperAuthor,
Release,
Topic,
Venue,
VenueType,
)
from ..model.classes import Paper
from ..utils import url_to_id


def _is_validated(paper: dict) -> bool | None:
for flag in paper["flags"]:
if flag["name"] == "validation":
if flag["value"] == 1:
return True
elif flag["value"] == 0:
return False
else:
return None


def _parse_topic(topic: dict[str, str]) -> Topic:
    """Convert a v2 topic record into a v3 Topic (only the name carries over)."""
    name = topic["name"]
    return Topic(name=name)


def _parse_link(link: dict[str, str]) -> Link:
    """Convert a v2 link record into a v3 Link.

    If the record's URL maps to a known identifier scheme (via url_to_id),
    prefer that (type, id) pair; otherwise fall back to the record's own
    type (truncated at the first ".") and link fields.
    """
    # Fix: do not shadow the `link` parameter with the unpacked identifier;
    # use a distinct local name for clarity.
    type_and_id = url_to_id(link.get("url", ""))
    if type_and_id:
        typ, ident = type_and_id
        return Link(type=typ, link=ident)

    # v2 link types may be dotted sub-types; keep only the root component.
    return Link(type=link["type"].split(".", 1)[0], link=link["link"])


def _parse_institution(institution: dict[str, str]) -> Institution:
    """Convert a v2 institution record into a v3 Institution."""
    category = InstitutionCategory(institution["category"])
    return Institution(name=institution["name"], category=category)


def _parse_author(author: dict[str, str]) -> PaperAuthor:
    """Convert a v2 paper-author record into a v3 PaperAuthor."""
    v2_author = author["author"]
    # Keep a back-reference to the v2 author id as an extra link.
    links = [_parse_link(link) for link in v2_author["links"]]
    links.append(Link(type="paperoni_v2", link=v2_author["author_id"]))
    return PaperAuthor(
        display_name=v2_author["name"],
        author=Author(
            name=v2_author["name"],
            aliases=[],
            links=links,
        ),
        affiliations=[_parse_institution(a) for a in author["affiliations"]],
    )


def _parse_release(release: dict[str, Any]) -> Release:
    """Convert a v2 release record into a v3 Release."""
    venue = release["venue"]
    v2_date = venue["date"]
    # Prefer the explicit timestamp when present; otherwise let
    # DatePrecision.assimilate_date derive date fields from the text form.
    if v2_date.get("timestamp", None) is not None:
        base = {
            "date": datetime.datetime.fromtimestamp(v2_date["timestamp"]).date(),
        }
    else:
        base = DatePrecision.assimilate_date(v2_date["text"])
    # The explicit v2 precision always overrides whatever was derived above.
    date_kwargs = {
        **base,
        "date_precision": DatePrecision(v2_date["precision"]),
    }
    return Release(
        venue=Venue(
            type=VenueType(venue["type"]),
            name=venue["name"],
            series=venue["series"],
            **date_kwargs,
            volume=venue["volume"],
            publisher=venue["publisher"],
            links=[_parse_link(link) for link in venue["links"]],
            peer_reviewed=release["peer_reviewed"],
        ),
        status=release["status"],
        pages=release["pages"],
    )


# TODO: complete the conversion of the paperoni v2 database to the paperoni model
# - [ ] Complete the flags information
@dataclass
class PaperoniV2(Discoverer):
    """Discoverer that replays papers from a paperoni v2 JSON export file."""

    # The paperoni v2 JSON export file
    # [positional]
    # [metavar JSON]
    json: Path = field(default_factory=lambda: paperoni_v2_json)

    async def query(
        self,
        # Embed the paperoni v2 paper's JSON in the Paper info dictionary
        embed: bool = False,
    ) -> AsyncGenerator[Paper, None]:
        """Query the paperoni v2 database"""
        with self.json.open() as f:
            papers = json.load(f)

        for paper in papers:
            # Fix: evaluate the tri-state validation status once per paper
            # (it was previously computed three times per paper).
            validated = _is_validated(paper)
            flags: set[str] = set()
            if validated:
                flags.add("validated")
            elif validated is False:
                flags.add("~validated")
            yield Paper(
                title=paper["title"],
                abstract=paper["abstract"],
                authors=[_parse_author(a) for a in paper["authors"]],
                releases=[_parse_release(r) for r in paper["releases"]],
                topics=[_parse_topic(t) for t in paper["topics"]],
                links=[_parse_link(link) for link in paper["links"]],
                flags=flags,
                key=f"paperoni_v2:{paper['paper_id']}",
                info={"discovered_by": {"paperoni_v2": paper["paper_id"]}}
                | ({"v2": paper} if embed else {}),
                # 10.0 matches AutoValidate.score_threshold's default —
                # presumably so validated v2 papers pass auto-validation;
                # confirm against the consuming code.
                score=10.0 if validated else 0.0,
                version=datetime.datetime.now(),
            )


# Config-registered default path for the v2 JSON export, bound to the
# "paperoni.discovery.v2.data" key (None when the key is unset).
paperoni_v2_json: Path | None = gifnoc.define(
    "paperoni.discovery.v2.data", Path | None, defaults=None
)
1 change: 1 addition & 0 deletions src/paperoni/discovery/scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ class Scrape(Discoverer):
force: bool = False

async def query(self):
"""Query the pages to scrape"""
for link in self.links:
prompt_result = await prompt_html(
system_prompt=llm_config.system_prompt,
Expand Down
3 changes: 3 additions & 0 deletions tests/config/test-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ paperoni:
scrape:
urls:
- https://dadelani.github.io/publications
v2:
$class: paperoni.discovery.paperoni_v2:PaperoniV2
data: ../data/paperoniv2.json
refine:
prompt:
$class: GenAIPrompt
Expand Down
Loading
Loading