Commit 4e8d06e

Update PDF class, add CLI for fulltext
1 parent e9b77f4 commit 4e8d06e

8 files changed: +199 -219 lines changed

src/paperoni/__main__.py

Lines changed: 34 additions & 3 deletions
@@ -2,7 +2,7 @@
 import json
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Annotated, Any
+from typing import Annotated, Any, Literal
 
 import yaml
 from gifnoc import add_overlay, cli
@@ -11,6 +11,8 @@
 
 from .config import config
 from .display import display, terminal_width
+from .fulltext.locate import locate_all
+from .fulltext.pdf import CachePolicies, get_pdf
 from .model import PaperInfo
 from .model.merge import merge_all
 from .refinement import fetch_all
@@ -50,7 +52,7 @@ class Discover:
     """Discover papers from various sources."""
 
     command: Annotated[
-        Any, FromEntryPoint("paperoni.discovery", wrap=lambda cls: Auto[cls().query])
+        Any, FromEntryPoint("paperoni.discovery", wrap=lambda cls: Auto[cls.query])
     ]
 
     # Output format
@@ -66,6 +68,35 @@ def run(self):
         self.format(papers)
 
 
+def locate(
+    # Reference to locate
+    # [positional]
+    ref: str,
+):
+    for url in locate_all(ref):
+        print(f"\033[36m[{url.info}]\033[0m {url.url}")
+
+
+def download(
+    # Reference to locate
+    # [positional]
+    # [nargs: +]
+    ref: list[str],
+    # Cache policy
+    # [alias: -p]
+    cache_policy: Literal["use", "use_best", "no_download", "force"] = "use",
+):
+    p = get_pdf(ref, cache_policy=getattr(CachePolicies, cache_policy.upper()))
+    print("Downloaded into:", p.pdf_path.resolve())
+
+
+@dataclass
+class Fulltext:
+    """Download and process fulltext."""
+
+    run: TaggedUnion[Auto[locate], Auto[download]]
+
+
 @dataclass
 class Refine:
     """Refine paper information."""
@@ -97,7 +128,7 @@ def run(self):
 class PaperoniInterface:
     """Paper database"""
 
-    command: TaggedUnion[Discover, Refine]
+    command: TaggedUnion[Discover, Refine, Fulltext]
 
     def run(self):
         self.command.run()
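
For reference, the new locate and download commands are plain module-level functions, so they can also be exercised directly from Python, outside the gifnoc CLI wiring. This is only a usage sketch; the DOI string below is a made-up placeholder, not something from this commit.

# Usage sketch only; the reference string is a placeholder.
from paperoni.__main__ import download, locate

# Print every candidate fulltext URL found for a reference.
locate("10.0000/example-doi")

# Fetch the first PDF that resolves; "use_best" is mapped to
# CachePolicies.USE_BEST via cache_policy.upper() inside download().
download(["10.0000/example-doi"], cache_policy="use_best")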

src/paperoni/discovery/openalex.py

Lines changed: 1 addition & 0 deletions
@@ -300,6 +300,7 @@ def _reconstruct_abstract(cls, inverted: Dict[str, List[int]]) -> str:
 
 @dataclass
 class OpenAlex(Discoverer):
+    # Email associated with the query, for politeness
     mailto: str = field(default_factory=lambda: config.mailto)
 
     def query(

src/paperoni/fulltext/download.py

Lines changed: 0 additions & 162 deletions
This file was deleted.

src/paperoni/fulltext/pdf.py

Lines changed: 121 additions & 0 deletions
@@ -0,0 +1,121 @@
+import hashlib
+import shutil
+from dataclasses import dataclass
+
+from serieux import dump, load
+
+from ..config import config
+from .locate import URL, locate_all
+
+
+@dataclass
+class CachePolicy:
+    use: bool = True
+    download: bool = True
+    best: bool = False
+
+
+class CachePolicies:
+    USE = CachePolicy(use=True, download=True, best=False)
+    USE_BEST = CachePolicy(use=True, download=True, best=True)
+    NO_DOWNLOAD = CachePolicy(use=True, download=False, best=False)
+    FORCE = CachePolicy(use=False, download=True, best=True)
+
+
+@dataclass
+class ErrorData:
+    type: str
+    message: str
+
+
+@dataclass
+class Info:
+    id: str = None
+    title: str = None
+    ref: str = None
+
+
+@dataclass
+class PDF:
+    source: URL
+    hash: str = None
+    info: Info = None
+    success: bool = False
+    error: ErrorData = None
+
+    def __post_init__(self):
+        if self.hash is None:
+            self.hash = hashlib.sha256(str(self.source).encode()).hexdigest()
+        self.directory = config.data_path / "pdf" / self.hash
+        self.meta_path = self.directory / "meta.yaml"
+        self.pdf_path = self.directory / "fulltext.pdf"
+
+    def ensure(self):
+        self.directory.mkdir(parents=True, exist_ok=True)
+
+    def load(self):
+        if self.meta_path.exists():
+            return load(PDF, self.meta_path)
+        else:
+            return self
+
+    def dump(self):
+        self.ensure()
+        dump(PDF, self, dest=self.meta_path)
+
+    def clear(self):
+        assert self.directory.resolve().is_relative_to(config.data_path.resolve())
+        shutil.rmtree(self.directory, ignore_errors=True)
+
+    def fetch(self):
+        self.ensure()
+        try:
+            config.fetch.download(
+                url=self.source.url,
+                filename=self.pdf_path,
+            )
+            self.success = True
+            return self.pdf_path
+        except Exception as exc:
+            self.error = ErrorData(
+                type=type(exc).__name__,
+                message=str(exc),
+            )
+            self.success = False
+            raise
+        finally:
+            self.dump()
+
+    def fulltext(self, cache_policy: CachePolicy = CachePolicies.USE):
+        if not cache_policy.use or not self.success or not self.pdf_path.exists():
+            if not cache_policy.download:
+                raise Exception(
+                    f"No PDF for {self.source.url} and cache policy prevents downloading"
+                )
+            return self.fetch()
+        else:
+            return self.pdf_path
+
+
+def get_pdf(refs, cache_policy: CachePolicy = CachePolicies.USE):
+    if isinstance(refs, str):
+        refs = [refs]
+
+    if cache_policy.use and not cache_policy.best:
+        urls = [url for ref in refs for url in locate_all(ref)]
+        for url in urls:
+            if (p := PDF(url).load()).success:
+                return p
+
+    exceptions = []
+    for ref in refs:
+        for url in locate_all(ref):
+            p = PDF(url).load()
+            try:
+                p.fulltext(cache_policy=cache_policy)
+                return p
+            except Exception as exc:
+                exceptions.append(exc)
+                continue
+
+    raise ExceptionGroup("No fulltext found for any reference", exceptions)
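
Reading CachePolicies together with get_pdf and PDF.fulltext: USE reuses a previously downloaded PDF for any located URL before attempting a download; USE_BEST skips that cache-wide shortcut and walks the located URLs in order (still reusing a cached copy per URL); NO_DOWNLOAD only ever serves from the cache and raises if nothing is cached; FORCE re-fetches regardless of the cache. A minimal usage sketch, assuming config.data_path and config.fetch are set up and using a placeholder reference:

# Usage sketch only; the arXiv id is a placeholder, not part of this commit.
from paperoni.fulltext.pdf import CachePolicies, get_pdf

pdf = get_pdf("arXiv:0000.00000", cache_policy=CachePolicies.USE)
print(pdf.source.url)   # the URL the PDF was (or would be) fetched from
print(pdf.pdf_path)     # <config.data_path>/pdf/<sha256 of source URL>/fulltext.pdf
print(pdf.meta_path)    # sidecar meta.yaml written by PDF.dump()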
