Skip to content

Commit aa12f83

Browse files
committed
default to extract text from source
1 parent 325c214 commit aa12f83

File tree

3 files changed

+45
-14
lines changed

3 files changed

+45
-14
lines changed

src/zotero_arxiv_daily/protocol.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,14 @@ def _generate_tldr_with_llm(self, openai_client:OpenAI,llm_params:dict) -> str:
2626
prompt = f"Given the following information of a paper, generate a one-sentence TLDR summary in {lang}:\n\n"
2727
if self.title:
2828
prompt += f"Title:\n {self.title}\n\n"
29+
30+
if self.abstract:
31+
prompt += f"Abstract: {self.abstract}\n\n"
32+
2933
if self.full_text:
3034
prompt += f"Preview of main content:\n {self.full_text}\n\n"
31-
elif self.abstract:
32-
prompt += f"Abstract: {self.abstract}\n\n"
33-
else:
35+
36+
if not self.full_text and not self.abstract:
3437
logger.warning(f"Neither full text nor abstract is provided for {self.url}")
3538
return "Failed to generate TLDR. Neither full text nor abstract is provided"
3639

src/zotero_arxiv_daily/retriever/arxiv_retriever.py

Lines changed: 38 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import arxiv
33
from arxiv import Result as ArxivResult
44
from ..protocol import Paper
5-
from ..utils import extract_markdown_from_pdf
5+
from ..utils import extract_markdown_from_pdf, extract_tex_code_from_tar
66
from tempfile import TemporaryDirectory
77
import feedparser
88
from urllib.request import urlretrieve
@@ -43,14 +43,9 @@ def convert_to_paper(self, raw_paper:ArxivResult) -> Paper:
4343
authors = [a.name for a in raw_paper.authors]
4444
abstract = raw_paper.summary
4545
pdf_url = raw_paper.pdf_url
46-
with TemporaryDirectory() as temp_dir:
47-
path = os.path.join(temp_dir, "paper.pdf")
48-
urlretrieve(pdf_url, path)
49-
try:
50-
full_text = extract_markdown_from_pdf(path)
51-
except Exception as e:
52-
logger.warning(f"Failed to extract full text of {title}: {e}")
53-
full_text = None
46+
full_text = extract_text_from_tar(raw_paper)
47+
if full_text is None:
48+
full_text = extract_text_from_pdf(raw_paper)
5449
return Paper(
5550
source=self.name,
5651
title=title,
@@ -59,4 +54,37 @@ def convert_to_paper(self, raw_paper:ArxivResult) -> Paper:
5954
url=raw_paper.entry_id,
6055
pdf_url=pdf_url,
6156
full_text=full_text
62-
)
57+
)
58+
59+
def extract_text_from_pdf(paper: ArxivResult) -> str | None:
    """Download the paper's PDF and convert it to markdown text.

    This is a best-effort helper: any failure while downloading or
    converting the PDF is logged as a warning and reported as ``None``
    instead of being raised.

    Args:
        paper: The arXiv result; its ``pdf_url`` and ``title`` are read.

    Returns:
        Whatever ``extract_markdown_from_pdf`` returns for the downloaded
        file, or ``None`` when the paper has no PDF URL or the download /
        extraction fails.
    """
    # Check the URL first so no temporary directory is created when there
    # is nothing to download.
    if paper.pdf_url is None:
        logger.warning(f"No PDF URL available for {paper.title}")
        return None

    with TemporaryDirectory() as temp_dir:
        path = os.path.join(temp_dir, "paper.pdf")
        try:
            # The download is inside the guarded region: a network/HTTP
            # error must degrade to "no full text", not abort the caller.
            urlretrieve(paper.pdf_url, path)
            return extract_markdown_from_pdf(path)
        except Exception as e:
            logger.warning(f"Failed to extract full text of {paper.title} from pdf: {e}")
            return None
72+
73+
def extract_text_from_tar(paper: ArxivResult) -> str | None:
    """Download the paper's source archive and return its combined TeX text.

    This is a best-effort helper: any failure while downloading or
    unpacking the archive is logged as a warning and reported as ``None``
    so that the caller can fall back to another extraction method.

    Args:
        paper: The arXiv result; ``source_url()``, ``entry_id`` and
            ``title`` are read.

    Returns:
        The value stored under the ``"all"`` key of the mapping returned by
        ``extract_tex_code_from_tar``, or ``None`` when there is no source
        URL, the download/extraction fails, or that key is absent.
    """
    source_url = paper.source_url()
    if source_url is None:
        logger.warning(f"No source URL available for {paper.title}")
        return None

    with TemporaryDirectory() as temp_dir:
        path = os.path.join(temp_dir, "paper.tar.gz")
        try:
            # The download is inside the guarded region: if the source
            # cannot be fetched we must return None (not raise) so the
            # PDF fallback in the caller still gets a chance to run.
            urlretrieve(source_url, path)
            file_contents = extract_tex_code_from_tar(path, paper.entry_id)
            # Kept inside the try on purpose: if the helper hands back
            # something that does not support ``in`` the error is handled
            # like any other extraction failure.
            if "all" not in file_contents:
                logger.warning(f"Failed to extract full text of {paper.title} from tar: Main tex file not found.")
                return None
            return file_contents["all"]
        except Exception as e:
            logger.warning(f"Failed to extract full text of {paper.title} from tar: {e}")
            return None

src/zotero_arxiv_daily/utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ def extract_tex_code_from_tar(file_path:str, paper_id:str) -> dict[str,str]:
5858
content = re.sub(r'\\\\', '', content)
5959
#remove consecutive spaces
6060
content = re.sub(r'[ \t\r\f]{3,}', ' ', content)
61-
if main_tex is None and re.search(r'\\begin\{document\}', content):
61+
if main_tex is None and re.search(r'\\begin\{document\}', content) and not any(w in t for w in ['example', 'sample']):
6262
main_tex = t
6363
logger.debug(f"Choose {t} as main tex file of {paper_id}")
6464
file_contents[t] = content

0 commit comments

Comments
 (0)