Default to extracting text from source

TideDra · TideDra · commit aa12f8367cb5 · 2026-03-04T09:12:23.000Z
diff --git a/src/zotero_arxiv_daily/protocol.py b/src/zotero_arxiv_daily/protocol.py
@@ -26,11 +26,14 @@ def _generate_tldr_with_llm(self, openai_client:OpenAI,llm_params:dict) -> str:
         prompt = f"Given the following information of a paper, generate a one-sentence TLDR summary in {lang}:\n\n"
         if self.title:
             prompt += f"Title:\n {self.title}\n\n"
+
+        if self.abstract:
+            prompt += f"Abstract: {self.abstract}\n\n"
+
         if self.full_text:
             prompt += f"Preview of main content:\n {self.full_text}\n\n"
-        elif self.abstract:
-            prompt += f"Abstract: {self.abstract}\n\n"
-        else:
+
+        if not self.full_text and not self.abstract:
             logger.warning(f"Neither full text nor abstract is provided for {self.url}")
             return "Failed to generate TLDR. Neither full text nor abstract is provided"
         
diff --git a/src/zotero_arxiv_daily/retriever/arxiv_retriever.py b/src/zotero_arxiv_daily/retriever/arxiv_retriever.py
@@ -2,7 +2,7 @@
 import arxiv
 from arxiv import Result as ArxivResult
 from ..protocol import Paper
-from ..utils import extract_markdown_from_pdf
+from ..utils import extract_markdown_from_pdf, extract_tex_code_from_tar
 from tempfile import TemporaryDirectory
 import feedparser
 from urllib.request import urlretrieve
@@ -43,14 +43,9 @@ def convert_to_paper(self, raw_paper:ArxivResult) -> Paper:
         authors = [a.name for a in raw_paper.authors]
         abstract = raw_paper.summary
         pdf_url = raw_paper.pdf_url
-        with TemporaryDirectory() as temp_dir:
-            path = os.path.join(temp_dir, "paper.pdf")
-            urlretrieve(pdf_url, path)
-            try:
-                full_text = extract_markdown_from_pdf(path)
-            except Exception as e:
-                logger.warning(f"Failed to extract full text of {title}: {e}")
-                full_text = None
+        full_text = extract_text_from_tar(raw_paper)
+        if full_text is None:
+            full_text = extract_text_from_pdf(raw_paper)
         return Paper(
             source=self.name,
             title=title,
@@ -59,4 +54,37 @@ def convert_to_paper(self, raw_paper:ArxivResult) -> Paper:
             url=raw_paper.entry_id,
             pdf_url=pdf_url,
             full_text=full_text
-        )
+        )
+
+def extract_text_from_pdf(paper: ArxivResult) -> str | None:
+    """Return the text extracted from the paper's PDF, or None when there is no PDF URL or download/extraction fails."""
+    if paper.pdf_url is None:
+        logger.warning(f"No PDF URL available for {paper.title}")
+        return None
+    with TemporaryDirectory() as temp_dir:
+        path = os.path.join(temp_dir, "paper.pdf")
+        try:  # the download is guarded too, so a failed fetch yields None instead of raising
+            urlretrieve(paper.pdf_url, path)
+            return extract_markdown_from_pdf(path)
+        except Exception as e:
+            logger.warning(f"Failed to extract full text of {paper.title} from pdf: {e}")
+            return None
+
+def extract_text_from_tar(paper: ArxivResult) -> str | None:
+    """Return the "all" entry extracted from the paper's source tarball, or None when it cannot be downloaded or parsed."""
+    source_url = paper.source_url()
+    if source_url is None:
+        logger.warning(f"No source URL available for {paper.title}")
+        return None
+    with TemporaryDirectory() as temp_dir:
+        path = os.path.join(temp_dir, "paper.tar.gz")
+        try:  # the download is guarded too, so a failed fetch yields None (letting the caller fall back to the PDF) instead of raising
+            urlretrieve(source_url, path)
+            file_contents = extract_tex_code_from_tar(path, paper.entry_id)
+            if "all" not in file_contents:
+                logger.warning(f"Failed to extract full text of {paper.title} from tar: Main tex file not found.")
+                return None
+            return file_contents["all"]
+        except Exception as e:
+            logger.warning(f"Failed to extract full text of {paper.title} from tar: {e}")
+            return None
diff --git a/src/zotero_arxiv_daily/utils.py b/src/zotero_arxiv_daily/utils.py
@@ -58,7 +58,7 @@ def extract_tex_code_from_tar(file_path:str, paper_id:str) -> dict[str,str]:
         content = re.sub(r'\\\\', '', content)
         #remove consecutive spaces
         content = re.sub(r'[ \t\r\f]{3,}', ' ', content)
-        if main_tex is None and re.search(r'\\begin\{document\}', content):
+        if main_tex is None and re.search(r'\\begin\{document\}', content) and not any(w in t for w in ['example', 'sample']):
             main_tex = t
             logger.debug(f"Choose {t} as main tex file of {paper_id}")
         file_contents[t] = content