22import arxiv
33from arxiv import Result as ArxivResult
44from ..protocol import Paper
5- from ..utils import extract_markdown_from_pdf
5+ from ..utils import extract_markdown_from_pdf , extract_tex_code_from_tar
66from tempfile import TemporaryDirectory
77import feedparser
88from urllib .request import urlretrieve
@@ -43,14 +43,9 @@ def convert_to_paper(self, raw_paper:ArxivResult) -> Paper:
4343 authors = [a .name for a in raw_paper .authors ]
4444 abstract = raw_paper .summary
4545 pdf_url = raw_paper .pdf_url
46- with TemporaryDirectory () as temp_dir :
47- path = os .path .join (temp_dir , "paper.pdf" )
48- urlretrieve (pdf_url , path )
49- try :
50- full_text = extract_markdown_from_pdf (path )
51- except Exception as e :
52- logger .warning (f"Failed to extract full text of { title } : { e } " )
53- full_text = None
46+ full_text = extract_text_from_tar (raw_paper )
47+ if full_text is None :
48+ full_text = extract_text_from_pdf (raw_paper )
5449 return Paper (
5550 source = self .name ,
5651 title = title ,
@@ -59,4 +54,37 @@ def convert_to_paper(self, raw_paper:ArxivResult) -> Paper:
5954 url = raw_paper .entry_id ,
6055 pdf_url = pdf_url ,
6156 full_text = full_text
62- )
57+ )
58+
def extract_text_from_pdf(paper: ArxivResult) -> str | None:
    """Download the paper's PDF and convert it to text on a best-effort basis.

    Args:
        paper: The arXiv result whose ``pdf_url`` is downloaded.

    Returns:
        Whatever ``extract_markdown_from_pdf`` produces for the downloaded
        file, or ``None`` when the paper has no PDF URL or when the download
        or the extraction fails. Failures are logged as warnings instead of
        being raised.
    """
    # Check the URL first so no temporary directory is created needlessly.
    if paper.pdf_url is None:
        logger.warning(f"No PDF URL available for {paper.title} ")
        return None

    with TemporaryDirectory() as temp_dir:
        path = os.path.join(temp_dir, "paper.pdf")
        try:
            # The download lives inside the try block: a network error is
            # just another reason the full text is unavailable and must not
            # abort the caller.
            urlretrieve(paper.pdf_url, path)
            full_text = extract_markdown_from_pdf(path)
        except Exception as e:
            # Deliberately broad: full text is optional, so any failure
            # degrades to None.
            logger.warning(f"Failed to extract full text of {paper.title} from pdf: {e} ")
            full_text = None
    return full_text
72+
def extract_text_from_tar(paper: ArxivResult) -> str | None:
    """Download the paper's source archive and return its combined TeX text.

    Args:
        paper: The arXiv result; ``paper.source_url()`` supplies the archive
            location and ``paper.entry_id`` is forwarded to
            ``extract_tex_code_from_tar``.

    Returns:
        The ``"all"`` entry of the mapping returned by
        ``extract_tex_code_from_tar``, or ``None`` when there is no source
        URL, the download or extraction fails, or the ``"all"`` entry is
        missing. Every failure is logged as a warning rather than raised, so
        a caller can fall back to another extraction method on ``None``.
    """
    source_url = paper.source_url()
    # Check the URL first so no temporary directory is created needlessly.
    if source_url is None:
        logger.warning(f"No source URL available for {paper.title} ")
        return None

    with TemporaryDirectory() as temp_dir:
        path = os.path.join(temp_dir, "paper.tar.gz")
        try:
            # The download lives inside the try block: if it raised, the
            # caller's fallback for a None result would never run.
            urlretrieve(source_url, path)
            file_contents = extract_tex_code_from_tar(path, paper.entry_id)
            if "all" not in file_contents:
                logger.warning(f"Failed to extract full text of {paper.title} from tar: Main tex file not found.")
                return None
            full_text = file_contents["all"]
        except Exception as e:
            # Deliberately broad: full text is optional, so any failure
            # degrades to None.
            logger.warning(f"Failed to extract full text of {paper.title} from tar: {e} ")
            full_text = None
    return full_text
0 commit comments