 import datetime as dt
 import argparse
 import json
+import logging
 import os
 import re
 import sys
+import urllib.error
 import urllib.parse
 import urllib.request
 import xml.etree.ElementTree as ET
+from typing import Any, Dict, List, Set, Tuple, Optional
+
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
 
 
 ARXIV_API = "http://export.arxiv.org/api/query"
 MARKER_START = "<!-- research-bot:start -->"
 MARKER_END = "<!-- research-bot:end -->"
 
 
-def load_config(path: str) -> dict:
+def load_config(path: str) -> Dict[str, Any]:
     cfg = {
         "queries": [
             "\"isaac gym\"",
@@ -32,25 +38,42 @@ def load_config(path: str) -> dict:
     if os.path.exists(path):
         try:
             import yaml  # type: ignore
-        except Exception:
-            # Minimal YAML parser fallback not implemented; use defaults
+        except ImportError:
+            logger.warning("PyYAML not available, using default configuration")
             return cfg
         else:
-            with open(path, "r", encoding="utf-8") as f:
-                user_cfg = yaml.safe_load(f) or {}
-            cfg.update({k: v for k, v in user_cfg.items() if v is not None})
+            try:
+                with open(path, "r", encoding="utf-8") as f:
+                    user_cfg = yaml.safe_load(f) or {}
+                cfg.update({k: v for k, v in user_cfg.items() if v is not None})
+            except yaml.YAMLError as e:
+                logger.error(f"Failed to parse YAML config: {e}")
+                return cfg
+            except IOError as e:
+                logger.error(f"Failed to read config file: {e}")
+                return cfg
     return cfg
 
 
-def iso_date(s: str) -> dt.date:
-    # arXiv returns e.g. 2024-08-21T12:34:56Z
-    return dt.datetime.fromisoformat(s.replace("Z", "+00:00")).date()
+def iso_date(s: str) -> Optional[dt.date]:
+    """Parse ISO date string from arXiv (e.g., 2024-08-21T12:34:56Z)."""
+    try:
+        return dt.datetime.fromisoformat(s.replace("Z", "+00:00")).date()
+    except (ValueError, AttributeError) as e:
+        logger.warning(f"Failed to parse date '{s}': {e}")
+        return None
 
 
-def parse_arxiv_feed(xml_bytes: bytes) -> list[dict]:
-    root = ET.fromstring(xml_bytes)
+def parse_arxiv_feed(xml_bytes: bytes) -> List[Dict[str, Any]]:
+    """Parse arXiv Atom feed XML and extract paper information."""
+    try:
+        root = ET.fromstring(xml_bytes)
+    except ET.ParseError as e:
+        logger.error(f"Failed to parse XML: {e}")
+        return []
+
     ns = {"atom": "http://www.w3.org/2005/Atom"}
-    papers: list[dict] = []
+    papers: List[Dict[str, Any]] = []
     for entry in root.findall("atom:entry", ns):
         eid = entry.findtext("atom:id", default="", namespaces=ns)
         title = (entry.findtext("atom:title", default="", namespaces=ns) or "").strip()
@@ -91,44 +114,58 @@ def parse_arxiv_feed(xml_bytes: bytes) -> list[dict]:
     return papers
 
 
-def fetch_arxiv(query: str, max_results: int) -> list[dict]:
+def fetch_arxiv(query: str, max_results: int) -> List[Dict[str, Any]]:
+    """Fetch papers from arXiv API for the given query."""
     params = {
         "search_query": f"all:{query}",
         "sortBy": "submittedDate",
         "sortOrder": "descending",
         "max_results": str(max_results),
     }
     url = f"{ARXIV_API}?{urllib.parse.urlencode(params)}"
     req = urllib.request.Request(url, headers={"User-Agent": "research-bot/1.0"})
-    with urllib.request.urlopen(req, timeout=30) as resp:
-        data = resp.read()
-    return parse_arxiv_feed(data)
-
-
-def load_seen(path: str) -> set[str]:
+
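+    # arXiv asks API clients to rate-limit their requests; callers looping
+    # over many queries may want to pause between fetches.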
+    try:
+        with urllib.request.urlopen(req, timeout=30) as resp:
+            data = resp.read()
+        return parse_arxiv_feed(data)
+    except urllib.error.URLError as e:
+        logger.error(f"Network error fetching arXiv data for query '{query}': {e}")
+        return []
+    except TimeoutError as e:
+        logger.error(f"Timeout fetching arXiv data for query '{query}': {e}")
+        return []
+
+
+def load_seen(path: str) -> Set[str]:
     if not os.path.exists(path):
         return set()
-    with open(path, "r", encoding="utf-8") as f:
-        try:
+    try:
+        with open(path, "r", encoding="utf-8") as f:
             data = json.load(f)
-        except Exception:
-            return set()
+    except (json.JSONDecodeError, IOError) as e:
+        logger.warning(f"Failed to load state file: {e}")
+        return set()
     return set(data.get("arxiv_ids", []))
 
 
-def save_seen(path: str, ids: set[str]) -> None:
-    os.makedirs(os.path.dirname(path), exist_ok=True)
-    with open(path, "w", encoding="utf-8") as f:
-        json.dump({"arxiv_ids": sorted(ids)}, f, indent=2)
+def save_seen(path: str, ids: Set[str]) -> None:
+    """Save seen paper IDs to state file."""
+    try:
+        # "or '.'" guards against bare filenames, where dirname() returns ""
+        os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
+        with open(path, "w", encoding="utf-8") as f:
+            json.dump({"arxiv_ids": sorted(ids)}, f, indent=2)
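+        # Note: this write is not atomic; writing to a temp file and then
+        # renaming it would avoid a corrupt state file on a crash mid-write.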
+    except IOError as e:
+        logger.error(f"Failed to save state file: {e}")
 
 
-def extract_existing_ids_from_readme(readme_text: str) -> set[str]:
+def extract_existing_ids_from_readme(readme_text: str) -> Set[str]:
     # Capture IDs like 2401.00001 possibly followed by version or .pdf, but only keep the ID
     pattern = re.compile(r"arxiv\.org/(?:abs|pdf)/([0-9]+\.[0-9]+)(?:v\d+)?(?:\.pdf)?")
     return set(m.group(1) for m in pattern.finditer(readme_text))
 
 
-def render_bullets(papers: list[dict]) -> str:
+def render_bullets(papers: List[Dict[str, Any]]) -> str:
     lines = []
     for p in papers:
         authors = ", ".join(p.get("authors", [])[:3])
@@ -141,7 +178,7 @@ def render_bullets(papers: list[dict]) -> str:
     return "\n".join(lines)
 
 
-def update_readme(readme_path: str, section_title: str, bullets_block: str) -> tuple[str, bool]:
+def update_readme(readme_path: str, section_title: str, bullets_block: str) -> Tuple[str, bool]:
     changed = False
     if not os.path.exists(readme_path):
         return "", False
@@ -191,10 +228,12 @@ def main() -> int:
     collected: dict[str, dict] = {}
     for q in queries:
         try:
-            for p in fetch_arxiv(q, max_results):
+            papers = fetch_arxiv(q, max_results)
+            for p in papers:
                 collected[p["id"]] = p
+            logger.info(f"Fetched {len(papers)} papers for query '{q}'")
         except Exception as e:
-            print(f"Warning: query '{q}' failed: {e}", file=sys.stderr)
+            logger.error(f"Unexpected error for query '{q}': {e}", exc_info=True)
 
     if not collected:
         print("No papers fetched; exiting.")
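
A quick local check of the failure paths this patch adds; a minimal sketch, assuming the script is importable as a module named research_bot (the module name, sample date, and arXiv ID are illustrative):

    import datetime as dt
    from research_bot import extract_existing_ids_from_readme, iso_date, parse_arxiv_feed

    # Malformed dates now return None instead of raising
    assert iso_date("2024-08-21T12:34:56Z") == dt.date(2024, 8, 21)
    assert iso_date("not-a-timestamp") is None

    # Malformed XML is logged and yields an empty list instead of a traceback
    assert parse_arxiv_feed(b"<feed>unclosed") == []

    # The README scraper keeps only the bare ID (2401.00001 is a made-up example)
    readme = "- [Paper](https://arxiv.org/abs/2401.00001v2)"
    assert extract_existing_ids_from_readme(readme) == {"2401.00001"}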