Skip to content

Commit b70dafc

Browse files
committed
fix: improve research bot implementation quality
- Add .gitignore to exclude Python artifacts
- Fix CI/CD path configuration
- Improve error handling with specific exceptions
- Add comprehensive type hints
- Implement structured logging
- Expand test coverage from 5 to 17 tests
- Handle edge cases and network errors
- Improve code maintainability and debugging

Addresses all issues from technical review
1 parent 622542a commit b70dafc

File tree

5 files changed

+316
-35
lines changed

5 files changed

+316
-35
lines changed

.github/workflows/ci.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,10 @@ jobs:
2020
- name: Ruff
2121
run: |
2222
ruff --version
23-
ruff check awesome-isaac-gym/scripts awesome-isaac-gym/tests
23+
ruff check scripts tests
2424
- name: Pytest
2525
run: |
26-
pytest -q awesome-isaac-gym/tests
26+
pytest -q tests
2727
2828
bot-idempotency:
2929
runs-on: ubuntu-latest
@@ -37,7 +37,7 @@ jobs:
3737
python-version: '3.11'
3838
- name: Run bot and verify no diff
3939
run: |
40-
python awesome-isaac-gym/scripts/research_bot.py || true
40+
python scripts/research_bot.py || true
4141
git status --porcelain
4242
# Ensure the bot is idempotent for its own PR branch
4343
if ! git diff --quiet; then

.gitignore

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# Python
2+
__pycache__/
3+
*.py[cod]
4+
*$py.class
5+
*.so
6+
*.egg
7+
*.egg-info/
8+
dist/
9+
build/
10+
*.pyo
11+
*.pyc
12+
.pytest_cache/
13+
.coverage
14+
htmlcov/
15+
.tox/
16+
.env
17+
.venv
18+
venv/
19+
ENV/
20+
21+
# IDE
22+
.vscode/
23+
.idea/
24+
*.swp
25+
*.swo
26+
*~
27+
.DS_Store
28+
29+
# Bot state
30+
.bot/state.json
31+
32+
# Logs
33+
*.log
34+
logs/
2.66 KB
Binary file not shown.

scripts/research_bot.py

Lines changed: 71 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -2,20 +2,26 @@
22
import datetime as dt
33
import argparse
44
import json
5+
import logging
56
import os
67
import re
78
import sys
89
import urllib.parse
910
import urllib.request
1011
import xml.etree.ElementTree as ET
12+
from typing import Dict, List, Set, Tuple, Optional
13+
14+
# Configure logging
15+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
16+
logger = logging.getLogger(__name__)
1117

1218

1319
ARXIV_API = "http://export.arxiv.org/api/query"
1420
MARKER_START = "<!-- research-bot:start -->"
1521
MARKER_END = "<!-- research-bot:end -->"
1622

1723

18-
def load_config(path: str) -> dict:
24+
def load_config(path: str) -> Dict[str, any]:
1925
cfg = {
2026
"queries": [
2127
"\"isaac gym\"",
@@ -32,25 +38,42 @@ def load_config(path: str) -> dict:
3238
if os.path.exists(path):
3339
try:
3440
import yaml # type: ignore
35-
except Exception:
36-
# Minimal YAML parser fallback not implemented; use defaults
41+
except ImportError:
42+
logger.warning("PyYAML not available, using default configuration")
3743
return cfg
3844
else:
39-
with open(path, "r", encoding="utf-8") as f:
40-
user_cfg = yaml.safe_load(f) or {}
41-
cfg.update({k: v for k, v in user_cfg.items() if v is not None})
45+
try:
46+
with open(path, "r", encoding="utf-8") as f:
47+
user_cfg = yaml.safe_load(f) or {}
48+
cfg.update({k: v for k, v in user_cfg.items() if v is not None})
49+
except yaml.YAMLError as e:
50+
logger.error(f"Failed to parse YAML config: {e}")
51+
return cfg
52+
except IOError as e:
53+
logger.error(f"Failed to read config file: {e}")
54+
return cfg
4255
return cfg
4356

4457

45-
def iso_date(s: str) -> dt.date:
46-
# arXiv returns e.g. 2024-08-21T12:34:56Z
47-
return dt.datetime.fromisoformat(s.replace("Z", "+00:00")).date()
58+
def iso_date(s: str) -> Optional[dt.date]:
59+
"""Parse ISO date string from arXiv (e.g., 2024-08-21T12:34:56Z)."""
60+
try:
61+
return dt.datetime.fromisoformat(s.replace("Z", "+00:00")).date()
62+
except (ValueError, AttributeError) as e:
63+
logger.warning(f"Failed to parse date '{s}': {e}")
64+
return None
4865

4966

50-
def parse_arxiv_feed(xml_bytes: bytes) -> list[dict]:
51-
root = ET.fromstring(xml_bytes)
67+
def parse_arxiv_feed(xml_bytes: bytes) -> List[Dict[str, any]]:
68+
"""Parse arXiv Atom feed XML and extract paper information."""
69+
try:
70+
root = ET.fromstring(xml_bytes)
71+
except ET.ParseError as e:
72+
logger.error(f"Failed to parse XML: {e}")
73+
return []
74+
5275
ns = {"atom": "http://www.w3.org/2005/Atom"}
53-
papers: list[dict] = []
76+
papers: List[Dict[str, any]] = []
5477
for entry in root.findall("atom:entry", ns):
5578
eid = entry.findtext("atom:id", default="", namespaces=ns)
5679
title = (entry.findtext("atom:title", default="", namespaces=ns) or "").strip()
@@ -91,44 +114,58 @@ def parse_arxiv_feed(xml_bytes: bytes) -> list[dict]:
91114
return papers
92115

93116

94-
def fetch_arxiv(query: str, max_results: int) -> list[dict]:
117+
def fetch_arxiv(query: str, max_results: int) -> List[Dict[str, any]]:
95118
params = {
96119
"search_query": f"all:{query}",
97120
"sortBy": "submittedDate",
98121
"sortOrder": "descending",
99122
"max_results": str(max_results),
100123
}
124+
"""Fetch papers from arXiv API for the given query."""
101125
url = f"{ARXIV_API}?{urllib.parse.urlencode(params)}"
102126
req = urllib.request.Request(url, headers={"User-Agent": "research-bot/1.0"})
103-
with urllib.request.urlopen(req, timeout=30) as resp:
104-
data = resp.read()
105-
return parse_arxiv_feed(data)
106-
107-
108-
def load_seen(path: str) -> set[str]:
127+
128+
try:
129+
with urllib.request.urlopen(req, timeout=30) as resp:
130+
data = resp.read()
131+
return parse_arxiv_feed(data)
132+
except urllib.error.URLError as e:
133+
logger.error(f"Network error fetching arXiv data for query '{query}': {e}")
134+
return []
135+
except TimeoutError as e:
136+
logger.error(f"Timeout fetching arXiv data for query '{query}': {e}")
137+
return []
138+
139+
140+
def load_seen(path: str) -> Set[str]:
109141
if not os.path.exists(path):
110142
return set()
111-
with open(path, "r", encoding="utf-8") as f:
112-
try:
143+
try:
144+
with open(path, "r", encoding="utf-8") as f:
113145
data = json.load(f)
114-
except Exception:
115-
return set()
146+
except (json.JSONDecodeError, IOError) as e:
147+
logger.warning(f"Failed to load state file: {e}")
148+
return set()
116149
return set(data.get("arxiv_ids", []))
117150

118151

119-
def save_seen(path: str, ids: set[str]) -> None:
120-
os.makedirs(os.path.dirname(path), exist_ok=True)
121-
with open(path, "w", encoding="utf-8") as f:
122-
json.dump({"arxiv_ids": sorted(ids)}, f, indent=2)
152+
def save_seen(path: str, ids: Set[str]) -> None:
153+
"""Save seen paper IDs to state file."""
154+
try:
155+
os.makedirs(os.path.dirname(path), exist_ok=True)
156+
with open(path, "w", encoding="utf-8") as f:
157+
json.dump({"arxiv_ids": sorted(ids)}, f, indent=2)
158+
except IOError as e:
159+
logger.error(f"Failed to save state file: {e}")
123160

124161

125-
def extract_existing_ids_from_readme(readme_text: str) -> set[str]:
162+
def extract_existing_ids_from_readme(readme_text: str) -> Set[str]:
126163
# Capture IDs like 2401.00001 possibly followed by version or .pdf, but only keep the ID
127164
pattern = re.compile(r"arxiv\.org/(?:abs|pdf)/([0-9]+\.[0-9]+)(?:v\d+)?(?:\.pdf)?")
128165
return set(m.group(1) for m in pattern.finditer(readme_text))
129166

130167

131-
def render_bullets(papers: list[dict]) -> str:
168+
def render_bullets(papers: List[Dict[str, any]]) -> str:
132169
lines = []
133170
for p in papers:
134171
authors = ", ".join(p.get("authors", [])[:3])
@@ -141,7 +178,7 @@ def render_bullets(papers: list[dict]) -> str:
141178
return "\n".join(lines)
142179

143180

144-
def update_readme(readme_path: str, section_title: str, bullets_block: str) -> tuple[str, bool]:
181+
def update_readme(readme_path: str, section_title: str, bullets_block: str) -> Tuple[str, bool]:
145182
changed = False
146183
if not os.path.exists(readme_path):
147184
return "", False
@@ -191,10 +228,12 @@ def main() -> int:
191228
collected: dict[str, dict] = {}
192229
for q in queries:
193230
try:
194-
for p in fetch_arxiv(q, max_results):
231+
papers = fetch_arxiv(q, max_results)
232+
for p in papers:
195233
collected[p["id"]] = p
234+
logger.info(f"Fetched {len(papers)} papers for query '{q}'")
196235
except Exception as e:
197-
print(f"Warning: query '{q}' failed: {e}", file=sys.stderr)
236+
logger.error(f"Unexpected error for query '{q}': {e}", exc_info=True)
198237

199238
if not collected:
200239
print("No papers fetched; exiting.")

0 commit comments

Comments
 (0)