Skip to content

Commit ba640e7

Browse files
committed
update
1 parent cc7545c commit ba640e7

24 files changed

+365
-1342
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -15,4 +15,5 @@ test.ipynb
1515
logs
1616
models
1717
test-compose.yml
18-
outputs
18+
outputs
19+
config/private.yaml

config/base.yaml

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -18,12 +18,11 @@ llm:
1818
api:
1919
key: ???
2020
base_url: ???
21-
model: Qwen/Qwen3-30B-A3B-Instruct-2507
22-
max_retries: 3
2321
timeout: 180
2422
generation_kwargs:
2523
max_tokens: 16384
2624
temperature: 0.4
25+
model: gpt-4o-mini
2726
language: English
2827

2928
reranker:

config/public.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
zotero:
2-
ignore_collection: null
2+
include_path: null
33
source:
44
arxiv:
55
query: cs.AI+cs.CV+cs.LG+cs.CL

src/zotero_arxiv_daily/construct_email.py

Lines changed: 9 additions & 47 deletions
Original file line number | Diff line number | Diff line change
@@ -1,13 +1,6 @@
1-
from paper import ArxivPaper
1+
from .protocol import Paper
22
import math
3-
from tqdm import tqdm
4-
from email.header import Header
5-
from email.mime.text import MIMEText
6-
from email.utils import parseaddr, formataddr
7-
import smtplib
8-
import datetime
9-
import time
10-
from loguru import logger
3+
114

125
framework = """
136
<!DOCTYPE HTML>
@@ -59,8 +52,7 @@ def get_empty_html():
5952
"""
6053
return block_template
6154

62-
def get_block_html(title:str, authors:str, rate:str,arxiv_id:str, abstract:str, pdf_url:str, code_url:str=None, affiliations:str=None):
63-
code = f'<a href="{code_url}" style="display: inline-block; text-decoration: none; font-size: 14px; font-weight: bold; color: #fff; background-color: #5bc0de; padding: 8px 16px; border-radius: 4px; margin-left: 8px;">Code</a>' if code_url else ''
55+
def get_block_html(title:str, authors:str, rate:str, tldr:str, pdf_url:str, affiliations:str=None):
6456
block_template = """
6557
<table border="0" cellpadding="0" cellspacing="0" width="100%" style="font-family: Arial, sans-serif; border: 1px solid #ddd; border-radius: 8px; padding: 16px; background-color: #f9f9f9;">
6658
<tr>
@@ -82,24 +74,18 @@ def get_block_html(title:str, authors:str, rate:str,arxiv_id:str, abstract:str,
8274
</tr>
8375
<tr>
8476
<td style="font-size: 14px; color: #333; padding: 8px 0;">
85-
<strong>arXiv ID:</strong> {arxiv_id}
86-
</td>
87-
</tr>
88-
<tr>
89-
<td style="font-size: 14px; color: #333; padding: 8px 0;">
90-
<strong>TLDR:</strong> {abstract}
77+
<strong>TLDR:</strong> {tldr}
9178
</td>
9279
</tr>
9380
9481
<tr>
9582
<td style="padding: 8px 0;">
9683
<a href="{pdf_url}" style="display: inline-block; text-decoration: none; font-size: 14px; font-weight: bold; color: #fff; background-color: #d9534f; padding: 8px 16px; border-radius: 4px;">PDF</a>
97-
{code}
9884
</td>
9985
</tr>
10086
</table>
10187
"""
102-
return block_template.format(title=title, authors=authors,rate=rate,arxiv_id=arxiv_id, abstract=abstract, pdf_url=pdf_url, code=code, affiliations=affiliations)
88+
return block_template.format(title=title, authors=authors,rate=rate, tldr=tldr, pdf_url=pdf_url, affiliations=affiliations)
10389

10490
def get_stars(score:float):
10591
full_star = '<span class="full-star">⭐</span>'
@@ -118,14 +104,14 @@ def get_stars(score:float):
118104
return '<div class="star-wrapper">'+full_star * full_star_num + half_star * half_star_num + '</div>'
119105

120106

121-
def render_email(papers:list[ArxivPaper]):
107+
def render_email(papers:list[Paper]) -> str:
122108
parts = []
123109
if len(papers) == 0 :
124110
return framework.replace('__CONTENT__', get_empty_html())
125111

126-
for p in tqdm(papers,desc='Rendering Email'):
112+
for p in papers:
127113
rate = get_stars(p.score)
128-
authors = [a.name for a in p.authors[:5]]
114+
authors = p.authors[:5]
129115
authors = ', '.join(authors)
130116
if len(p.authors) > 5:
131117
authors += ', ...'
@@ -136,31 +122,7 @@ def render_email(papers:list[ArxivPaper]):
136122
affiliations += ', ...'
137123
else:
138124
affiliations = 'Unknown Affiliation'
139-
parts.append(get_block_html(p.title, authors,rate,p.arxiv_id ,p.tldr, p.pdf_url, p.code_url, affiliations))
140-
time.sleep(10)
125+
parts.append(get_block_html(p.title, authors,rate,p.tldr, p.pdf_url, affiliations))
141126

142127
content = '<br>' + '</br><br>'.join(parts) + '</br>'
143128
return framework.replace('__CONTENT__', content)
144-
145-
def send_email(sender:str, receiver:str, password:str,smtp_server:str,smtp_port:int, html:str,):
146-
def _format_addr(s):
147-
name, addr = parseaddr(s)
148-
return formataddr((Header(name, 'utf-8').encode(), addr))
149-
150-
msg = MIMEText(html, 'html', 'utf-8')
151-
msg['From'] = _format_addr('Github Action <%s>' % sender)
152-
msg['To'] = _format_addr('You <%s>' % receiver)
153-
today = datetime.datetime.now().strftime('%Y/%m/%d')
154-
msg['Subject'] = Header(f'Daily arXiv {today}', 'utf-8').encode()
155-
156-
try:
157-
server = smtplib.SMTP(smtp_server, smtp_port)
158-
server.starttls()
159-
except Exception as e:
160-
logger.warning(f"Failed to use TLS. {e}")
161-
logger.warning(f"Try to use SSL.")
162-
server = smtplib.SMTP_SSL(smtp_server, smtp_port)
163-
164-
server.login(sender, password)
165-
server.sendmail(sender, [receiver], msg.as_string())
166-
server.quit()

src/zotero_arxiv_daily/executor.py

Lines changed: 18 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -2,22 +2,25 @@
22
from pyzotero import zotero
33
from omegaconf import DictConfig
44
from .utils import glob_match
5-
from .retriever import get_retriever, BaseRetriever
5+
from .retriever import get_retriever_cls
66
from .protocol import CorpusPaper
77
import random
88
from datetime import datetime
9-
from .reranker import get_reranker
9+
from .reranker import get_reranker_cls
10+
from .construct_email import render_email
11+
from .utils import send_email
12+
from openai import OpenAI
1013
class Executor:
1114
def __init__(self, config:DictConfig):
1215
self.config = config
13-
self.retrievers: dict[str, BaseRetriever] = {
14-
source: get_retriever(source)(config) for source in config.executor.source
16+
self.retrievers = {
17+
source: get_retriever_cls(source)(config) for source in config.executor.source
1518
}
16-
self.reranker = get_reranker(config.executor.reranker)
17-
19+
self.reranker = get_reranker_cls(config.executor.reranker)(config)
20+
self.openai_client = OpenAI(api_key=config.llm.api.key, base_url=config.llm.api.base_url)
1821
def fetch_zotero_corpus(self) -> list[CorpusPaper]:
1922
logger.info("Fetching zotero corpus")
20-
zot = zotero.Zotero(self.config.zotero.id, 'user', self.config.zotero.api_key)
23+
zot = zotero.Zotero(self.config.zotero.user_id, 'user', self.config.zotero.api_key)
2124
collections = zot.everything(zot.collections())
2225
collections = {c['key']:c for c in collections}
2326
corpus = zot.everything(zot.items(itemType='conferencePaper || journalArticle || preprint'))
@@ -56,11 +59,17 @@ def filter_corpus(self, corpus:list[CorpusPaper]) -> list[CorpusPaper]:
5659
def run(self):
5760
corpus = self.fetch_zotero_corpus()
5861
corpus = self.filter_corpus(corpus)
59-
source_papers = {}
62+
all_papers = []
6063
for source, retriever in self.retrievers.items():
6164
logger.info(f"Retrieving {source} papers...")
6265
papers = retriever.retrieve_papers()
6366
if len(papers) == 0:
6467
logger.info(f"No {source} papers found")
6568
continue
66-
source_papers[source] = papers
69+
all_papers.extend(papers)
70+
reranked_papers = self.reranker.rerank(all_papers, corpus)
71+
for p in reranked_papers:
72+
p.generate_tldr(self.openai_client, self.config.llm)
73+
p.generate_affiliations(self.openai_client, self.config.llm)
74+
email_content = render_email(reranked_papers)
75+
send_email(self.config, email_content)

src/zotero_arxiv_daily/main.py

Lines changed: 40 additions & 109 deletions
Original file line number | Diff line number | Diff line change
@@ -1,120 +1,51 @@
1-
import arxiv
21
import os
2+
import logging
33
import sys
4-
from pyzotero import zotero
5-
from recommender import rerank_paper
6-
from construct_email import render_email, send_email
7-
from tqdm import tqdm
8-
from loguru import logger
9-
from gitignore_parser import parse_gitignore
10-
from tempfile import mkstemp
11-
from paper import ArxivPaper
12-
from llm import set_global_llm
13-
import feedparser
144
from omegaconf import DictConfig
155
import hydra
6+
from loguru import logger
7+
from zotero_arxiv_daily.executor import Executor
168
os.environ["TOKENIZERS_PARALLELISM"] = "false"
179

18-
def get_zotero_corpus(id:str,key:str) -> list[dict]:
19-
zot = zotero.Zotero(id, 'user', key)
20-
collections = zot.everything(zot.collections())
21-
collections = {c['key']:c for c in collections}
22-
corpus = zot.everything(zot.items(itemType='conferencePaper || journalArticle || preprint'))
23-
corpus = [c for c in corpus if c['data']['abstractNote'] != '']
24-
def get_collection_path(col_key:str) -> str:
25-
if p := collections[col_key]['data']['parentCollection']:
26-
return get_collection_path(p) + '/' + collections[col_key]['data']['name']
27-
else:
28-
return collections[col_key]['data']['name']
29-
for c in corpus:
30-
paths = [get_collection_path(col) for col in c['data']['collections']]
31-
c['paths'] = paths
32-
return corpus
33-
34-
def filter_corpus(corpus:list[dict], pattern:str) -> list[dict]:
35-
_,filename = mkstemp()
36-
with open(filename,'w') as file:
37-
file.write(pattern)
38-
matcher = parse_gitignore(filename,base_dir='./')
39-
new_corpus = []
40-
for c in corpus:
41-
match_results = [matcher(p) for p in c['paths']]
42-
if not any(match_results):
43-
new_corpus.append(c)
44-
os.remove(filename)
45-
return new_corpus
46-
47-
48-
def get_arxiv_paper(query:str, debug:bool=False) -> list[ArxivPaper]:
49-
client = arxiv.Client(num_retries=10,delay_seconds=10)
50-
feed = feedparser.parse(f"https://rss.arxiv.org/atom/{query}")
51-
if 'Feed error for query' in feed.feed.title:
52-
raise Exception(f"Invalid ARXIV_QUERY: {query}.")
53-
if not debug:
54-
papers = []
55-
all_paper_ids = [i.id.removeprefix("oai:arXiv.org:") for i in feed.entries if i.arxiv_announce_type == 'new']
56-
bar = tqdm(total=len(all_paper_ids),desc="Retrieving Arxiv papers")
57-
for i in range(0,len(all_paper_ids),50):
58-
search = arxiv.Search(id_list=all_paper_ids[i:i+50])
59-
batch = [ArxivPaper(p) for p in client.results(search)]
60-
bar.update(len(batch))
61-
papers.extend(batch)
62-
bar.close()
6310

64-
else:
65-
logger.debug("Retrieve 5 arxiv papers regardless of the date.")
66-
search = arxiv.Search(query='cat:cs.AI', sort_by=arxiv.SortCriterion.SubmittedDate)
67-
papers = []
68-
for i in client.results(search):
69-
papers.append(ArxivPaper(i))
70-
if len(papers) == 5:
71-
break
72-
73-
return papers
74-
75-
@hydra.main(version_base=None, config_path="config", config_name="default")
11+
@hydra.main(version_base=None, config_path="../../config", config_name="default")
7612
def main(config:DictConfig):
77-
assert (
78-
not config.llm.use_api or config.llm.api.key is not None
79-
) # If use_llm_api is True, openai_api_key must be provided
80-
if config.executor.debug:
81-
logger.remove()
82-
logger.add(sys.stdout, level="DEBUG")
83-
logger.debug("Debug mode is on.")
84-
else:
85-
logger.remove()
86-
logger.add(sys.stdout, level="INFO")
87-
88-
logger.info("Retrieving Zotero corpus...")
89-
corpus = get_zotero_corpus(config.zotero.user_id, config.zotero.api_key)
90-
logger.info(f"Retrieved {len(corpus)} papers from Zotero.")
91-
if config.zotero.ignore_collection:
92-
logger.info(f"Ignoring papers in:\n {config.zotero.ignore_collection}...")
93-
corpus = filter_corpus(corpus, config.zotero.ignore_collection)
94-
logger.info(f"Remaining {len(corpus)} papers after filtering.")
95-
logger.info("Retrieving Arxiv papers...")
96-
papers = get_arxiv_paper(config.arxiv.query, config.executor.debug)
97-
if len(papers) == 0:
98-
logger.info("No new papers found. Yesterday maybe a holiday and no one submit their work :). If this is not the case, please check the ARXIV_QUERY.")
99-
if not config.executor.send_empty:
100-
exit(0)
101-
else:
102-
logger.info("Reranking papers...")
103-
papers = rerank_paper(papers, corpus)
104-
if config.executor.max_paper_num != -1:
105-
papers = papers[:config.executor.max_paper_num]
106-
if config.llm.use_api:
107-
logger.info("Using OpenAI API as global LLM.")
108-
set_global_llm(api_key=config.llm.api.key, base_url=config.llm.api.base_url, model=config.llm.name, lang=config.llm.generation_kwargs.language)
109-
else:
110-
logger.info("Using Local LLM as global LLM.")
111-
set_global_llm(lang=config.llm.generation_kwargs.language)
112-
113-
html = render_email(papers)
114-
logger.info("Sending email...")
115-
send_email(config.email.sender, config.email.receiver, config.email.sender_password, config.email.smtp_server, config.email.smtp_port, html)
116-
logger.success("Email sent successfully! If you don't receive the email, please check the configuration and the junk box.")
117-
13+
# Configure loguru log level based on config
14+
log_level = "DEBUG" if config.executor.debug else "INFO"
15+
logger.remove() # Remove default handler
16+
logger.add(
17+
sys.stderr,
18+
level=log_level,
19+
format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>"
20+
)
21+
22+
# Intercept standard logging (including httpx) and route through loguru
23+
# Only show WARNING and above for httpx to reduce noise
24+
class InterceptHandler(logging.Handler):
25+
def emit(self, record):
26+
# Filter httpx INFO logs
27+
if record.name == "httpx" and record.levelno < logging.WARNING:
28+
return
29+
30+
# Get corresponding Loguru level if it exists
31+
try:
32+
level = logger.level(record.levelname).name
33+
except ValueError:
34+
level = record.levelno
35+
36+
# Find caller from where the logged message originated
37+
frame, depth = sys._getframe(), 6
38+
while frame and frame.f_code.co_filename == logging.__file__:
39+
frame = frame.f_back
40+
depth += 1
41+
42+
logger.opt(depth=depth, exception=record.exc_info).log(level, record.getMessage())
43+
44+
# Remove all existing handlers and add our interceptor
45+
logging.basicConfig(handlers=[InterceptHandler()], level=0, force=True)
46+
47+
executor = Executor(config)
48+
executor.run()
11849

11950
if __name__ == '__main__':
12051
main()

0 commit comments

Comments
 (0)