1- import arxiv
21import os
2+ import logging
33import sys
4- from pyzotero import zotero
5- from recommender import rerank_paper
6- from construct_email import render_email , send_email
7- from tqdm import tqdm
8- from loguru import logger
9- from gitignore_parser import parse_gitignore
10- from tempfile import mkstemp
11- from paper import ArxivPaper
12- from llm import set_global_llm
13- import feedparser
144from omegaconf import DictConfig
155import hydra
6+ from loguru import logger
7+ from zotero_arxiv_daily .executor import Executor
# Set TOKENIZERS_PARALLELISM to "false" for this process as soon as the module is imported.
# NOTE(review): presumably consumed by the Hugging Face `tokenizers` library to turn off
# its internal parallelism — confirm against the code that actually loads a tokenizer.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
179
def get_zotero_corpus(id: str, key: str) -> list[dict]:
    """Fetch the papers of a Zotero user library that have an abstract.

    Each returned item additionally gets a ``'paths'`` entry: the list of
    slash-separated collection paths (``parent/child/...``) the item belongs to.

    Args:
        id: Zotero library id, queried as a ``'user'`` library. (The name
            shadows the builtin but is kept for caller compatibility.)
        key: Zotero API key.

    Returns:
        The items of type conferencePaper, journalArticle or preprint whose
        ``abstractNote`` is present and non-empty, each annotated with ``'paths'``.
    """
    zot = zotero.Zotero(id, 'user', key)
    collections = {c['key']: c for c in zot.everything(zot.collections())}
    items = zot.everything(zot.items(itemType='conferencePaper || journalArticle || preprint'))
    # .get() so an item without the field is skipped instead of raising KeyError.
    corpus = [c for c in items if c['data'].get('abstractNote')]

    # Many items share the same collections; resolve each collection path once.
    path_cache: dict[str, str] = {}

    def get_collection_path(col_key: str) -> str:
        """Return the full path of a collection, walking up through its parents."""
        if col_key not in path_cache:
            data = collections[col_key]['data']
            parent = data['parentCollection']
            if parent:
                path_cache[col_key] = get_collection_path(parent) + '/' + data['name']
            else:
                path_cache[col_key] = data['name']
        return path_cache[col_key]

    for c in corpus:
        c['paths'] = [get_collection_path(col) for col in c['data']['collections']]
    return corpus
33-
def filter_corpus(corpus: list[dict], pattern: str) -> list[dict]:
    """Drop corpus items that sit in a collection matched by a gitignore-style pattern.

    Args:
        corpus: Items carrying a ``'paths'`` list of collection paths.
        pattern: Gitignore-syntax text; it is written to a temporary file
            because ``parse_gitignore`` takes a file path.

    Returns:
        The items for which none of their ``'paths'`` match the pattern.
    """
    fd, filename = mkstemp()
    try:
        # mkstemp() hands back an already-open OS-level descriptor; wrap it so it
        # is closed after writing instead of leaked. The default text encoding is
        # kept on purpose so the file is written the same way as before.
        with os.fdopen(fd, 'w') as file:
            file.write(pattern)
        matcher = parse_gitignore(filename, base_dir='./')
        return [c for c in corpus if not any(matcher(p) for p in c['paths'])]
    finally:
        # Remove the temp file even when writing, parsing or matching fails.
        os.remove(filename)
46-
47-
def get_arxiv_paper(query: str, debug: bool = False) -> list[ArxivPaper]:
    """Retrieve arXiv papers for a feed query.

    Args:
        query: Appended to ``https://rss.arxiv.org/atom/`` to select the feed.
        debug: When true, skip the feed entries and return the first five
            results of a ``cat:cs.AI`` search sorted by submission date.

    Returns:
        ``ArxivPaper`` wrappers: in normal mode, for every feed entry whose
        ``arxiv_announce_type`` is ``'new'``; in debug mode, for up to five
        search results.

    Raises:
        ValueError: If the feed title reports a feed error for ``query``.
            (Previously a bare ``Exception``; ``ValueError`` is a subclass, so
            existing ``except Exception`` handlers keep working.)
    """
    client = arxiv.Client(num_retries=10, delay_seconds=10)
    feed = feedparser.parse(f"https://rss.arxiv.org/atom/{query}")
    if 'Feed error for query' in feed.feed.title:
        raise ValueError(f"Invalid ARXIV_QUERY: {query}.")

    if debug:
        logger.debug("Retrieve 5 arxiv papers regardless of the date.")
        search = arxiv.Search(query='cat:cs.AI', sort_by=arxiv.SortCriterion.SubmittedDate)
        papers = []
        for result in client.results(search):
            papers.append(ArxivPaper(result))
            if len(papers) == 5:
                break
        return papers

    batch_size = 50  # ids are looked up in chunks of this size per search request
    new_ids = [
        entry.id.removeprefix("oai:arXiv.org:")
        for entry in feed.entries
        if entry.arxiv_announce_type == 'new'
    ]
    papers = []
    # The context manager closes the progress bar even if a batch fetch raises.
    with tqdm(total=len(new_ids), desc="Retrieving Arxiv papers") as bar:
        for start in range(0, len(new_ids), batch_size):
            search = arxiv.Search(id_list=new_ids[start:start + batch_size])
            batch = [ArxivPaper(p) for p in client.results(search)]
            bar.update(len(batch))
            papers.extend(batch)
    return papers
74-
@hydra.main(version_base=None, config_path="../../config", config_name="default")
def main(config: DictConfig):
    """Set up logging, then build and run the ``Executor`` for this configuration.

    Loguru gets a single stderr sink at DEBUG level when
    ``config.executor.debug`` is truthy and INFO otherwise. Records emitted
    through the standard ``logging`` module are redirected into loguru.

    Args:
        config: Hydra/OmegaConf configuration; ``config.executor.debug`` is read
            here and the whole object is handed to ``Executor``.
    """
    # Configure loguru log level based on config
    log_level = "DEBUG" if config.executor.debug else "INFO"
    logger.remove()  # Remove default handler
    logger.add(
        sys.stderr,
        level=log_level,
        format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>",
    )

    # Intercept standard logging (including httpx) and route through loguru
    # Only show WARNING and above for httpx to reduce noise
    class InterceptHandler(logging.Handler):
        """``logging`` handler that forwards every record to loguru."""

        def emit(self, record: logging.LogRecord) -> None:
            # Filter httpx INFO logs
            if record.name == "httpx" and record.levelno < logging.WARNING:
                return

            # Get corresponding Loguru level if it exists
            try:
                level = logger.level(record.levelname).name
            except ValueError:
                level = record.levelno

            # Find caller from where the logged message originated: start at this
            # emit() frame (depth 0) and walk outwards past every frame that
            # belongs to the logging module. A fixed starting depth is wrong
            # whenever the call goes through extra stdlib frames, e.g.
            # logger.exception() or the module-level logging.info().
            frame, depth = sys._getframe(), 0
            while frame and (depth == 0 or frame.f_code.co_filename == logging.__file__):
                frame = frame.f_back
                depth += 1

            logger.opt(depth=depth, exception=record.exc_info).log(level, record.getMessage())

    # Remove all existing handlers and add our interceptor
    logging.basicConfig(handlers=[InterceptHandler()], level=0, force=True)

    executor = Executor(config)
    executor.run()
11849
# Call main() only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()
0 commit comments