From 9e9047aa32fceaa794dbdc604728567c26a7da32 Mon Sep 17 00:00:00 2001 From: Jasonzhang517 Date: Mon, 3 Mar 2025 02:32:30 +0000 Subject: [PATCH] fixed date filtering bugs and workflow bug resolved conflicts --- .github/workflows/scholar_classifier.yml | 2 +- scholar_classifier.py | 128 ++++++++++++++++------- tests/test_integration.py | 7 +- 3 files changed, 98 insertions(+), 39 deletions(-) diff --git a/.github/workflows/scholar_classifier.yml b/.github/workflows/scholar_classifier.yml index f7f0983..8211c98 100644 --- a/.github/workflows/scholar_classifier.yml +++ b/.github/workflows/scholar_classifier.yml @@ -31,5 +31,5 @@ jobs: GMAIL_USERNAME: ${{ secrets.GMAIL_USERNAME }} GMAIL_APP_PASSWORD: ${{ secrets.GMAIL_APP_PASSWORD }} SLACK_API_TOKEN: ${{ secrets.SLACK_API_TOKEN }} - PERPLEXITY_API_KEY: ${{ secrets.PERPLEXITY_API_KEY }} + PPLX_API_KEY: ${{ secrets.PPLX_API_KEY }} run: python scholar_classifier.py diff --git a/scholar_classifier.py b/scholar_classifier.py index 6de358e..5d7988a 100644 --- a/scholar_classifier.py +++ b/scholar_classifier.py @@ -30,6 +30,7 @@ import os import urllib.parse from string import Template +from datetime import datetime, timedelta import html2text import yaml @@ -79,7 +80,7 @@ class ScholarClassifier: - Sending weekly updates to system channels """ - def __init__(self, config_file=None, config_dict=None, pplx_client=None, slack_notifier=None): + def __init__(self, config_file=None, config_dict=None, pplx_client=None, slack_notifier=None, debug_mode=False): """ Initialize the classifier with configuration and clients. @@ -88,6 +89,7 @@ def __init__(self, config_file=None, config_dict=None, pplx_client=None, slack_n config_dict: Configuration dictionary (optional) pplx_client: Pre-configured Perplexity client (optional) slack_notifier: Pre-configured Slack notifier (optional) + debug_mode: If True, disable Slack notifications (default: False) Raises: ValueError: If neither config_file nor config_dict is provided @@ -132,6 +134,11 @@ def __init__(self, config_file=None, config_dict=None, pplx_client=None, slack_n self._processed_titles = set() self._processed_urls = set() + # Add debug mode flag + self.debug_mode = debug_mode + if self.debug_mode: + self.logger.info("Running in debug mode - Slack notifications disabled") + def _init_research_topics(self): """Initialize research topics from configuration.""" return [ResearchTopic(**topic_config) for topic_config in self.config["research_topics"]] @@ -272,7 +279,7 @@ def extract_and_classify_papers(self, email_message): self.logger.info("No papers found in email") return [] - # Filter out duplicates before processing + # Filter out duplicates and patents before processing filtered_papers = [] for paper in papers: title = paper.title.lower().strip() @@ -286,6 +293,14 @@ def extract_and_classify_papers(self, email_message): self.logger.info(f"Skipping duplicate paper (by URL): {paper.title}") continue + # Skip if it's a patent + if any(word in title.lower() for word in ['patent', 'apparatus', 'method and system']): + self.logger.info(f"Skipping patent: {paper.title}") + continue + if 'patent' in paper.authors.lower(): + self.logger.info(f"Skipping patent (from authors): {paper.title}") + continue + # Add to tracking sets self._processed_titles.add(title) if url: @@ -294,7 +309,7 @@ def extract_and_classify_papers(self, email_message): filtered_papers.append(paper) self.logger.info( - f"Found {len(papers)} papers, {len(filtered_papers)} after duplicate filtering" + f"Found {len(papers)} papers, {len(filtered_papers)} after filtering out duplicates and patents" ) results = [] @@ -430,33 +445,34 @@ def extract_and_classify_papers(self, email_message): def _build_email_search_query(self): """Build IMAP search query based on search criteria.""" - from datetime import datetime, timedelta + # Load search criteria with open("search_criteria.yml", "r") as f: criteria = yaml.safe_load(f)["email_filter"] - - # Base query parts - query_parts = [f'FROM "{criteria["from"]}"', f'SUBJECT "{criteria["subject"]}"'] - - # Add time window - if criteria["time_window"]: - - # Parse time window - amount = int(criteria["time_window"][:-1]) - unit = criteria["time_window"][-1] - - if unit == "D": - delta = timedelta(days=amount) - elif unit == "W": - delta = timedelta(weeks=amount) - elif unit == "M": - delta = timedelta(days=amount * 30) - - # Calculate date range - since_date = datetime.now() - delta - date_str = since_date.strftime("%d-%b-%Y") - query_parts.append(f'SINCE "{date_str}"') - - return " ".join(query_parts) + # Parse time window + time_window = criteria["time_window"] + amount = int(time_window[:-1]) # get number + unit = time_window[-1] # get unit (D/W/M) + # Calculate date range + end_date = datetime.now() + if unit == "D": + delta = timedelta(days=amount) + elif unit == "W": + delta = timedelta(weeks=amount) + elif unit == "M": + delta = timedelta(days=amount * 30) # approximate + else: + raise ValueError(f"Invalid time window unit: {unit}") + start_date = end_date - delta + # Format dates for IMAP query + since_date = start_date.strftime("%d-%b-%Y") + before_date = end_date.strftime("%d-%b-%Y") + # Build query with date restriction + query = f'(FROM "{criteria["from"]}" SUBJECT "{criteria["subject"]}" SINCE "{since_date}" BEFORE "{before_date}")' + # Log the query for debugging + self.logger.info(f"Using search query: {query}") + self.logger.info(f"Time window: {time_window}") + self.logger.info(f"Date range: {since_date} to {before_date}") + return query def run(self, folder=None): """Main execution loop.""" @@ -474,6 +490,7 @@ def run(self, folder=None): if status != "OK": self.logger.error(f"Failed to select folder {folder_name}: {folder_info}") return + # Build and execute search query search_query = self._build_email_search_query() self.logger.info(f"Using search query: {search_query}") @@ -484,23 +501,46 @@ def run(self, folder=None): self.logger.info(f"Searching in folder: {folder}") - processed_papers = False + total_papers = 0 + processed_emails = 0 + self.logger.info("\n=== Starting Paper Processing ===") for num in message_numbers[0].split(): + processed_emails += 1 _, msg_data = mail.fetch(num, "(RFC822)") email_body = msg_data[0][1] email_message = email.message_from_bytes(email_body) + self.logger.info(f"\nProcessing email {processed_emails}: {email_message['subject']}") # Extract and classify papers in one step paper_results = self.extract_and_classify_papers(email_message) # Send notifications using slack_notifier directly if paper_results: - processed_papers = True - if self.slack_notifier: + total_papers += len(paper_results) + if self.slack_notifier and not self.debug_mode: # Only send if not in debug mode self.slack_notifier.notify_matches(paper_results) - - # Send weekly update if papers were processed - if processed_papers: + else: + # Print paper results to console instead + self.logger.info(f"\n=== Paper Results (Email {processed_emails}) ===") + for i, (paper, topics) in enumerate(paper_results, 1): + self.logger.info( + f"\nPaper {total_papers - len(paper_results) + i}/{total_papers} " + f"(email {i}/{len(paper_results)})" + ) + self.logger.info(f"Title: {paper.title}") + self.logger.info(f"Authors: {paper.authors}") + self.logger.info(f"Venue: {paper.venue}") + self.logger.info(f"URL: {paper.url}") + self.logger.info(f"Matched Topics: {[t.name for t in topics]}") + self.logger.info(f"Abstract: {paper.abstract[:200]}...") + self.logger.info("-" * 80) + + self.logger.info("\n=== Processing Complete ===") + self.logger.info(f"Processed {processed_emails} emails") + self.logger.info(f"Total papers extracted: {total_papers}") + + # Skip weekly update in debug mode + if total_papers > 0 and not self.debug_mode: self.send_weekly_update_notification() mail.logout() @@ -674,6 +714,9 @@ def _load_research_topics(self): def send_weekly_update_notification(self): """Send notifications to systems channels about weekly paper updates.""" + if self.debug_mode: + self.logger.info("Debug mode: Skipping weekly update notification") + return def format_topic_summary(papers_by_topic): summary = [] @@ -725,6 +768,18 @@ def format_topic_summary(papers_by_topic): if __name__ == "__main__": + import argparse + + # Set up argument parser + parser = argparse.ArgumentParser(description='Run the Scholar Classifier') + parser.add_argument( + '--debug', + action='store_true', # This makes it a flag that's either True or False + default=False, # Default value is False + help='Run in debug mode (disable Slack notifications)' + ) + args = parser.parse_args() + # Set up logging logging.basicConfig(level=logging.DEBUG) logger = logging.getLogger(__name__) @@ -732,9 +787,12 @@ def format_topic_summary(papers_by_topic): # Load environment variables load_dotenv() + # Log debug mode status + logger.info(f"Debug mode: {'enabled' if args.debug else 'disabled'}") + # Load config and print relevant parts (without sensitive data) logger.info("Loading configuration...") - classifier = ScholarClassifier(config_file="config.yml") + classifier = ScholarClassifier(config_file="config.yml", debug_mode=args.debug) # Print config structure (without passwords) safe_config = classifier.config.copy() diff --git a/tests/test_integration.py b/tests/test_integration.py index d29f42d..f939b0a 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -23,10 +23,10 @@ from scholar_classifier import ScholarClassifier -class TestGmailIntegration(unittest.TestCase): +class TestIntegration(unittest.TestCase): @classmethod def setUpClass(cls): - """Set up test environment.""" + """Set up test environment for full pipeline integration testing.""" # Setup detailed logging logging.basicConfig( level=logging.DEBUG, @@ -63,6 +63,7 @@ def setUpClass(cls): def test_end_to_end_pipeline(self): """Test the entire pipeline from Gmail connection to paper classification.""" try: + # Use the main program's methods mail = self.classifier.connect_to_gmail() # for dmitrii's account @@ -83,7 +84,7 @@ def test_end_to_end_pipeline(self): messages = message_numbers[0].split() self.assertTrue(len(messages) > 0, "No Google Scholar alert emails found") - print(f"\nFound {len(messages)} Google Scholar alert emails from Dec 23, 2024") + print(f"\nFound {len(messages)} Google Scholar alert emails from Jan 05, 2025") # Use messages directly since they're already filtered test_messages = messages