From 9e9047aa32fceaa794dbdc604728567c26a7da32 Mon Sep 17 00:00:00 2001
From: Jasonzhang517 <yzhang298@e.ntu.edu.sg>
Date: Mon, 3 Mar 2025 02:32:30 +0000
Subject: [PATCH] fixed date filtering bugs and workflow bug

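Resolved merge conflicts. This patch also:
- renames the workflow secret PERPLEXITY_API_KEY to PPLX_API_KEY
- bounds the IMAP search with SINCE/BEFORE and rejects unknown time window units
- skips papers whose title or authors look like patents
- adds a --debug flag that disables Slack notifications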
---
 .github/workflows/scholar_classifier.yml |   2 +-
 scholar_classifier.py                    | 128 ++++++++++++++++-------
 tests/test_integration.py                |   7 +-
 3 files changed, 98 insertions(+), 39 deletions(-)

diff --git a/.github/workflows/scholar_classifier.yml b/.github/workflows/scholar_classifier.yml
index f7f0983..8211c98 100644
--- a/.github/workflows/scholar_classifier.yml
+++ b/.github/workflows/scholar_classifier.yml
@@ -31,5 +31,5 @@ jobs:
         GMAIL_USERNAME: ${{ secrets.GMAIL_USERNAME }}
         GMAIL_APP_PASSWORD: ${{ secrets.GMAIL_APP_PASSWORD }}
         SLACK_API_TOKEN: ${{ secrets.SLACK_API_TOKEN }}
-        PERPLEXITY_API_KEY: ${{ secrets.PERPLEXITY_API_KEY }}
+        PPLX_API_KEY: ${{ secrets.PPLX_API_KEY }}
       run: python scholar_classifier.py
diff --git a/scholar_classifier.py b/scholar_classifier.py
index 6de358e..5d7988a 100644
--- a/scholar_classifier.py
+++ b/scholar_classifier.py
@@ -30,6 +30,7 @@
 import os
 import urllib.parse
 from string import Template
+from datetime import datetime, timedelta
 
 import html2text
 import yaml
@@ -79,7 +80,7 @@ class ScholarClassifier:
     - Sending weekly updates to system channels
     """
 
-    def __init__(self, config_file=None, config_dict=None, pplx_client=None, slack_notifier=None):
+    def __init__(self, config_file=None, config_dict=None, pplx_client=None, slack_notifier=None, debug_mode=False):
         """
         Initialize the classifier with configuration and clients.
 
@@ -88,6 +89,7 @@ def __init__(self, config_file=None, config_dict=None, pplx_client=None, slack_n
             config_dict: Configuration dictionary (optional)
             pplx_client: Pre-configured Perplexity client (optional)
             slack_notifier: Pre-configured Slack notifier (optional)
+            debug_mode: If True, disable Slack notifications (default: False)
 
         Raises:
             ValueError: If neither config_file nor config_dict is provided
@@ -132,6 +134,11 @@ def __init__(self, config_file=None, config_dict=None, pplx_client=None, slack_n
         self._processed_titles = set()
         self._processed_urls = set()
 
+        # Add debug mode flag
+        self.debug_mode = debug_mode
+        if self.debug_mode:
+            self.logger.info("Running in debug mode - Slack notifications disabled")
+
     def _init_research_topics(self):
         """Initialize research topics from configuration."""
         return [ResearchTopic(**topic_config) for topic_config in self.config["research_topics"]]
@@ -272,7 +279,7 @@ def extract_and_classify_papers(self, email_message):
             self.logger.info("No papers found in email")
             return []
 
-        # Filter out duplicates before processing
+        # Filter out duplicates and patents before processing
         filtered_papers = []
         for paper in papers:
             title = paper.title.lower().strip()
@@ -286,6 +293,14 @@ def extract_and_classify_papers(self, email_message):
                 self.logger.info(f"Skipping duplicate paper (by URL): {paper.title}")
                 continue
 
+            # Skip if it's a patent
+            if any(word in title.lower() for word in ['patent', 'apparatus', 'method and system']):
+                self.logger.info(f"Skipping patent: {paper.title}")
+                continue
+            if 'patent' in paper.authors.lower():
+                self.logger.info(f"Skipping patent (from authors): {paper.title}")
+                continue
+
             # Add to tracking sets
             self._processed_titles.add(title)
             if url:
@@ -294,7 +309,7 @@ def extract_and_classify_papers(self, email_message):
             filtered_papers.append(paper)
 
         self.logger.info(
-            f"Found {len(papers)} papers, {len(filtered_papers)} after duplicate filtering"
+            f"Found {len(papers)} papers, {len(filtered_papers)} after filtering out duplicates and patents"
         )
 
         results = []
@@ -430,33 +445,34 @@ def extract_and_classify_papers(self, email_message):
 
     def _build_email_search_query(self):
         """Build IMAP search query based on search criteria."""
-        from datetime import datetime, timedelta
+        # Load search criteria
         with open("search_criteria.yml", "r") as f:
             criteria = yaml.safe_load(f)["email_filter"]
-
-        # Base query parts
-        query_parts = [f'FROM "{criteria["from"]}"', f'SUBJECT "{criteria["subject"]}"']
-
-        # Add time window
-        if criteria["time_window"]:
-
-            # Parse time window
-            amount = int(criteria["time_window"][:-1])
-            unit = criteria["time_window"][-1]
-
-            if unit == "D":
-                delta = timedelta(days=amount)
-            elif unit == "W":
-                delta = timedelta(weeks=amount)
-            elif unit == "M":
-                delta = timedelta(days=amount * 30)
-
-            # Calculate date range
-            since_date = datetime.now() - delta
-            date_str = since_date.strftime("%d-%b-%Y")
-            query_parts.append(f'SINCE "{date_str}"')
-
-        return " ".join(query_parts)
+        # Base query; the date restriction is appended only when a window is set
+        time_window = criteria["time_window"]
+        query_parts = [f'FROM "{criteria["from"]}"', f'SUBJECT "{criteria["subject"]}"']
+        if time_window:
+            # A window is a number followed by a unit, e.g. "7D" (D/W/M)
+            amount = int(time_window[:-1])
+            unit = time_window[-1]
+            if unit == "D":
+                delta = timedelta(days=amount)
+            elif unit == "W":
+                delta = timedelta(weeks=amount)
+            elif unit == "M":
+                delta = timedelta(days=amount * 30)  # approximate
+            else:
+                raise ValueError(f"Invalid time window unit: {unit}")
+            end_date = datetime.now()
+            # NOTE(review): IMAP BEFORE is exclusive, so mail dated today is not matched
+            since_date = (end_date - delta).strftime("%d-%b-%Y")
+            before_date = end_date.strftime("%d-%b-%Y")
+            query_parts.append(f'SINCE "{since_date}" BEFORE "{before_date}"')
+            self.logger.info(f"Date range: {since_date} to {before_date}")
+        query = f'({" ".join(query_parts)})'
+        self.logger.info(f"Using search query: {query}")
+        self.logger.info(f"Time window: {time_window}")
+        return query
 
     def run(self, folder=None):
         """Main execution loop."""
@@ -474,6 +490,7 @@ def run(self, folder=None):
             if status != "OK":
                 self.logger.error(f"Failed to select folder {folder_name}: {folder_info}")
                 return
+
             # Build and execute search query
             search_query = self._build_email_search_query()
             self.logger.info(f"Using search query: {search_query}")
@@ -484,23 +501,46 @@ def run(self, folder=None):
 
             self.logger.info(f"Searching in folder: {folder}")
 
-            processed_papers = False
+            total_papers = 0
+            processed_emails = 0
+            self.logger.info("\n=== Starting Paper Processing ===")
             for num in message_numbers[0].split():
+                processed_emails += 1
                 _, msg_data = mail.fetch(num, "(RFC822)")
                 email_body = msg_data[0][1]
                 email_message = email.message_from_bytes(email_body)
+                self.logger.info(f"\nProcessing email {processed_emails}: {email_message['subject']}")
 
                 # Extract and classify papers in one step
                 paper_results = self.extract_and_classify_papers(email_message)
 
                 # Send notifications using slack_notifier directly
                 if paper_results:
-                    processed_papers = True
-                    if self.slack_notifier:
+                    total_papers += len(paper_results)
+                    if self.slack_notifier and not self.debug_mode:  # Only send if not in debug mode
                         self.slack_notifier.notify_matches(paper_results)
-
-            # Send weekly update if papers were processed
-            if processed_papers:
+                    else:
+                        # Print paper results to console instead
+                        self.logger.info(f"\n=== Paper Results (Email {processed_emails}) ===")
+                        for i, (paper, topics) in enumerate(paper_results, 1):
+                            self.logger.info(
+                                f"\nPaper {total_papers - len(paper_results) + i}/{total_papers} "
+                                f"(email {i}/{len(paper_results)})"
+                            )
+                            self.logger.info(f"Title: {paper.title}")
+                            self.logger.info(f"Authors: {paper.authors}")
+                            self.logger.info(f"Venue: {paper.venue}")
+                            self.logger.info(f"URL: {paper.url}")
+                            self.logger.info(f"Matched Topics: {[t.name for t in topics]}")
+                            self.logger.info(f"Abstract: {paper.abstract[:200]}...")
+                            self.logger.info("-" * 80)
+
+            self.logger.info("\n=== Processing Complete ===")
+            self.logger.info(f"Processed {processed_emails} emails")
+            self.logger.info(f"Total papers extracted: {total_papers}")
+
+            # Skip weekly update in debug mode
+            if total_papers > 0 and not self.debug_mode:
                 self.send_weekly_update_notification()
 
             mail.logout()
@@ -674,6 +714,9 @@ def _load_research_topics(self):
 
     def send_weekly_update_notification(self):
         """Send notifications to systems channels about weekly paper updates."""
+        if self.debug_mode:
+            self.logger.info("Debug mode: Skipping weekly update notification")
+            return
 
         def format_topic_summary(papers_by_topic):
             summary = []
@@ -725,6 +768,18 @@ def format_topic_summary(papers_by_topic):
 
 
 if __name__ == "__main__":
+    import argparse
+
+    # Set up argument parser
+    parser = argparse.ArgumentParser(description='Run the Scholar Classifier')
+    parser.add_argument(
+        '--debug',
+        action='store_true',  # This makes it a flag that's either True or False
+        default=False,        # Default value is False
+        help='Run in debug mode (disable Slack notifications)'
+    )
+    args = parser.parse_args()
+
     # Set up logging
     logging.basicConfig(level=logging.DEBUG)
     logger = logging.getLogger(__name__)
@@ -732,9 +787,12 @@ def format_topic_summary(papers_by_topic):
     # Load environment variables
     load_dotenv()
 
+    # Log debug mode status
+    logger.info(f"Debug mode: {'enabled' if args.debug else 'disabled'}")
+
     # Load config and print relevant parts (without sensitive data)
     logger.info("Loading configuration...")
-    classifier = ScholarClassifier(config_file="config.yml")
+    classifier = ScholarClassifier(config_file="config.yml", debug_mode=args.debug)
 
     # Print config structure (without passwords)
     safe_config = classifier.config.copy()
diff --git a/tests/test_integration.py b/tests/test_integration.py
index d29f42d..f939b0a 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -23,10 +23,10 @@
 from scholar_classifier import ScholarClassifier
 
 
-class TestGmailIntegration(unittest.TestCase):
+class TestIntegration(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        """Set up test environment."""
+        """Set up test environment for full pipeline integration testing."""
         # Setup detailed logging
         logging.basicConfig(
             level=logging.DEBUG,
@@ -63,6 +63,7 @@ def setUpClass(cls):
     def test_end_to_end_pipeline(self):
         """Test the entire pipeline from Gmail connection to paper classification."""
         try:
+            # Use the main program's methods
             mail = self.classifier.connect_to_gmail()
 
             # for dmitrii's account
@@ -83,7 +84,7 @@ def test_end_to_end_pipeline(self):
 
             messages = message_numbers[0].split()
             self.assertTrue(len(messages) > 0, "No Google Scholar alert emails found")
-            print(f"\nFound {len(messages)} Google Scholar alert emails from Dec 23, 2024")
+            print(f"\nFound {len(messages)} Google Scholar alert emails from Jan 05, 2025")
 
             # Use messages directly since they're already filtered
             test_messages = messages