diff --git a/scholar_classifier.py b/scholar_classifier.py index 3501f80..4fadcc3 100644 --- a/scholar_classifier.py +++ b/scholar_classifier.py @@ -279,7 +279,7 @@ def extract_and_classify_papers(self, email_message): self.logger.info("No papers found in email") return [] - # Filter out duplicates before processing + # Filter out duplicates and patents before processing filtered_papers = [] for paper in papers: title = paper.title.lower().strip() @@ -293,6 +293,14 @@ def extract_and_classify_papers(self, email_message): self.logger.info(f"Skipping duplicate paper (by URL): {paper.title}") continue + # Skip if it's a patent + if any(word in title.lower() for word in ['patent', 'apparatus', 'method and system']): + self.logger.info(f"Skipping patent: {paper.title}") + continue + if 'patent' in paper.authors.lower(): + self.logger.info(f"Skipping patent (from authors): {paper.title}") + continue + # Add to tracking sets self._processed_titles.add(title) if url: @@ -301,7 +309,7 @@ def extract_and_classify_papers(self, email_message): filtered_papers.append(paper) self.logger.info( - f"Found {len(papers)} papers, {len(filtered_papers)} after duplicate filtering" + f"Found {len(papers)} papers, {len(filtered_papers)} after filtering out duplicates and patents" ) results = [] @@ -760,6 +768,18 @@ def format_topic_summary(papers_by_topic): if __name__ == "__main__": + import argparse + + # Set up argument parser + parser = argparse.ArgumentParser(description='Run the Scholar Classifier') + parser.add_argument( + '--debug', + action='store_true', # This makes it a flag that's either True or False + default=False, # Default value is False + help='Run in debug mode (disable Slack notifications)' + ) + args = parser.parse_args() + # Set up logging logging.basicConfig(level=logging.DEBUG) logger = logging.getLogger(__name__) @@ -767,10 +787,12 @@ def format_topic_summary(papers_by_topic): # Load environment variables load_dotenv() + # Log debug mode status + logger.info(f"Debug mode: {'enabled' if args.debug else 'disabled'}") + # Load config and print relevant parts (without sensitive data) logger.info("Loading configuration...") - classifier = ScholarClassifier(config_file="config.yml", debug_mode=True) # Enable debug mode - + classifier = ScholarClassifier(config_file="config.yml", debug_mode=args.debug) # Print config structure (without passwords) safe_config = classifier.config.copy() safe_config["email"]["password"] = f"<{len(safe_config['email']['password'])} chars>" diff --git a/tests/test_integration.py b/tests/test_integration.py index d29f42d..f939b0a 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -23,10 +23,10 @@ from scholar_classifier import ScholarClassifier -class TestGmailIntegration(unittest.TestCase): +class TestIntegration(unittest.TestCase): @classmethod def setUpClass(cls): - """Set up test environment.""" + """Set up test environment for full pipeline integration testing.""" # Setup detailed logging logging.basicConfig( level=logging.DEBUG, @@ -63,6 +63,7 @@ def setUpClass(cls): def test_end_to_end_pipeline(self): """Test the entire pipeline from Gmail connection to paper classification.""" try: + # Use the main program's methods mail = self.classifier.connect_to_gmail() # for dmitrii's account @@ -83,7 +84,7 @@ def test_end_to_end_pipeline(self): messages = message_numbers[0].split() self.assertTrue(len(messages) > 0, "No Google Scholar alert emails found") - print(f"\nFound {len(messages)} Google Scholar alert emails from Dec 23, 2024") + print(f"\nFound {len(messages)} Google Scholar alert emails from Jan 05, 2025") # Use messages directly since they're already filtered test_messages = messages