Skip to content

fixed date filtering bugs and workflow bug #11

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 26 additions & 4 deletions scholar_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,7 +279,7 @@ def extract_and_classify_papers(self, email_message):
self.logger.info("No papers found in email")
return []

# Filter out duplicates before processing
# Filter out duplicates and patents before processing
filtered_papers = []
for paper in papers:
title = paper.title.lower().strip()
Expand All @@ -293,6 +293,14 @@ def extract_and_classify_papers(self, email_message):
self.logger.info(f"Skipping duplicate paper (by URL): {paper.title}")
continue

# Skip if it's a patent
if any(word in title.lower() for word in ['patent', 'apparatus', 'method and system']):
self.logger.info(f"Skipping patent: {paper.title}")
continue
if 'patent' in paper.authors.lower():
self.logger.info(f"Skipping patent (from authors): {paper.title}")
continue

# Add to tracking sets
self._processed_titles.add(title)
if url:
Expand All @@ -301,7 +309,7 @@ def extract_and_classify_papers(self, email_message):
filtered_papers.append(paper)

self.logger.info(
f"Found {len(papers)} papers, {len(filtered_papers)} after duplicate filtering"
f"Found {len(papers)} papers, {len(filtered_papers)} after filtering out duplicates and patents"
)

results = []
Expand Down Expand Up @@ -760,17 +768,31 @@ def format_topic_summary(papers_by_topic):


if __name__ == "__main__":
import argparse

# Set up argument parser
parser = argparse.ArgumentParser(description='Run the Scholar Classifier')
parser.add_argument(
'--debug',
action='store_true', # This makes it a flag that's either True or False
default=False, # Default value is False
help='Run in debug mode (disable Slack notifications)'
)
args = parser.parse_args()

# Set up logging
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

# Load environment variables
load_dotenv()

# Log debug mode status
logger.info(f"Debug mode: {'enabled' if args.debug else 'disabled'}")

# Load config and print relevant parts (without sensitive data)
logger.info("Loading configuration...")
classifier = ScholarClassifier(config_file="config.yml", debug_mode=True) # Enable debug mode

classifier = ScholarClassifier(config_file="config.yml", debug_mode=args.debug)
# Print config structure (without passwords)
safe_config = classifier.config.copy()
safe_config["email"]["password"] = f"<{len(safe_config['email']['password'])} chars>"
Expand Down
7 changes: 4 additions & 3 deletions tests/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,10 @@
from scholar_classifier import ScholarClassifier


class TestGmailIntegration(unittest.TestCase):
class TestIntegration(unittest.TestCase):
@classmethod
def setUpClass(cls):
"""Set up test environment."""
"""Set up test environment for full pipeline integration testing."""
# Setup detailed logging
logging.basicConfig(
level=logging.DEBUG,
Expand Down Expand Up @@ -63,6 +63,7 @@ def setUpClass(cls):
def test_end_to_end_pipeline(self):
"""Test the entire pipeline from Gmail connection to paper classification."""
try:
# Use the main program's methods
mail = self.classifier.connect_to_gmail()

# for dmitrii's account
Expand All @@ -83,7 +84,7 @@ def test_end_to_end_pipeline(self):

messages = message_numbers[0].split()
self.assertTrue(len(messages) > 0, "No Google Scholar alert emails found")
print(f"\nFound {len(messages)} Google Scholar alert emails from Dec 23, 2024")
print(f"\nFound {len(messages)} Google Scholar alert emails from Jan 05, 2025")

# Use messages directly since they're already filtered
test_messages = messages
Expand Down