Skip to content

Commit 6827f2a

Browse files
feat: Handle arXiv preprints separately from journal/conference assessment (fixes #72) (#89)
* feat: Handle arXiv preprints separately from journal/conference assessment (fixes #72) This change prevents arXiv preprints from being assessed as journals or conferences, which was causing inflated 'unknown' statistics and wasting backend query resources on non-venues. Changes: - Add _is_arxiv_entry() method to detect arXiv patterns in BibTeX entries - Skip arXiv entries during parsing and track them separately - Update BibtexAssessmentResult model with arxiv_entries_count and skipped_entries_count fields - Modified parse_bibtex_file() to return tuple with separate counts - Enhanced summary output to show arXiv entries separately - Add comprehensive test coverage for arXiv detection Detection patterns: - "arXiv preprint arXiv:XXXX.XXXXX" - "ArXiv e-prints" - Bare arXiv identifiers "arXiv:XXXX.XXXXX" - Old-style arXiv IDs "arXiv:cs.AI/9901001" - Misc entries containing "arxiv" Fixes inflated 'unknown' assessment statistics by properly categorizing arXiv preprints as a separate class of entries rather than attempting to assess them as publication venues. * refactor: Improve user-facing logging for arXiv preprint filtering Enhanced the logging output to make it clearer when arXiv preprints are being skipped during BibTeX parsing. Changes: - Split logging into separate, clearer messages - Use status_logger.info() for arXiv skipping (visible to users) - Clarify that arXiv entries are "not publication venues" - Remove confusing "problematic entries" wording - Keep technical parsing details in detail_logger.debug() Before: "Successfully parsed 11 entries [...] skipped 80 problematic entries (59 arXiv, 21 other)" After: "Skipped 59 arXiv preprint(s) - not publication venues" This addresses user confusion about why entries with journal fields containing "arXiv preprint arXiv:..." are being excluded from assessment. --------- Co-authored-by: florath-ai-assistant[bot] <Andreas.Florath@telekom.de>
1 parent 6e41a93 commit 6827f2a

File tree

4 files changed

+184
-33
lines changed

4 files changed

+184
-33
lines changed

src/aletheia_probe/batch_assessor.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -75,21 +75,28 @@ async def assess_bibtex_file(
7575

7676
# Parse the BibTeX file to extract journal entries
7777
try:
78-
bibtex_entries = BibtexParser.parse_bibtex_file(file_path, relax_bibtex)
79-
detail_logger.debug(f"Successfully parsed {len(bibtex_entries)} entries")
78+
bibtex_entries, skipped_count, arxiv_count = BibtexParser.parse_bibtex_file(
79+
file_path, relax_bibtex
80+
)
81+
detail_logger.debug(
82+
f"Successfully parsed {len(bibtex_entries)} entries, skipped {skipped_count}, found {arxiv_count} arXiv entries"
83+
)
8084
except Exception as e:
8185
detail_logger.error(f"Failed to parse BibTeX file: {e}")
8286
raise ValueError(f"Failed to parse BibTeX file: {e}") from e
8387

88+
total_entries = len(bibtex_entries) + skipped_count + arxiv_count
8489
status_logger.info(
8590
f"Found {len(bibtex_entries)} entries with journal information"
8691
)
8792

8893
# Prepare result object
8994
result = BibtexAssessmentResult(
9095
file_path=str(file_path),
91-
total_entries=len(bibtex_entries),
96+
total_entries=total_entries,
9297
entries_with_journals=len(bibtex_entries),
98+
arxiv_entries_count=arxiv_count,
99+
skipped_entries_count=skipped_count,
93100
predatory_count=0,
94101
legitimate_count=0,
95102
insufficient_data_count=0,
@@ -260,7 +267,16 @@ def format_summary(result: BibtexAssessmentResult, verbose: bool = False) -> str
260267
summary_lines.append("BibTeX Assessment Summary")
261268
summary_lines.append("=" * 40)
262269
summary_lines.append(f"File: {result.file_path}")
263-
summary_lines.append(f"Total entries processed: {result.total_entries}")
270+
summary_lines.append(f"Total entries in file: {result.total_entries}")
271+
summary_lines.append(f"Entries assessed: {result.entries_with_journals}")
272+
if result.arxiv_entries_count > 0:
273+
summary_lines.append(
274+
f"Skipped arXiv preprints: {result.arxiv_entries_count}"
275+
)
276+
if result.skipped_entries_count > 0:
277+
summary_lines.append(
278+
f"Skipped other entries: {result.skipped_entries_count}"
279+
)
264280
summary_lines.append(f"Processing time: {result.processing_time:.2f}s")
265281
summary_lines.append("")
266282

src/aletheia_probe/bibtex_parser.py

Lines changed: 92 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ class BibtexParser:
2626
@staticmethod
2727
def parse_bibtex_file(
2828
file_path: Path, relax_parsing: bool = False
29-
) -> list[BibtexEntry]:
29+
) -> tuple[list[BibtexEntry], int, int]:
3030
"""Parse a BibTeX file and extract journal entries.
3131
3232
This method tries multiple encoding strategies to maximize the number
@@ -38,7 +38,10 @@ def parse_bibtex_file(
3838
malformed BibTeX files (e.g., duplicate keys, syntax errors)
3939
4040
Returns:
41-
List of BibtexEntry objects with extracted journal information
41+
A tuple containing:
42+
- List of BibtexEntry objects with extracted journal information
43+
- Number of entries skipped (excluding arXiv)
44+
- Number of arXiv entries detected and skipped
4245
4346
Raises:
4447
FileNotFoundError: If the BibTeX file doesn't exist
@@ -89,9 +92,18 @@ def parse_bibtex_file(
8992

9093
entries = []
9194
skipped_entries = 0
95+
arxiv_entries = 0
9296

9397
for entry_key, entry in bib_data.entries.items():
9498
try:
99+
# First, check for arXiv entries to correctly categorize skipped entries
100+
if BibtexParser._is_arxiv_entry(entry):
101+
arxiv_entries += 1
102+
detail_logger.debug(
103+
f"Skipping arXiv entry: {entry_key}"
104+
)
105+
continue
106+
95107
# Extract each entry with individual error handling
96108
processed_entry = BibtexParser._process_entry_safely(
97109
entry_key, entry
@@ -107,18 +119,25 @@ def parse_bibtex_file(
107119
skipped_entries += 1
108120
continue
109121

110-
if skipped_entries > 0:
122+
# Log parsing results with clear messaging
123+
detail_logger.debug(
124+
f"Successfully parsed {len(entries)} entries from {file_path.name} "
125+
f"with {description}"
126+
)
127+
128+
# Inform user about arXiv preprints (if any)
129+
if arxiv_entries > 0:
111130
status_logger.info(
112-
f"Successfully parsed {len(entries)} entries from {file_path.name} "
113-
f"with {description}, skipped {skipped_entries} problematic entries"
131+
f"Skipped {arxiv_entries} arXiv preprint(s) - not publication venues"
114132
)
115-
else:
133+
134+
# Log other skipped entries
135+
if skipped_entries > 0:
116136
detail_logger.debug(
117-
f"Successfully parsed {len(entries)} entries from {file_path.name} "
118-
f"with {description}"
137+
f"Skipped {skipped_entries} other entries due to processing errors"
119138
)
120139

121-
return entries
140+
return entries, skipped_entries, arxiv_entries
122141

123142
except UnicodeDecodeError as e:
124143
last_error = e
@@ -187,6 +206,10 @@ def _process_entry_safely(entry_key: str, entry: Entry) -> BibtexEntry | None:
187206
venue_name = BibtexParser._extract_journal_name(entry)
188207

189208
if not venue_name:
209+
# This can happen if the entry is an arXiv preprint or if it's a non-journal/conference type
210+
detail_logger.debug(
211+
f"Skipping entry '{entry_key}' because no venue name could be extracted."
212+
)
190213
return None
191214

192215
return BibtexEntry(
@@ -472,3 +495,63 @@ def _remove_nested_braces(value: str) -> str:
472495
value = re.sub(r"\{([^{}]*)\}", r"\1", value)
473496

474497
return value.strip()
498+
499+
@staticmethod
def _is_arxiv_entry(entry: Entry) -> bool:
    """Detect whether a BibTeX entry is an arXiv preprint.

    Checks the 'journal', 'booktitle', 'eprint', and 'title' fields for
    common arXiv patterns so that preprints can be skipped instead of
    being assessed as publication venues.

    Args:
        entry: BibTeX entry object.

    Returns:
        True if the entry is identified as an arXiv preprint, False otherwise.
    """
    import re

    # Fields that may reveal an arXiv preprint.  'journal' and 'booktitle'
    # usually hold the (pseudo-)venue name; 'eprint' is a direct indicator;
    # 'title' is checked because poorly formatted entries sometimes embed
    # the identifier there.
    fields_to_check = [
        BibtexParser._get_field_safely(entry, "journal"),
        BibtexParser._get_field_safely(entry, "booktitle"),
        BibtexParser._get_field_safely(entry, "eprint"),
        BibtexParser._get_field_safely(entry, "title"),
    ]

    # Lowercase once so every pattern below can be written in lowercase
    # (no need for re.IGNORECASE on already-normalized content).
    checked_content = " ".join(f.lower() for f in fields_to_check if f is not None)

    # Patterns identifying arXiv entries:
    # - "arXiv preprint arXiv:XXXX.XXXXX"
    # - "ArXiv e-prints"
    # - Bare new-style identifiers "arXiv:YYMM.NNNN(N)".  NOTE: sequence
    #   numbers are 4 digits for IDs assigned before 2015 and 5 digits from
    #   2015 onwards, hence \d{4,5} (a bare \d{5} would miss e.g.
    #   arXiv:1301.3781).
    # - Old-style identifiers such as "arXiv:cs.AI/9901001"
    arxiv_patterns = [
        r"arxiv\s+preprint\s+arxiv:\d{4}\.\d{4,5}(v\d+)?",
        r"arxiv\s+e-prints",
        r"arxiv:\d{4}\.\d{4,5}(v\d+)?",  # bare new-style identifier
        r"arxiv:\w+\.\w+(v\d+)?",  # old-style, e.g. arXiv:cs.AI/9901001
    ]

    for pattern in arxiv_patterns:
        if re.search(pattern, checked_content):
            detail_logger.debug(
                f"Detected arXiv pattern '{pattern}' in entry: {entry.key}"
            )
            return True

    # Direct check of the 'eprint' field value.  (A previous pattern
    # "eprint:\s*arxiv" could never match, because only field *values* —
    # never the field name "eprint:" — are joined into checked_content.)
    # This catches entries exported as eprint = {arXiv:...} or {arXiv}.
    eprint_value = BibtexParser._get_field_safely(entry, "eprint")
    if eprint_value is not None and "arxiv" in eprint_value.lower():
        detail_logger.debug(
            f"Detected arXiv 'eprint' field in entry: {entry.key}"
        )
        return True

    # A @misc entry mentioning "arxiv" anywhere in the checked fields is
    # treated as a preprint (common export shape for arXiv references).
    if entry.type.lower() == "misc":
        if "arxiv" in checked_content:
            detail_logger.debug(f"Detected arXiv in 'misc' type entry: {entry.key}")
            return True

    return False

src/aletheia_probe/models.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,12 @@ class BibtexAssessmentResult(BaseModel):
180180
entries_with_journals: int = Field(
181181
..., description="Number of entries with identifiable journals"
182182
)
183+
arxiv_entries_count: int = Field(
184+
0, description="Number of entries identified as arXiv preprints"
185+
)
186+
skipped_entries_count: int = Field(
187+
0, description="Number of entries skipped for other reasons"
188+
)
183189
assessment_results: list[tuple[BibtexEntry, AssessmentResult]] = Field(
184190
default_factory=list, description="List of (entry, assessment) pairs"
185191
)

0 commit comments

Comments
 (0)