Skip to content

Commit 6827f2a

Browse files
feat: Handle arXiv preprints separately from journal/conference assessment (fixes #72) (#89)
* feat: Handle arXiv preprints separately from journal/conference assessment (fixes #72) This change prevents arXiv preprints from being assessed as journals or conferences, which was causing inflated 'unknown' statistics and wasting backend query resources on non-venues. Changes: - Add _is_arxiv_entry() method to detect arXiv patterns in BibTeX entries - Skip arXiv entries during parsing and track them separately - Update BibtexAssessmentResult model with arxiv_entries_count and skipped_entries_count fields - Modified parse_bibtex_file() to return tuple with separate counts - Enhanced summary output to show arXiv entries separately - Add comprehensive test coverage for arXiv detection Detection patterns: - "arXiv preprint arXiv:XXXX.XXXXX" - "ArXiv e-prints" - Bare arXiv identifiers "arXiv:XXXX.XXXXX" - Old-style arXiv IDs "arXiv:cs.AI/9901001" - Misc entries containing "arxiv" Fixes inflated 'unknown' assessment statistics by properly categorizing arXiv preprints as a separate class of entries rather than attempting to assess them as publication venues. * refactor: Improve user-facing logging for arXiv preprint filtering Enhanced the logging output to make it clearer when arXiv preprints are being skipped during BibTeX parsing. Changes: - Split logging into separate, clearer messages - Use status_logger.info() for arXiv skipping (visible to users) - Clarify that arXiv entries are "not publication venues" - Remove confusing "problematic entries" wording - Keep technical parsing details in detail_logger.debug() Before: "Successfully parsed 11 entries [...] skipped 80 problematic entries (59 arXiv, 21 other)" After: "Skipped 59 arXiv preprint(s) - not publication venues" This addresses user confusion about why entries with journal fields containing "arXiv preprint arXiv:..." are being excluded from assessment. --------- Co-authored-by: florath-ai-assistant[bot] <Andreas.Florath@telekom.de>
1 parent 6e41a93 commit 6827f2a

File tree

4 files changed

+184
-33
lines changed

4 files changed

+184
-33
lines changed

src/aletheia_probe/batch_assessor.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -75,21 +75,28 @@ async def assess_bibtex_file(
7575

7676
# Parse the BibTeX file to extract journal entries
7777
try:
78-
bibtex_entries = BibtexParser.parse_bibtex_file(file_path, relax_bibtex)
79-
detail_logger.debug(f"Successfully parsed {len(bibtex_entries)} entries")
78+
bibtex_entries, skipped_count, arxiv_count = BibtexParser.parse_bibtex_file(
79+
file_path, relax_bibtex
80+
)
81+
detail_logger.debug(
82+
f"Successfully parsed {len(bibtex_entries)} entries, skipped {skipped_count}, found {arxiv_count} arXiv entries"
83+
)
8084
except Exception as e:
8185
detail_logger.error(f"Failed to parse BibTeX file: {e}")
8286
raise ValueError(f"Failed to parse BibTeX file: {e}") from e
8387

88+
total_entries = len(bibtex_entries) + skipped_count + arxiv_count
8489
status_logger.info(
8590
f"Found {len(bibtex_entries)} entries with journal information"
8691
)
8792

8893
# Prepare result object
8994
result = BibtexAssessmentResult(
9095
file_path=str(file_path),
91-
total_entries=len(bibtex_entries),
96+
total_entries=total_entries,
9297
entries_with_journals=len(bibtex_entries),
98+
arxiv_entries_count=arxiv_count,
99+
skipped_entries_count=skipped_count,
93100
predatory_count=0,
94101
legitimate_count=0,
95102
insufficient_data_count=0,
@@ -260,7 +267,16 @@ def format_summary(result: BibtexAssessmentResult, verbose: bool = False) -> str
260267
summary_lines.append("BibTeX Assessment Summary")
261268
summary_lines.append("=" * 40)
262269
summary_lines.append(f"File: {result.file_path}")
263-
summary_lines.append(f"Total entries processed: {result.total_entries}")
270+
summary_lines.append(f"Total entries in file: {result.total_entries}")
271+
summary_lines.append(f"Entries assessed: {result.entries_with_journals}")
272+
if result.arxiv_entries_count > 0:
273+
summary_lines.append(
274+
f"Skipped arXiv preprints: {result.arxiv_entries_count}"
275+
)
276+
if result.skipped_entries_count > 0:
277+
summary_lines.append(
278+
f"Skipped other entries: {result.skipped_entries_count}"
279+
)
264280
summary_lines.append(f"Processing time: {result.processing_time:.2f}s")
265281
summary_lines.append("")
266282

src/aletheia_probe/bibtex_parser.py

Lines changed: 92 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ class BibtexParser:
2626
@staticmethod
2727
def parse_bibtex_file(
2828
file_path: Path, relax_parsing: bool = False
29-
) -> list[BibtexEntry]:
29+
) -> tuple[list[BibtexEntry], int, int]:
3030
"""Parse a BibTeX file and extract journal entries.
3131
3232
This method tries multiple encoding strategies to maximize the number
@@ -38,7 +38,10 @@ def parse_bibtex_file(
3838
malformed BibTeX files (e.g., duplicate keys, syntax errors)
3939
4040
Returns:
41-
List of BibtexEntry objects with extracted journal information
41+
A tuple containing:
42+
- List of BibtexEntry objects with extracted journal information
43+
- Number of entries skipped (excluding arXiv)
44+
- Number of arXiv entries detected and skipped
4245
4346
Raises:
4447
FileNotFoundError: If the BibTeX file doesn't exist
@@ -89,9 +92,18 @@ def parse_bibtex_file(
8992

9093
entries = []
9194
skipped_entries = 0
95+
arxiv_entries = 0
9296

9397
for entry_key, entry in bib_data.entries.items():
9498
try:
99+
# First, check for arXiv entries to correctly categorize skipped entries
100+
if BibtexParser._is_arxiv_entry(entry):
101+
arxiv_entries += 1
102+
detail_logger.debug(
103+
f"Skipping arXiv entry: {entry_key}"
104+
)
105+
continue
106+
95107
# Extract each entry with individual error handling
96108
processed_entry = BibtexParser._process_entry_safely(
97109
entry_key, entry
@@ -107,18 +119,25 @@ def parse_bibtex_file(
107119
skipped_entries += 1
108120
continue
109121

110-
if skipped_entries > 0:
122+
# Log parsing results with clear messaging
123+
detail_logger.debug(
124+
f"Successfully parsed {len(entries)} entries from {file_path.name} "
125+
f"with {description}"
126+
)
127+
128+
# Inform user about arXiv preprints (if any)
129+
if arxiv_entries > 0:
111130
status_logger.info(
112-
f"Successfully parsed {len(entries)} entries from {file_path.name} "
113-
f"with {description}, skipped {skipped_entries} problematic entries"
131+
f"Skipped {arxiv_entries} arXiv preprint(s) - not publication venues"
114132
)
115-
else:
133+
134+
# Log other skipped entries
135+
if skipped_entries > 0:
116136
detail_logger.debug(
117-
f"Successfully parsed {len(entries)} entries from {file_path.name} "
118-
f"with {description}"
137+
f"Skipped {skipped_entries} other entries due to processing errors"
119138
)
120139

121-
return entries
140+
return entries, skipped_entries, arxiv_entries
122141

123142
except UnicodeDecodeError as e:
124143
last_error = e
@@ -187,6 +206,10 @@ def _process_entry_safely(entry_key: str, entry: Entry) -> BibtexEntry | None:
187206
venue_name = BibtexParser._extract_journal_name(entry)
188207

189208
if not venue_name:
209+
# This can happen if the entry is an arXiv preprint or if it's a non-journal/conference type
210+
detail_logger.debug(
211+
f"Skipping entry '{entry_key}' because no venue name could be extracted."
212+
)
190213
return None
191214

192215
return BibtexEntry(
@@ -472,3 +495,63 @@ def _remove_nested_braces(value: str) -> str:
472495
value = re.sub(r"\{([^{}]*)\}", r"\1", value)
473496

474497
return value.strip()
498+
499+
@staticmethod
def _is_arxiv_entry(entry: Entry) -> bool:
    """Detect whether a BibTeX entry is an arXiv preprint.

    Checks the 'journal', 'booktitle', 'eprint', and 'title' fields for
    common arXiv patterns so that preprints can be skipped instead of
    being assessed as publication venues.

    Args:
        entry: BibTeX entry object.

    Returns:
        True if the entry is identified as an arXiv preprint, False otherwise.
    """
    import re

    # Fields that may reveal an arXiv preprint.  'journal' and 'booktitle'
    # usually hold the (pseudo-)venue name; 'eprint' is a direct indicator;
    # 'title' is checked because poorly formatted entries sometimes embed
    # the identifier there.
    fields_to_check = [
        BibtexParser._get_field_safely(entry, "journal"),
        BibtexParser._get_field_safely(entry, "booktitle"),
        BibtexParser._get_field_safely(entry, "eprint"),
        BibtexParser._get_field_safely(entry, "title"),
    ]

    # Lowercase once so every pattern below can be written in lowercase
    # (no need for re.IGNORECASE on already-normalized content).
    checked_content = " ".join(f.lower() for f in fields_to_check if f is not None)

    # Patterns identifying arXiv entries:
    # - "arXiv preprint arXiv:XXXX.XXXXX"
    # - "ArXiv e-prints"
    # - Bare new-style identifiers "arXiv:YYMM.NNNN(N)".  NOTE: sequence
    #   numbers are 4 digits for IDs assigned before 2015 and 5 digits from
    #   2015 onwards, hence \d{4,5} (a bare \d{5} would miss e.g.
    #   arXiv:1301.3781).
    # - Old-style identifiers such as "arXiv:cs.AI/9901001"
    arxiv_patterns = [
        r"arxiv\s+preprint\s+arxiv:\d{4}\.\d{4,5}(v\d+)?",
        r"arxiv\s+e-prints",
        r"arxiv:\d{4}\.\d{4,5}(v\d+)?",  # bare new-style identifier
        r"arxiv:\w+\.\w+(v\d+)?",  # old-style, e.g. arXiv:cs.AI/9901001
    ]

    for pattern in arxiv_patterns:
        if re.search(pattern, checked_content):
            detail_logger.debug(
                f"Detected arXiv pattern '{pattern}' in entry: {entry.key}"
            )
            return True

    # Direct check of the 'eprint' field value.  (A previous pattern
    # "eprint:\s*arxiv" could never match, because only field *values* —
    # never the field name "eprint:" — are joined into checked_content.)
    # This catches entries exported as eprint = {arXiv:...} or {arXiv}.
    eprint_value = BibtexParser._get_field_safely(entry, "eprint")
    if eprint_value is not None and "arxiv" in eprint_value.lower():
        detail_logger.debug(
            f"Detected arXiv 'eprint' field in entry: {entry.key}"
        )
        return True

    # A @misc entry mentioning "arxiv" anywhere in the checked fields is
    # treated as a preprint (common export shape for arXiv references).
    if entry.type.lower() == "misc":
        if "arxiv" in checked_content:
            detail_logger.debug(f"Detected arXiv in 'misc' type entry: {entry.key}")
            return True

    return False

src/aletheia_probe/models.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,12 @@ class BibtexAssessmentResult(BaseModel):
180180
entries_with_journals: int = Field(
181181
..., description="Number of entries with identifiable journals"
182182
)
183+
arxiv_entries_count: int = Field(
184+
0, description="Number of entries identified as arXiv preprints"
185+
)
186+
skipped_entries_count: int = Field(
187+
0, description="Number of entries skipped for other reasons"
188+
)
183189
assessment_results: list[tuple[BibtexEntry, AssessmentResult]] = Field(
184190
default_factory=list, description="List of (entry, assessment) pairs"
185191
)

0 commit comments

Comments
 (0)