Skip to content

Commit 2c04a9e

Browse files
feat: Clean LaTeX escape sequences in journal/conference names (fixes #71) (#83)
Add LaTeX escape sequence cleaning to the BibTeX parser so that special characters in venue names are handled properly. This fixes cases where journals like "Computers \& Security" did not match in backend databases because of unprocessed escape sequences.

Changes:
- Add _clean_latex_escapes() method to handle common LaTeX escapes
- Integrate escape cleaning into _remove_nested_braces() workflow
- Handle both single and double backslash patterns
- Add comprehensive unit tests for escape sequence cleaning

Handles escape sequences: \& \' \" \{ \} \$ \% \# \_ \^ \~

[AI-assisted]
Co-authored-by: florath-ai-assistant[bot] <Andreas.Florath@telekom.de>
1 parent 13f94c0 commit 2c04a9e

File tree

2 files changed

+164
-6
lines changed

2 files changed

+164
-6
lines changed

src/aletheia_probe/bibtex_parser.py

Lines changed: 65 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -356,27 +356,86 @@ def _get_field_safely(entry: Entry, field_name: str) -> str | None:
356356
detail_logger.debug(f"Error getting field '{field_name}': {e}")
357357
return None
358358

359+
@staticmethod
def _clean_latex_escapes(value: str) -> str:
    """Convert backslash-escaped LaTeX special characters to plain characters.

    BibTeX field values frequently carry LaTeX escapes such as \\& or \\_.
    Each escape is replaced by the character it stands for so the value can
    be used for database matching and display.

    Characters handled: & ' " { } $ % # _ ^ ~, each preceded by either one
    or two backslashes. The two passes per character run back to back, so a
    run of up to three backslashes directly before such a character is
    removed completely; longer runs lose their last three backslashes.

    Note:
        \\', \\", \\^ and \\~ are treated as plain escapes here, not as
        LaTeX accent commands: "Schr\\"odinger" becomes 'Schr"odinger'.

    Args:
        value: BibTeX field value that may contain LaTeX escape sequences.

    Returns:
        The value with the supported escape sequences replaced by their
        bare characters. Text without escapes is returned unchanged.

    Examples:
        "Computers \\& Security" -> "Computers & Security"
        "Journal of \\"Research\\"" -> 'Journal of "Research"'
        "Test\\_Case" -> "Test_Case"
    """
    # Every pattern is a fixed literal, so plain str.replace is sufficient;
    # no regular-expression machinery is needed.
    for char in "&'\"{}$%#_^~":
        # Doubled-backslash form first: handling the single form first would
        # turn "\\&" into "\&" and leave a stray backslash behind.
        value = value.replace("\\\\" + char, char)
        value = value.replace("\\" + char, char)

    return value
412+
359413
@staticmethod
360414
def _remove_nested_braces(value: str) -> str:
361-
"""Remove nested curly braces from BibTeX field values.
415+
"""Remove nested curly braces and clean LaTeX escapes from BibTeX field values.
362416
363-
BibTeX often uses nested braces like {{IEEE}} or {{{CLOUD}}} for formatting.
364-
This method recursively removes all levels of curly braces.
417+
BibTeX often uses nested braces like {{IEEE}} or {{{CLOUD}}} for formatting,
418+
and LaTeX escape sequences like \\& for special characters.
419+
This method removes all levels of curly braces and converts escape sequences.
365420
366421
Args:
367-
value: BibTeX field value that may contain nested braces
422+
value: BibTeX field value that may contain nested braces and escapes
368423
369424
Returns:
370-
Value with all curly braces removed
425+
Value with all curly braces removed and LaTeX escapes cleaned
371426
372427
Examples:
373428
"{{IEEE}} Conference" -> "IEEE Conference"
374429
"{{{CLOUD}}}" -> "CLOUD"
430+
"Computers \\& Security" -> "Computers & Security"
375431
"Normal text" -> "Normal text"
376432
"""
377433
import re
378434

379-
# Remove nested curly braces iteratively until none remain
435+
# First, clean LaTeX escape sequences
436+
value = BibtexParser._clean_latex_escapes(value)
437+
438+
# Then remove nested curly braces iteratively until none remain
380439
# This handles multiple levels like {{{text}}} -> {{text}} -> {text} -> text
381440
while re.search(r"\{[^{}]*\}", value):
382441
value = re.sub(r"\{([^{}]*)\}", r"\1", value)

tests/unit/test_bibtex_parser.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -817,3 +817,102 @@ def test_brace_removal_edge_cases(self):
817817
# Test braces with special characters
818818
result = BibtexParser._remove_nested_braces("{{IEEE-802.11}} {Conference}")
819819
assert result == "IEEE-802.11 Conference"
820+
821+
def test_clean_latex_escapes(self):
    """Verify that each supported LaTeX escape is reduced to its bare character."""
    # (input, expected) pairs; one row per scenario.
    cases = [
        # Ampersand with a single backslash
        ("Computers \\& Security", "Computers & Security"),
        # Ampersand with a doubled backslash
        ("Computers \\\\& Security", "Computers & Security"),
        # Escaped double quotes
        (r"Journal of \"Research\"", 'Journal of "Research"'),
        # Escaped apostrophe
        (r"Author\'s Work", "Author's Work"),
        # Escaped underscore
        (r"Test\_Case", "Test_Case"),
        # Several different escapes in one string
        (r"A \& B: \"Test\" \% Done", 'A & B: "Test" % Done'),
        # Every supported escape at once
        (
            r"Test\& \' \" \{ \} \$ \% \# \_ \^ \~",
            "Test& ' \" { } $ % # _ ^ ~",
        ),
        # Nothing to clean
        ("Normal text", "Normal text"),
        # Empty input
        ("", ""),
    ]

    for raw, expected in cases:
        assert BibtexParser._clean_latex_escapes(raw) == expected
860+
861+
def test_remove_nested_braces_with_latex_escapes(self):
    """Verify that _remove_nested_braces strips braces and cleans escapes together."""
    # (input, expected) pairs; one row per scenario.
    cases = [
        # A single brace level wrapping an escaped ampersand
        (r"{Computers \& Security}", "Computers & Security"),
        # Doubled braces on both sides of an escaped ampersand
        (r"{{IEEE}} \& {{ACM}}", "IEEE & ACM"),
        # Mixed brace depths combined with escaped quotes and ampersand
        (
            r"{Journal of \"Machine Learning\"} \& {{AI}}",
            'Journal of "Machine Learning" & AI',
        ),
    ]

    for raw, expected in cases:
        assert BibtexParser._remove_nested_braces(raw) == expected
876+
877+
def test_parse_bibtex_file_with_latex_escapes(self, tmp_path):
    """Parse a .bib file whose journal fields contain LaTeX escapes.

    Writes three @article entries to a temporary file, parses it with
    BibtexParser.parse_bibtex_file, and checks that each entry's
    journal_name has the escapes replaced by plain characters.
    """
    # Doubled backslashes below are Python string escapes; the file on disk
    # contains a single backslash before each special character.
    bib_source = """
@article{test_latex_escapes,
title={Test Article with LaTeX Escapes},
journal={Computers \\& Security},
author={Test Author},
year={2023}
}

@article{test_quotes,
title={Article with Quotes},
journal={Journal of \\"Research\\"},
author={Another Author},
year={2024}
}

@article{test_multiple,
title={Multiple Escapes},
journal={Test \\& Review: \\"Quality\\" \\% Assessment},
author={Third Author},
year={2025}
}
"""
    bib_path = tmp_path / "test_latex.bib"
    bib_path.write_text(bib_source, encoding="utf-8")

    parsed = BibtexParser.parse_bibtex_file(bib_path)

    assert len(parsed) == 3

    # Index the parsed entries by citation key for direct lookup.
    journal_by_key = {entry.key: entry.journal_name for entry in parsed}

    # Entry with an escaped ampersand
    assert journal_by_key["test_latex_escapes"] == "Computers & Security"

    # Entry with escaped double quotes
    assert journal_by_key["test_quotes"] == 'Journal of "Research"'

    # Entry mixing several escapes
    assert (
        journal_by_key["test_multiple"]
        == 'Test & Review: "Quality" % Assessment'
    )

0 commit comments

Comments
 (0)