Skip to content

Commit 81405d8

Browse files
feat: Remove brackets from journal names to improve matching (issue #67) (#68)
* feat: Remove brackets from journal names to improve matching (issue #67) This change addresses issue #67 by implementing comprehensive bracket removal for journal and conference names, improving database lookup success rates. Changes: - Enhanced BibTeX parser to handle nested curly braces ({{IEEE}} -> IEEE) - Added bracket removal in normalizer for [], (), and {} content - Implemented acronym preservation (IEEE, ACM, CLOUD, etc. stay uppercase) - Added comprehensive test coverage for both components Examples: - "Advances in Neural Information Processing Systems (NeurIPS)" -> "Advances in Neural Information Processing Systems" - "{{IEEE}} {{International Conference}} on {{Cloud Computing}} ({{CLOUD}})" -> "IEEE International Conference on CLOUD Computing" The implementation uses a defense-in-depth approach with bracket removal at both BibTeX parsing and normalization stages to ensure reliable journal name cleaning for better database matching. * style: Fix code formatting with ruff format Applied ruff formatting to ensure code style consistency. No functional changes. --------- Co-authored-by: florath-ai-assistant[bot] <Andreas.Florath@telekom.de>
1 parent 8dd4409 commit 81405d8

File tree

4 files changed

+364
-3
lines changed

4 files changed

+364
-3
lines changed

src/aletheia_probe/bibtex_parser.py

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -333,8 +333,8 @@ def _get_field_safely(entry: Entry, field_name: str) -> str | None:
333333
# Clean up common BibTeX formatting
334334
if isinstance(value, str):
335335
try:
336-
# Remove curly braces and extra whitespace
337-
cleaned = value.strip("{}").strip()
336+
# Remove nested curly braces (BibTeX formatting)
337+
cleaned = BibtexParser._remove_nested_braces(value)
338338
return cleaned if cleaned else None
339339
except (UnicodeDecodeError, UnicodeEncodeError) as e:
340340
detail_logger.debug(
@@ -355,3 +355,30 @@ def _get_field_safely(entry: Entry, field_name: str) -> str | None:
355355
except Exception as e:
356356
detail_logger.debug(f"Error getting field '{field_name}': {e}")
357357
return None
358+
359+
@staticmethod
360+
def _remove_nested_braces(value: str) -> str:
361+
"""Remove nested curly braces from BibTeX field values.
362+
363+
BibTeX often uses nested braces like {{IEEE}} or {{{CLOUD}}} for formatting.
364+
This method recursively removes all levels of curly braces.
365+
366+
Args:
367+
value: BibTeX field value that may contain nested braces
368+
369+
Returns:
370+
Value with all curly braces removed
371+
372+
Examples:
373+
"{{IEEE}} Conference" -> "IEEE Conference"
374+
"{{{CLOUD}}}" -> "CLOUD"
375+
"Normal text" -> "Normal text"
376+
"""
377+
import re
378+
379+
# Remove nested curly braces iteratively until none remain
380+
# This handles multiple levels like {{{text}}} -> {{text}} -> {text} -> text
381+
while re.search(r"\{[^{}]*\}", value):
382+
value = re.sub(r"\{([^{}]*)\}", r"\1", value)
383+
384+
return value.strip()

src/aletheia_probe/normalizer.py

Lines changed: 98 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,65 @@ def __init__(self) -> None:
1919
(r"\s*&\s*", " & "), # Normalize ampersands
2020
]
2121

22+
# Common acronyms that should remain uppercase
23+
self.acronyms = {
24+
"IEEE",
25+
"ACM",
26+
"SIGCOMM",
27+
"SIGCHI",
28+
"SIGKDD",
29+
"SIGMOD",
30+
"SIGPLAN",
31+
"VLDB",
32+
"ICML",
33+
"NIPS",
34+
"NEURIPS",
35+
"ICLR",
36+
"AAAI",
37+
"IJCAI",
38+
"CIKM",
39+
"WWW",
40+
"KDD",
41+
"ICDM",
42+
"SDM",
43+
"PAKDD",
44+
"ECML",
45+
"PKDD",
46+
"CLOUD",
47+
"NASA",
48+
"NIH",
49+
"NSF",
50+
"DARPA",
51+
"NIST",
52+
"ISO",
53+
"IEC",
54+
"ITU",
55+
"RFC",
56+
"HTTP",
57+
"TCP",
58+
"IP",
59+
"UDP",
60+
"DNS",
61+
"SSL",
62+
"TLS",
63+
"AI",
64+
"ML",
65+
"NLP",
66+
"CV",
67+
"HCI",
68+
"DB",
69+
"OS",
70+
"SE",
71+
"PL",
72+
"UK",
73+
"USA",
74+
"US",
75+
"EU",
76+
"UN",
77+
"WHO",
78+
"NATO",
79+
}
80+
2281
# Common abbreviation expansions
2382
self.abbreviations = {
2483
"J.": "Journal",
@@ -99,12 +158,47 @@ def _clean_text(self, text: str) -> str:
99158
text = self.issn_pattern.sub("", text)
100159
text = self.doi_pattern.sub("", text)
101160

161+
# Remove content within brackets and parentheses that could interfere with matching
162+
text = self._remove_bracketed_content(text)
163+
102164
# Apply cleanup patterns
103165
for pattern, replacement in self.cleanup_patterns:
104166
text = re.sub(pattern, replacement, text)
105167

106168
return text.strip()
107169

170+
def _remove_bracketed_content(self, text: str) -> str:
171+
"""Remove content within brackets and parentheses that could interfere with journal matching.
172+
173+
Examples:
174+
"Journal of Science (ISSN: 1234-5678)" -> "Journal of Science"
175+
"{{IEEE}} Conference on {{Cloud Computing}} ({{CLOUD}})" -> "IEEE Conference on Cloud Computing"
176+
"Advances in Neural Information Processing Systems (NeurIPS)" -> "Advances in Neural Information Processing Systems"
177+
178+
Args:
179+
text: Input text that may contain bracketed content
180+
181+
Returns:
182+
Text with bracketed content removed and whitespace normalized
183+
"""
184+
# Remove nested curly braces (BibTeX formatting) - handle multiple levels
185+
# This handles cases like {{IEEE}} -> IEEE
186+
while re.search(r"\{[^{}]*\}", text):
187+
text = re.sub(r"\{([^{}]*)\}", r"\1", text)
188+
189+
# Remove content within square brackets [...]
190+
# This handles abbreviations and annotations like [2023], [Online]
191+
text = re.sub(r"\[[^\]]*\]", "", text)
192+
193+
# Remove content within parentheses (...)
194+
# This handles journal/conference abbreviations like (NeurIPS), (CLOUD)
195+
text = re.sub(r"\([^)]*\)", "", text)
196+
197+
# Clean up multiple spaces left by bracket removal
198+
text = re.sub(r"\s+", " ", text)
199+
200+
return text.strip()
201+
108202
def _expand_abbreviations(self, text: str) -> str:
109203
"""Expand common journal abbreviations."""
110204
words = text.split()
@@ -126,8 +220,11 @@ def _normalize_case(self, text: str) -> str:
126220
normalized_words = []
127221

128222
for i, word in enumerate(words):
223+
# Check if word is a known acronym (case-insensitive)
224+
if word.upper() in self.acronyms:
225+
normalized_words.append(word.upper())
129226
# Keep certain words lowercase unless at start
130-
if i > 0 and word.lower() in [
227+
elif i > 0 and word.lower() in [
131228
"of",
132229
"and",
133230
"or",

tests/unit/test_bibtex_parser.py

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -708,3 +708,112 @@ def test_conference_name_extraction_priority(self, tmp_path):
708708
assert entry.series == "VLC"
709709
assert entry.booktitle == "2023 Very Long Conference Name with Year and Edition"
710710
assert entry.organization == "IEEE"
711+
712+
def test_nested_brace_removal(self):
    """Check that ``_remove_nested_braces`` unwraps every level of braces."""
    # Imported here so the static helper is exercised directly.
    from aletheia_probe.bibtex_parser import BibtexParser

    # (raw field value, expected cleaned value)
    cases = [
        # Single level braces
        ("{IEEE}", "IEEE"),
        # Double nested braces (common in BibTeX)
        ("{{IEEE}}", "IEEE"),
        # Triple nested braces
        ("{{{CLOUD}}}", "CLOUD"),
        # Mixed content with multiple braced sections
        (
            "{{IEEE}} {{International Conference}} on {{Cloud Computing}}",
            "IEEE International Conference on Cloud Computing",
        ),
        # Text without braces should remain unchanged
        ("Plain text", "Plain text"),
        # Empty braces
        ("{}", ""),
        # Nested empty braces
        ("{{}}", ""),
    ]

    for raw, expected in cases:
        assert BibtexParser._remove_nested_braces(raw) == expected
746+
747+
def test_parse_bibtex_conference_with_nested_braces(self, tmp_path):
    """Parse an @inproceedings entry whose fields carry heavily nested braces."""
    bib_path = tmp_path / "test_nested_braces.bib"
    bib_path.write_text(
        """
        @inproceedings{test2018,
            title = {{{Software}} {{Greenability}}: {{A Case Study}} of {{Cloud-Based Applications}}},
            booktitle = {2018 {{IEEE}} 11th {{International Conference}} on {{Cloud Computing}} ({{CLOUD}})},
            author = {Test Author},
            year = 2018
        }
        """,
        encoding="utf-8",
    )

    parsed = BibtexParser.parse_bibtex_file(bib_path)

    assert len(parsed) == 1
    only_entry = parsed[0]

    # Every brace level must be gone from the title
    assert (
        only_entry.title
        == "Software Greenability: A Case Study of Cloud-Based Applications"
    )

    # The conference name loses its braces; the parenthesised acronym is
    # still expected in the parsed value
    assert (
        only_entry.journal_name
        == "2018 IEEE 11th International Conference on Cloud Computing (CLOUD)"
    )
776+
777+
def test_parse_bibtex_journal_with_nested_braces(self, tmp_path):
    """Parse an @article entry whose title and journal use nested braces."""
    bib_path = tmp_path / "test_journal_nested.bib"
    bib_path.write_text(
        """
        @article{test2023,
            title = {{{Advanced}} {{Machine Learning}} {Techniques}},
            journal = {{{IEEE}} {{Transactions}} on {{Pattern Analysis}}},
            author = {Test Author},
            year = 2023
        }
        """,
        encoding="utf-8",
    )

    parsed = BibtexParser.parse_bibtex_file(bib_path)

    assert len(parsed) == 1
    only_entry = parsed[0]

    # Mixed single- and double-braced words in the title are all unwrapped
    assert only_entry.title == "Advanced Machine Learning Techniques"

    # The journal name comes back without any braces
    assert only_entry.journal_name == "IEEE Transactions on Pattern Analysis"
802+
803+
def test_brace_removal_edge_cases(self):
    """Exercise ``_remove_nested_braces`` on malformed and unusual input."""
    # (raw field value, expected cleaned value)
    edge_cases = [
        # Asymmetric (malformed) braces: an unmatched opener must survive
        ("{incomplete", "{incomplete"),
        # Mixed valid and invalid braces: only the balanced pair is removed
        ("{valid} {incomplete", "valid {incomplete"),
        # Deeply nested braces
        ("{{{{deep}}}}", "deep"),
        # Braces around text with special characters
        ("{{IEEE-802.11}} {Conference}", "IEEE-802.11 Conference"),
    ]

    for raw, expected in edge_cases:
        assert BibtexParser._remove_nested_braces(raw) == expected

0 commit comments

Comments
 (0)