feat: Remove brackets from journal names to improve matching (issue #67) (#68)

coding-ai-assistant[bot] · florath · web-flow · commit 81405d8cc789 · 2025-11-22T10:16:59.000+01:00
* feat: Remove brackets from journal names to improve matching (issue #67) This change addresses issue #67 by implementing comprehensive bracket removal for journal and conference names, improving database lookup success rates. Changes: - Enhanced BibTeX parser to handle nested curly braces ({{IEEE}} -> IEEE) - Added normalizer cleanup that removes [...] and (...) content and strips {} braces while keeping the text inside them - Implemented acronym preservation (IEEE, ACM, CLOUD, etc. are matched case-insensitively and always emitted in uppercase, so "Cloud" becomes "CLOUD") - Added comprehensive test coverage for both components Examples: - "Advances in Neural Information Processing Systems (NeurIPS)" -> "Advances in Neural Information Processing Systems" - "{{IEEE}} {{International Conference}} on {{Cloud Computing}} ({{CLOUD}})" -> "IEEE International Conference on CLOUD Computing" The implementation uses a defense-in-depth approach with bracket removal at both BibTeX parsing and normalization stages to ensure reliable journal name cleaning for better database matching. * style: Fix code formatting with ruff format Applied ruff formatting to ensure code style consistency. No functional changes. --------- Co-authored-by: florath-ai-assistant[bot] <Andreas.Florath@telekom.de>
diff --git a/src/aletheia_probe/bibtex_parser.py b/src/aletheia_probe/bibtex_parser.py
@@ -333,8 +333,8 @@ def _get_field_safely(entry: Entry, field_name: str) -> str | None:
                 # Clean up common BibTeX formatting
                 if isinstance(value, str):
                     try:
-                        # Remove curly braces and extra whitespace
-                        cleaned = value.strip("{}").strip()
+                        # Remove nested curly braces (BibTeX formatting)
+                        cleaned = BibtexParser._remove_nested_braces(value)
                         return cleaned if cleaned else None
                     except (UnicodeDecodeError, UnicodeEncodeError) as e:
                         detail_logger.debug(
@@ -355,3 +355,30 @@ def _get_field_safely(entry: Entry, field_name: str) -> str | None:
         except Exception as e:
             detail_logger.debug(f"Error getting field '{field_name}': {e}")
             return None
+
+    @staticmethod
+    def _remove_nested_braces(value: str) -> str:
+        """Remove nested curly braces from BibTeX field values.
+
+        BibTeX often uses nested braces like {{IEEE}} or {{{CLOUD}}} for formatting.
+        This method iteratively removes all levels of matched curly braces.
+
+        Args:
+            value: BibTeX field value that may contain nested braces
+
+        Returns:
+            Value with all curly braces removed
+
+        Examples:
+            "{{IEEE}} Conference" -> "IEEE Conference"
+            "{{{CLOUD}}}" -> "CLOUD"
+            "Normal text" -> "Normal text"
+        """
+        import re
+
+        # Remove nested curly braces iteratively until none remain
+        # This handles multiple levels like {{{text}}} -> {{text}} -> {text} -> text
+        while re.search(r"\{[^{}]*\}", value):
+            value = re.sub(r"\{([^{}]*)\}", r"\1", value)
+
+        return value.strip()
diff --git a/src/aletheia_probe/normalizer.py b/src/aletheia_probe/normalizer.py
@@ -19,6 +19,65 @@ def __init__(self) -> None:
             (r"\s*&\s*", " & "),  # Normalize ampersands
         ]
 
+        # Common acronyms that should remain uppercase
+        self.acronyms = {
+            "IEEE",
+            "ACM",
+            "SIGCOMM",
+            "SIGCHI",
+            "SIGKDD",
+            "SIGMOD",
+            "SIGPLAN",
+            "VLDB",
+            "ICML",
+            "NIPS",
+            "NEURIPS",
+            "ICLR",
+            "AAAI",
+            "IJCAI",
+            "CIKM",
+            "WWW",
+            "KDD",
+            "ICDM",
+            "SDM",
+            "PAKDD",
+            "ECML",
+            "PKDD",
+            "CLOUD",
+            "NASA",
+            "NIH",
+            "NSF",
+            "DARPA",
+            "NIST",
+            "ISO",
+            "IEC",
+            "ITU",
+            "RFC",
+            "HTTP",
+            "TCP",
+            "IP",
+            "UDP",
+            "DNS",
+            "SSL",
+            "TLS",
+            "AI",
+            "ML",
+            "NLP",
+            "CV",
+            "HCI",
+            "DB",
+            "OS",
+            "SE",
+            "PL",
+            "UK",
+            "USA",
+            "US",
+            "EU",
+            "UN",
+            "WHO",
+            "NATO",
+        }
+
         # Common abbreviation expansions
         self.abbreviations = {
             "J.": "Journal",
@@ -99,12 +158,47 @@ def _clean_text(self, text: str) -> str:
         text = self.issn_pattern.sub("", text)
         text = self.doi_pattern.sub("", text)
 
+        # Remove content within brackets and parentheses that could interfere with matching
+        text = self._remove_bracketed_content(text)
+
         # Apply cleanup patterns
         for pattern, replacement in self.cleanup_patterns:
             text = re.sub(pattern, replacement, text)
 
         return text.strip()
 
+    def _remove_bracketed_content(self, text: str) -> str:
+        """Strip curly braces and remove [...] and (...) content that could hinder journal matching.
+
+        Examples:
+            "Journal of Science (ISSN: 1234-5678)" -> "Journal of Science"
+            "{{IEEE}} Conference on {{Cloud Computing}} ({{CLOUD}})" -> "IEEE Conference on Cloud Computing"
+            "Advances in Neural Information Processing Systems (NeurIPS)" -> "Advances in Neural Information Processing Systems"
+
+        Args:
+            text: Input text that may contain bracketed content
+
+        Returns:
+            Text with curly braces stripped, [...] and (...) content removed, and whitespace normalized
+        """
+        # Remove nested curly braces (BibTeX formatting) - handle multiple levels
+        # This handles cases like {{IEEE}} -> IEEE
+        while re.search(r"\{[^{}]*\}", text):
+            text = re.sub(r"\{([^{}]*)\}", r"\1", text)
+
+        # Remove content within square brackets [...]
+        # This handles abbreviations and annotations like [2023], [Online]
+        text = re.sub(r"\[[^\]]*\]", "", text)
+
+        # Remove content within parentheses (...)
+        # This handles journal/conference abbreviations like (NeurIPS), (CLOUD)
+        text = re.sub(r"\([^)]*\)", "", text)
+
+        # Clean up multiple spaces left by bracket removal
+        text = re.sub(r"\s+", " ", text)
+
+        return text.strip()
+
     def _expand_abbreviations(self, text: str) -> str:
         """Expand common journal abbreviations."""
         words = text.split()
@@ -126,8 +220,11 @@ def _normalize_case(self, text: str) -> str:
         normalized_words = []
 
         for i, word in enumerate(words):
+            # Check if word is a known acronym (case-insensitive)
+            if word.upper() in self.acronyms:
+                normalized_words.append(word.upper())
             # Keep certain words lowercase unless at start
-            if i > 0 and word.lower() in [
+            elif i > 0 and word.lower() in [
                 "of",
                 "and",
                 "or",
diff --git a/tests/unit/test_bibtex_parser.py b/tests/unit/test_bibtex_parser.py
@@ -708,3 +708,112 @@ def test_conference_name_extraction_priority(self, tmp_path):
         assert entry.series == "VLC"
         assert entry.booktitle == "2023 Very Long Conference Name with Year and Edition"
         assert entry.organization == "IEEE"
+
+    def test_nested_brace_removal(self):
+        """Test removal of nested curly braces in BibTeX fields."""
+        # Test the static method directly
+        from aletheia_probe.bibtex_parser import BibtexParser
+
+        # Single level braces
+        result = BibtexParser._remove_nested_braces("{IEEE}")
+        assert result == "IEEE"
+
+        # Double nested braces (common in BibTeX)
+        result = BibtexParser._remove_nested_braces("{{IEEE}}")
+        assert result == "IEEE"
+
+        # Triple nested braces
+        result = BibtexParser._remove_nested_braces("{{{CLOUD}}}")
+        assert result == "CLOUD"
+
+        # Mixed content with multiple braced sections
+        result = BibtexParser._remove_nested_braces(
+            "{{IEEE}} {{International Conference}} on {{Cloud Computing}}"
+        )
+        assert result == "IEEE International Conference on Cloud Computing"
+
+        # Text without braces should remain unchanged
+        result = BibtexParser._remove_nested_braces("Plain text")
+        assert result == "Plain text"
+
+        # Empty braces
+        result = BibtexParser._remove_nested_braces("{}")
+        assert result == ""
+
+        # Nested empty braces
+        result = BibtexParser._remove_nested_braces("{{}}")
+        assert result == ""
+
+    def test_parse_bibtex_conference_with_nested_braces(self, tmp_path):
+        """Test parsing BibTeX entries with heavily nested braces."""
+        bibtex_content = """
+@inproceedings{test2018,
+  title = {{{Software}} {{Greenability}}: {{A Case Study}} of {{Cloud-Based Applications}}},
+  booktitle = {2018 {{IEEE}} 11th {{International Conference}} on {{Cloud Computing}} ({{CLOUD}})},
+  author = {Test Author},
+  year = 2018
+}
+"""
+        test_file = tmp_path / "test_nested_braces.bib"
+        test_file.write_text(bibtex_content, encoding="utf-8")
+
+        entries = BibtexParser.parse_bibtex_file(test_file)
+
+        assert len(entries) == 1
+        entry = entries[0]
+
+        # Title should have nested braces removed
+        expected_title = (
+            "Software Greenability: A Case Study of Cloud-Based Applications"
+        )
+        assert entry.title == expected_title
+
+        # Conference name should have nested braces removed
+        expected_conference = (
+            "2018 IEEE 11th International Conference on Cloud Computing (CLOUD)"
+        )
+        assert entry.journal_name == expected_conference
+
+    def test_parse_bibtex_journal_with_nested_braces(self, tmp_path):
+        """Test parsing journal entries with nested braces."""
+        bibtex_content = """
+@article{test2023,
+  title = {{{Advanced}} {{Machine Learning}} {Techniques}},
+  journal = {{{IEEE}} {{Transactions}} on {{Pattern Analysis}}},
+  author = {Test Author},
+  year = 2023
+}
+"""
+        test_file = tmp_path / "test_journal_nested.bib"
+        test_file.write_text(bibtex_content, encoding="utf-8")
+
+        entries = BibtexParser.parse_bibtex_file(test_file)
+
+        assert len(entries) == 1
+        entry = entries[0]
+
+        # Title should have all braces removed
+        expected_title = "Advanced Machine Learning Techniques"
+        assert entry.title == expected_title
+
+        # Journal should have nested braces removed
+        expected_journal = "IEEE Transactions on Pattern Analysis"
+        assert entry.journal_name == expected_journal
+
+    def test_brace_removal_edge_cases(self):
+        """Test edge cases for nested brace removal."""
+        # Test asymmetric braces (malformed)
+        result = BibtexParser._remove_nested_braces("{incomplete")
+        assert result == "{incomplete"  # Should not remove incomplete braces
+
+        # Test mixed valid and invalid braces
+        result = BibtexParser._remove_nested_braces("{valid} {incomplete")
+        assert result == "valid {incomplete"
+
+        # Test deeply nested braces
+        result = BibtexParser._remove_nested_braces("{{{{deep}}}}")
+        assert result == "deep"
+
+        # Test braces with special characters
+        result = BibtexParser._remove_nested_braces("{{IEEE-802.11}} {Conference}")
+        assert result == "IEEE-802.11 Conference"
diff --git a/tests/unit/test_normalizer.py b/tests/unit/test_normalizer.py