Skip to content

Commit d011abe

Browse files
feat: Implement conference name normalization to reduce 'unknown' assessments (fixes #69) (#85)
This change adds normalization of conference names during BibTeX parsing to reduce variations caused by different citation styles, particularly the "Proceedings of" and "Proceedings of the" prefixes that appear inconsistently across BibTeX entries.

Changes:
- Add _normalize_conference_name() method to BibtexParser
- Apply normalization to all conference name extraction paths
- Remove "Proceedings of the" and "Proceedings of" prefixes (case-insensitive)
- Normalize whitespace in conference names

Benefits:
- Reduces duplicate queries for the same conference with different formatting
- Improves cache hit rate for case-insensitive conference name matching
- Works with existing InputNormalizer infrastructure for additional normalization

Testing:
- Added test_conference_name_normalization() to verify BibTeX parsing
- Added test_normalize_conference_name_method() to verify normalization logic
- All 272 tests pass with 57% coverage

[AI-assisted]

Co-authored-by: florath-ai-assistant[bot] <Andreas.Florath@telekom.de>
1 parent 8d113e6 commit d011abe

File tree

2 files changed

+131
-5
lines changed

2 files changed

+131
-5
lines changed

src/aletheia_probe/bibtex_parser.py

Lines changed: 35 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -249,7 +249,7 @@ def _extract_conference_name(entry: Entry) -> str | None:
249249
"""Extract conference name from a BibTeX @inproceedings entry.
250250
251251
Prioritizes series name (normalized) over full booktitle.
252-
Attempts to extract conference series by removing years and ordinals.
252+
Normalizes conference names to improve matching across variations.
253253
254254
Args:
255255
entry: BibTeX entry
@@ -263,20 +263,51 @@ def _extract_conference_name(entry: Entry) -> str | None:
263263
# Remove common artifacts (quotes, extra spaces)
264264
series = series.strip("'\"").strip()
265265
if series:
266-
return series
266+
return BibtexParser._normalize_conference_name(series)
267267

268268
# Priority 2: Try to extract from booktitle
269269
booktitle = BibtexParser._get_field_safely(entry, "booktitle")
270270
if booktitle:
271-
return booktitle
271+
return BibtexParser._normalize_conference_name(booktitle)
272272

273273
# Priority 3: Fallback to organization
274274
organization = BibtexParser._get_field_safely(entry, "organization")
275275
if organization:
276-
return organization
276+
return BibtexParser._normalize_conference_name(organization)
277277

278278
return None
279279

280+
@staticmethod
281+
def _normalize_conference_name(name: str) -> str:
282+
"""Normalize conference names to improve matching across variations.
283+
284+
Removes common prefixes like "Proceedings of" and "Proceedings of the"
285+
to reduce variation between different citation styles.
286+
287+
Args:
288+
name: Raw conference name
289+
290+
Returns:
291+
Normalized conference name
292+
293+
Examples:
294+
"Proceedings of the IEEE conference on computer vision" ->
295+
"IEEE conference on computer vision"
296+
"Proceedings of Semantic Web" -> "Semantic Web"
297+
"""
298+
import re
299+
300+
# Remove "Proceedings of the" prefix (case-insensitive)
301+
name = re.sub(r"^proceedings\s+of\s+the\s+", "", name, flags=re.IGNORECASE)
302+
303+
# Remove "Proceedings of" prefix (case-insensitive)
304+
name = re.sub(r"^proceedings\s+of\s+", "", name, flags=re.IGNORECASE)
305+
306+
# Clean up any extra whitespace
307+
name = re.sub(r"\s+", " ", name).strip()
308+
309+
return name
310+
280311
@staticmethod
281312
def _extract_authors_safely(entry: Entry) -> str | None:
282313
"""Extract author information from a BibTeX entry with error handling.

tests/unit/test_bibtex_parser.py

Lines changed: 96 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -768,7 +768,8 @@ def test_parse_bibtex_conference_with_nested_braces(self, tmp_path):
768768
)
769769
assert entry.title == expected_title
770770

771-
# Conference name should have nested braces removed
771+
# Conference name should have nested braces removed and no proceedings prefix
772+
# Note: The booktitle doesn't have "Proceedings of" so it stays as is
772773
expected_conference = (
773774
"2018 IEEE 11th International Conference on Cloud Computing (CLOUD)"
774775
)
@@ -916,3 +917,97 @@ def test_parse_bibtex_file_with_latex_escapes(self, tmp_path):
916917
# Check third entry with multiple escapes
917918
entry3 = [e for e in entries if e.key == "test_multiple"][0]
918919
assert entry3.journal_name == 'Test & Review: "Quality" % Assessment'
920+
921+
def test_conference_name_normalization(self, tmp_path):
    """Entries citing the same venue with and without a "Proceedings of"
    prefix must end up with the same conference name after parsing."""
    bib_source = """
@inproceedings{cvpr_proceedings,
title={Test Paper 1},
booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
author={Author One},
year={2023}
}

@inproceedings{cvpr_short,
title={Test Paper 2},
booktitle={IEEE Conference on Computer Vision and Pattern Recognition},
author={Author Two},
year={2023}
}

@inproceedings{neurips_proceedings,
title={Test Paper 3},
booktitle={Proceedings of Advances in Neural Information Processing Systems},
author={Author Three},
year={2023}
}

@inproceedings{neurips_normal,
title={Test Paper 4},
booktitle={Advances in Neural Information Processing Systems},
author={Author Four},
year={2023}
}
"""
    bib_path = tmp_path / "test_conference_normalization.bib"
    bib_path.write_text(bib_source, encoding="utf-8")

    parsed = BibtexParser.parse_bibtex_file(bib_path)

    assert len(parsed) == 4

    def entry_with_key(key):
        # Indexing [0] keeps the IndexError on a missing key.
        return [item for item in parsed if item.key == key][0]

    # CVPR: the "Proceedings of the" variant and the bare variant must agree.
    with_prefix = entry_with_key("cvpr_proceedings")
    without_prefix = entry_with_key("cvpr_short")
    cvpr_name = "IEEE Conference on Computer Vision and Pattern Recognition"
    assert with_prefix.journal_name == cvpr_name
    assert without_prefix.journal_name == cvpr_name

    # NeurIPS: the "Proceedings of" variant and the bare variant must agree.
    with_prefix = entry_with_key("neurips_proceedings")
    without_prefix = entry_with_key("neurips_normal")
    neurips_name = "Advances in Neural Information Processing Systems"
    assert with_prefix.journal_name == neurips_name
    assert without_prefix.journal_name == neurips_name
982+
983+
def test_normalize_conference_name_method(self):
    """Test the _normalize_conference_name static method directly.

    Covers prefix removal ("Proceedings of the" / "Proceedings of"),
    case-insensitive matching, pass-through of names without a prefix,
    and collapsing of irregular internal whitespace.
    """
    # Test "Proceedings of the" removal
    result = BibtexParser._normalize_conference_name(
        "Proceedings of the IEEE Conference on Computer Vision"
    )
    assert result == "IEEE Conference on Computer Vision"

    # Test "Proceedings of" removal
    result = BibtexParser._normalize_conference_name(
        "Proceedings of Advances in Neural Information Processing Systems"
    )
    assert result == "Advances in Neural Information Processing Systems"

    # Test case-insensitive matching
    result = BibtexParser._normalize_conference_name(
        "PROCEEDINGS OF THE International Conference on Machine Learning"
    )
    assert result == "International Conference on Machine Learning"

    # Test name without "Proceedings of" prefix remains unchanged
    result = BibtexParser._normalize_conference_name(
        "IEEE Conference on Computer Vision and Pattern Recognition"
    )
    assert result == "IEEE Conference on Computer Vision and Pattern Recognition"

    # Test whitespace normalization: the input must contain irregular
    # whitespace (multiple spaces, a tab, trailing space), otherwise this
    # case only repeats the prefix-removal check above.
    result = BibtexParser._normalize_conference_name(
        "Proceedings  of   the    Conference \t  Name  "
    )
    assert result == "Conference Name"

0 commit comments

Comments
 (0)