Skip to content

Commit 737ffaf

Browse files
feat: add reproducible acronym collection from BibTeX files (#1002)
Add new command 'aletheia-probe acronym add-bibtex' to pre-collect acronyms from BibTeX files before running assessments. This ensures reproducible results by stabilizing the acronym cache before processing.

Key features:
- Pre-collect acronyms from BibTeX files without running assessments
- Detect and mark ambiguous acronyms (same acronym for different venues)
- Ambiguous acronyms are excluded from automatic matching
- Dry-run mode to preview what would be added
- New command to list all ambiguous acronyms

Database changes:
- Added is_ambiguous column to venue_acronyms table
- Automatic schema migration drops old table if column missing

Implementation:
- Enhanced AcronymCache with conflict detection and ambiguous marking
- Added BibtexParser.extract_acronyms_from_entries() method
- New data models: AcronymMapping, AcronymConflict, AcronymCollectionResult
- CLI commands: acronym add-bibtex, acronym list-ambiguous

This resolves the reproducibility issue where acronyms collected during a run would cause different results based on processing order.

Co-authored-by: florath-ai-assistant[bot] <Andreas.Florath@telekom.de>
1 parent 3887c2b commit 737ffaf

File tree

5 files changed

+434
-4
lines changed

5 files changed

+434
-4
lines changed

src/aletheia_probe/bibtex_parser.py

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1076,3 +1076,89 @@ def _detect_venue_type(entry: Entry, venue_name: str) -> VenueType:
10761076
f"No venue type pattern matched for '{venue_name}' (entry type: {entry.type})"
10771077
)
10781078
return VenueType.UNKNOWN
1079+
1080+
@staticmethod
def extract_acronyms_from_entries(
    entries: list[BibtexEntry], acronym_cache: AcronymCache
) -> tuple[list[tuple[str, str, str, str]], list[tuple[str, str, list[str]]]]:
    """Extract acronym mappings from BibTeX entries.

    Every entry with a venue name is scanned for acronyms. The mappings are
    grouped by ``(acronym, entity_type)``. A group that resolves to exactly
    one normalized venue name is checked against the acronym cache; a group
    that resolves to several distinct normalized names is reported as an
    in-file conflict.

    Venues are de-duplicated by their *normalized* name, so differently
    written variants of one venue that normalize to the same string do not
    produce a spurious conflict.

    Args:
        entries: List of BibtexEntry objects to process
        acronym_cache: AcronymCache instance used for name normalization
            and for conflict checking against stored mappings

    Returns:
        Tuple of (new_mappings, conflicts) where:
        - new_mappings: List of (acronym, venue_name, normalized_name, entity_type)
        - conflicts: List of (acronym, entity_type, [venue_names])
    """
    from .normalizer import input_normalizer

    detail_logger.debug(f"Extracting acronyms from {len(entries)} BibTeX entries")

    # (acronym, entity_type) -> {normalized_name: first full_name seen}.
    # Keying on the normalized name makes the de-duplication insensitive to
    # spelling variants; dict insertion order keeps first-seen ordering.
    mappings: dict[tuple[str, str], dict[str, str]] = {}

    for entry in entries:
        if not entry.journal_name:
            continue

        # Use normalizer to extract acronyms from venue name
        extracted_acronyms = input_normalizer._extract_acronyms(entry.journal_name)
        acronym_mappings = input_normalizer._extract_acronym_mappings_from_text(
            entry.journal_name, extracted_acronyms
        )
        if not acronym_mappings:
            continue

        # Invariant for all acronyms of this entry, so resolve it once.
        entity_type = entry.venue_type.value

        for acronym, full_name in acronym_mappings.items():
            normalized_name = acronym_cache._normalize_venue_name(full_name)
            venues = mappings.setdefault((acronym, entity_type), {})
            # Keep the first full name seen for each normalized venue.
            venues.setdefault(normalized_name, full_name)

    # Detect conflicts and build result lists
    new_mappings: list[tuple[str, str, str, str]] = []
    conflicts: list[tuple[str, str, list[str]]] = []

    for (acronym, entity_type), venues in mappings.items():
        if len(venues) > 1:
            # Multiple distinct venues for same acronym within file
            venue_names = list(venues)
            detail_logger.debug(
                f"File conflict: '{acronym}' maps to multiple venues: {venue_names}"
            )
            conflicts.append((acronym, entity_type, venue_names))
            continue

        # No conflict within file, check against database
        normalized_name, full_name = next(iter(venues.items()))
        has_conflict, existing_name = acronym_cache.check_acronym_conflict(
            acronym, entity_type, normalized_name
        )

        if has_conflict:
            # Conflict with existing database entry
            detail_logger.debug(
                f"Database conflict: '{acronym}' maps to '{normalized_name}' but database has '{existing_name}'"
            )
            conflicts.append(
                (acronym, entity_type, [normalized_name, existing_name or ""])
            )
        else:
            # No conflict, add to new mappings
            new_mappings.append((acronym, full_name, normalized_name, entity_type))

    detail_logger.debug(
        f"Extracted {len(new_mappings)} new acronyms, found {len(conflicts)} conflicts"
    )

    return new_mappings, conflicts

src/aletheia_probe/cache/acronym_cache.py

Lines changed: 151 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,14 +51,24 @@ def get_full_name_for_acronym(self, acronym: str, entity_type: str) -> str | Non
5151

5252
cursor.execute(
5353
"""
54-
SELECT normalized_name FROM venue_acronyms
54+
SELECT normalized_name, is_ambiguous FROM venue_acronyms
5555
WHERE acronym = ? COLLATE NOCASE AND entity_type = ?
5656
""",
5757
(acronym.strip(), entity_type),
5858
)
5959

6060
row = cursor.fetchone()
6161
if row:
62+
# Check if acronym is ambiguous
63+
if row["is_ambiguous"]:
64+
detail_logger.debug(
65+
f"Acronym '{acronym}' is ambiguous (maps to multiple venues), cannot be used for matching"
66+
)
67+
status_logger.warning(
68+
f"Acronym '{acronym}' is ambiguous and cannot be used for automatic matching"
69+
)
70+
return None
71+
6272
detail_logger.debug(
6373
f"Found mapping for '{acronym}' -> '{row['normalized_name']}'"
6474
)
@@ -189,8 +199,8 @@ def _store_mapping(
189199
cursor.execute(
190200
"""
191201
INSERT OR REPLACE INTO venue_acronyms
192-
(acronym, normalized_name, entity_type, source, created_at, last_used_at)
193-
VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)
202+
(acronym, normalized_name, entity_type, source, is_ambiguous, created_at, last_used_at)
203+
VALUES (?, ?, ?, ?, FALSE, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)
194204
""",
195205
(acronym, normalized_name, entity_type, source),
196206
)
@@ -465,3 +475,141 @@ def clear_acronym_database(self, entity_type: str | None = None) -> int:
465475
f"Database clear operation completed, {count} entries deleted"
466476
)
467477
return count
478+
479+
def mark_acronym_as_ambiguous(
    self, acronym: str, entity_type: str, venues: list[str] | None = None
) -> None:
    """Mark an acronym as ambiguous (maps to multiple venues).

    Sets ``is_ambiguous`` on the stored ``venue_acronyms`` row matching the
    acronym (case-insensitively, surrounding whitespace stripped) and the
    entity type. If no such row exists the UPDATE changes nothing; that
    outcome is logged explicitly instead of being reported as a success.

    Args:
        acronym: The acronym to mark as ambiguous
        entity_type: VenueType value (e.g., 'journal', 'conference')
        venues: Optional list of conflicting venue names for logging
    """
    detail_logger.debug(
        f"Marking acronym '{acronym}' (entity_type={entity_type}) as ambiguous"
    )

    if venues:
        status_logger.warning(
            f"Acronym '{acronym}' is ambiguous - maps to multiple venues: {', '.join(venues)}"
        )
    else:
        status_logger.warning(
            f"Acronym '{acronym}' (entity_type={entity_type}) marked as ambiguous"
        )

    with self.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute(
            """
            UPDATE venue_acronyms
            SET is_ambiguous = TRUE
            WHERE acronym = ? COLLATE NOCASE AND entity_type = ?
            """,
            (acronym.strip(), entity_type),
        )
        # rowcount tells us whether a stored mapping was actually flagged.
        updated_rows = cursor.rowcount
        conn.commit()

    if updated_rows > 0:
        detail_logger.debug(f"Marked acronym '{acronym}' as ambiguous in database")
    else:
        detail_logger.debug(
            f"No stored mapping for acronym '{acronym}' (entity_type={entity_type}); "
            "nothing was marked as ambiguous in database"
        )
514+
515+
def check_acronym_conflict(
    self, acronym: str, entity_type: str, normalized_name: str
) -> tuple[bool, str | None]:
    """Report whether storing this acronym would clash with a stored mapping.

    The stored row is looked up by acronym (case-insensitive, whitespace
    stripped) and entity type. A clash exists only when the stored
    normalized name differs from ``normalized_name`` and
    ``are_conference_names_equivalent`` does not consider the two names
    equivalent.

    Args:
        acronym: The acronym to check
        entity_type: VenueType value
        normalized_name: The normalized venue name

    Returns:
        Tuple of (has_conflict, existing_name)
        - has_conflict: True if acronym exists with different normalized_name
        - existing_name: The existing normalized_name if conflict exists, None otherwise
    """
    detail_logger.debug(
        f"Checking for conflict: '{acronym}' (entity_type={entity_type}) -> '{normalized_name}'"
    )

    lookup_args = (acronym.strip(), entity_type)

    with self.get_connection_with_row_factory() as conn:
        cur = conn.cursor()
        cur.execute(
            """
            SELECT normalized_name FROM venue_acronyms
            WHERE acronym = ? COLLATE NOCASE AND entity_type = ?
            """,
            lookup_args,
        )
        stored = cur.fetchone()

        # Nothing stored yet: the acronym is free to use.
        if not stored:
            detail_logger.debug("No existing mapping found")
            return False, None

        existing_name = str(stored["normalized_name"])

        # Exact match: same venue, no conflict.
        if existing_name == normalized_name:
            detail_logger.debug("Acronym already mapped to same venue")
            return False, None

        # Different strings that still denote the same venue (minor variations).
        if are_conference_names_equivalent(existing_name, normalized_name):
            detail_logger.debug(
                f"Names are equivalent variants: '{existing_name}' ≈ '{normalized_name}'"
            )
            return False, None

        detail_logger.debug(
            f"Conflict detected: existing '{existing_name}' != new '{normalized_name}'"
        )
        return True, existing_name
567+
568+
def list_ambiguous_acronyms(
    self, entity_type: str | None = None
) -> list[dict[str, str]]:
    """Return every stored acronym that is flagged as ambiguous.

    Args:
        entity_type: Optional VenueType value to filter by; a falsy value
            lists ambiguous acronyms of all entity types

    Returns:
        List of dictionaries, ordered by acronym, each with the keys
        ``acronym``, ``normalized_name``, ``entity_type``, ``source`` and
        ``created_at``
    """
    detail_logger.debug(
        f"Listing ambiguous acronyms (entity_type={entity_type or 'all'})"
    )

    # Pick the statement and its bind parameters first, then execute once.
    if entity_type:
        query = """
            SELECT acronym, normalized_name, entity_type, source, created_at
            FROM venue_acronyms
            WHERE is_ambiguous = TRUE AND entity_type = ?
            ORDER BY acronym ASC
        """
        params: tuple[str, ...] = (entity_type,)
    else:
        query = """
            SELECT acronym, normalized_name, entity_type, source, created_at
            FROM venue_acronyms
            WHERE is_ambiguous = TRUE
            ORDER BY acronym ASC
        """
        params = ()

    with self.get_connection_with_row_factory() as conn:
        cur = conn.cursor()
        cur.execute(query, params)
        records = cur.fetchall()
        detail_logger.debug(f"Found {len(records)} ambiguous acronym entries")

        columns = ("acronym", "normalized_name", "entity_type", "source", "created_at")
        return [{column: record[column] for column in columns} for record in records]

src/aletheia_probe/cache/schema.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,19 @@ def init_database(db_path: Path) -> None:
2222
name_type_values = ", ".join(f"'{t.value}'" for t in NameType)
2323

2424
with get_configured_connection(db_path) as conn:
25+
# Check if venue_acronyms table exists and has correct schema
26+
cursor = conn.cursor()
27+
cursor.execute(
28+
"SELECT name FROM sqlite_master WHERE type='table' AND name='venue_acronyms'"
29+
)
30+
if cursor.fetchone():
31+
# Table exists, check if it has is_ambiguous column
32+
cursor.execute("PRAGMA table_info(venue_acronyms)")
33+
columns = {row[1] for row in cursor.fetchall()}
34+
if "is_ambiguous" not in columns:
35+
# Old schema, drop and recreate
36+
cursor.execute("DROP TABLE venue_acronyms")
37+
conn.commit()
2538
conn.executescript(
2639
f"""
2740
-- Core journals table (normalized, one entry per unique journal)
@@ -98,6 +111,7 @@ def init_database(db_path: Path) -> None:
98111
normalized_name TEXT NOT NULL,
99112
entity_type TEXT NOT NULL,
100113
source TEXT,
114+
is_ambiguous BOOLEAN DEFAULT FALSE,
101115
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
102116
last_used_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
103117
PRIMARY KEY (acronym, entity_type),

0 commit comments

Comments
 (0)