sustainet-guardian
diff --git a/src/aletheia_probe/bibtex_parser.py b/src/aletheia_probe/bibtex_parser.py
Lines changed: 86 additions & 0 deletions
diff --git a/src/aletheia_probe/cache/acronym_cache.py b/src/aletheia_probe/cache/acronym_cache.py
Lines changed: 151 additions & 3 deletions
diff --git a/src/aletheia_probe/cache/schema.py b/src/aletheia_probe/cache/schema.py
Lines changed: 14 additions & 0 deletions
@@ -1076,3 +1076,89 @@ def _detect_venue_type(entry: Entry, venue_name: str) -> VenueType:
             f"No venue type pattern matched for '{venue_name}' (entry type: {entry.type})"
         )
         return VenueType.UNKNOWN
+
+    @staticmethod
+    def extract_acronyms_from_entries(
+        entries: list[BibtexEntry], acronym_cache: AcronymCache
+    ) -> tuple[list[tuple[str, str, str, str]], list[tuple[str, str, list[str]]]]:
+        """Extract acronym mappings from BibTeX entries.
+
+        Venues are grouped per (acronym, entity_type) and de-duplicated on their
+        normalized name, so spelling variants that normalize to the same string
+        count as one venue rather than as a conflict. This mirrors
+        ``check_acronym_conflict``, which treats equal normalized names as
+        non-conflicting.
+
+        Args:
+            entries: List of BibtexEntry objects to process
+            acronym_cache: AcronymCache instance for conflict checking
+
+        Returns:
+            Tuple of (new_mappings, conflicts) where:
+            - new_mappings: List of (acronym, venue_name, normalized_name, entity_type)
+            - conflicts: List of (acronym, entity_type, [venue_names]); for a
+              database conflict the list is [new_name, existing_name]
+        """
+        from .normalizer import input_normalizer
+
+        detail_logger.debug(f"Extracting acronyms from {len(entries)} BibTeX entries")
+
+        # (acronym, entity_type) -> {normalized_name: first full name seen}.
+        # Dicts keep insertion order, so venues stay in file order.
+        mappings: dict[tuple[str, str], dict[str, str]] = {}
+
+        for entry in entries:
+            if not entry.journal_name:
+                continue
+
+            # Use normalizer to extract acronyms from venue name
+            extracted_acronyms = input_normalizer._extract_acronyms(entry.journal_name)
+            acronym_mappings = input_normalizer._extract_acronym_mappings_from_text(
+                entry.journal_name, extracted_acronyms
+            )
+
+            # Store each acronym mapping, keyed by its normalized venue name
+            for acronym, full_name in acronym_mappings.items():
+                normalized_name = acronym_cache._normalize_venue_name(full_name)
+                key = (acronym, entry.venue_type.value)
+                venues = mappings.setdefault(key, {})
+                # Keep the first spelling; later variants of the same venue are dropped
+                venues.setdefault(normalized_name, full_name)
+
+        # Detect conflicts and build result lists
+        new_mappings: list[tuple[str, str, str, str]] = []
+        conflicts: list[tuple[str, str, list[str]]] = []
+
+        for (acronym, entity_type), venues in mappings.items():
+            if len(venues) > 1:
+                # Multiple distinct venues for same acronym within file
+                venue_names = list(venues)
+                detail_logger.debug(
+                    f"File conflict: '{acronym}' maps to multiple venues: {venue_names}"
+                )
+                conflicts.append((acronym, entity_type, venue_names))
+                continue
+
+            # No conflict within file, check against database
+            normalized_name, full_name = next(iter(venues.items()))
+            has_conflict, existing_name = acronym_cache.check_acronym_conflict(
+                acronym, entity_type, normalized_name
+            )
+
+            if has_conflict:
+                # Conflict with existing database entry
+                detail_logger.debug(
+                    f"Database conflict: '{acronym}' maps to '{normalized_name}' but database has '{existing_name}'"
+                )
+                conflicts.append(
+                    (acronym, entity_type, [normalized_name, existing_name or ""])
+                )
+            else:
+                # No conflict, add to new mappings
+                new_mappings.append((acronym, full_name, normalized_name, entity_type))
+
+        detail_logger.debug(
+            f"Extracted {len(new_mappings)} new acronyms, found {len(conflicts)} conflicts"
+        )
+
+        return new_mappings, conflicts
@@ -51,14 +51,24 @@ def get_full_name_for_acronym(self, acronym: str, entity_type: str) -> str | Non
 
             cursor.execute(
                 """
-                SELECT normalized_name FROM venue_acronyms
+                SELECT normalized_name, is_ambiguous FROM venue_acronyms
                 WHERE acronym = ? COLLATE NOCASE AND entity_type = ?
                 """,
                 (acronym.strip(), entity_type),
             )
 
             row = cursor.fetchone()
             if row:
+                # Check if acronym is ambiguous
+                if row["is_ambiguous"]:
+                    detail_logger.debug(
+                        f"Acronym '{acronym}' is ambiguous (maps to multiple venues), cannot be used for matching"
+                    )
+                    status_logger.warning(
+                        f"Acronym '{acronym}' is ambiguous and cannot be used for automatic matching"
+                    )
+                    return None
+
                 detail_logger.debug(
                     f"Found mapping for '{acronym}' -> '{row['normalized_name']}'"
                 )
@@ -189,8 +199,8 @@ def _store_mapping(
         cursor.execute(
             """
             INSERT OR REPLACE INTO venue_acronyms
-            (acronym, normalized_name, entity_type, source, created_at, last_used_at)
-            VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)
+            (acronym, normalized_name, entity_type, source, is_ambiguous, created_at, last_used_at)
+            VALUES (?, ?, ?, ?, FALSE, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)
             """,
             (acronym, normalized_name, entity_type, source),
         )
@@ -465,3 +475,141 @@ def clear_acronym_database(self, entity_type: str | None = None) -> int:
                 f"Database clear operation completed, {count} entries deleted"
             )
             return count
+
+    def mark_acronym_as_ambiguous(
+        self, acronym: str, entity_type: str, venues: list[str] | None = None
+    ) -> None:
+        """Mark an acronym's stored mapping as ambiguous (maps to multiple venues).
+
+        Args:
+            acronym: The acronym to mark; only an already stored row is updated
+            entity_type: VenueType value (e.g., 'journal', 'conference')
+            venues: Optional list of conflicting venue names for logging
+        """
+        detail_logger.debug(
+            f"Marking acronym '{acronym}' (entity_type={entity_type}) as ambiguous"
+        )
+        status_logger.warning(
+            f"Acronym '{acronym}' is ambiguous - maps to multiple venues: {', '.join(venues)}"
+            if venues
+            else f"Acronym '{acronym}' (entity_type={entity_type}) marked as ambiguous"
+        )
+
+        with self.get_connection() as conn:
+            cursor = conn.cursor()
+            cursor.execute(
+                """
+                UPDATE venue_acronyms
+                SET is_ambiguous = TRUE
+                WHERE acronym = ? COLLATE NOCASE AND entity_type = ?
+                """,
+                (acronym.strip(), entity_type),
+            )
+            conn.commit()
+            # The UPDATE inserts nothing, so only report success if a row matched
+            if cursor.rowcount:
+                detail_logger.debug(f"Marked acronym '{acronym}' as ambiguous in database")
+            else:
+                detail_logger.debug(f"No stored mapping for '{acronym}' to mark as ambiguous")
+
+    def check_acronym_conflict(
+        self, acronym: str, entity_type: str, normalized_name: str
+    ) -> tuple[bool, str | None]:
+        """Report whether a new acronym mapping clashes with a stored one.
+
+        Args:
+            acronym: The acronym to look up (matched case-insensitively)
+            entity_type: VenueType value
+            normalized_name: The normalized venue name that would be stored
+
+        Returns:
+            Tuple of (has_conflict, existing_name)
+            - has_conflict: True only if a stored mapping exists whose name is
+              neither equal nor equivalent to normalized_name
+            - existing_name: The stored normalized_name on conflict, else None
+        """
+        detail_logger.debug(
+            f"Checking for conflict: '{acronym}' (entity_type={entity_type}) -> '{normalized_name}'"
+        )
+
+        with self.get_connection_with_row_factory() as conn:
+            cursor = conn.cursor()
+            cursor.execute(
+                """
+                SELECT normalized_name FROM venue_acronyms
+                WHERE acronym = ? COLLATE NOCASE AND entity_type = ?
+                """,
+                (acronym.strip(), entity_type),
+            )
+            stored = cursor.fetchone()
+
+            # Guard clauses: each non-conflicting outcome returns early
+            if not stored:
+                detail_logger.debug("No existing mapping found")
+                return False, None
+
+            existing_name = str(stored["normalized_name"])
+            if existing_name == normalized_name:
+                detail_logger.debug("Acronym already mapped to same venue")
+                return False, None
+
+            # Names differ textually; let the helper decide if they are equivalent
+            if are_conference_names_equivalent(existing_name, normalized_name):
+                detail_logger.debug(
+                    f"Names are equivalent variants: '{existing_name}' ≈ '{normalized_name}'"
+                )
+                return False, None
+
+            detail_logger.debug(
+                f"Conflict detected: existing '{existing_name}' != new '{normalized_name}'"
+            )
+            return True, existing_name
+
+    def list_ambiguous_acronyms(
+        self, entity_type: str | None = None
+    ) -> list[dict[str, str]]:
+        """Return every stored acronym row flagged as ambiguous.
+
+        Args:
+            entity_type: Optional VenueType value; when given, only rows of
+                that type are returned
+
+        Returns:
+            One dict per row, ordered by acronym, with the keys acronym,
+            normalized_name, entity_type, source and created_at
+        """
+        detail_logger.debug(
+            f"Listing ambiguous acronyms (entity_type={entity_type or 'all'})"
+        )
+        columns = ("acronym", "normalized_name", "entity_type", "source", "created_at")
+
+        with self.get_connection_with_row_factory() as conn:
+            cursor = conn.cursor()
+
+            if not entity_type:
+                # Unfiltered listing across all entity types
+                query = """
+                    SELECT acronym, normalized_name, entity_type, source, created_at
+                    FROM venue_acronyms
+                    WHERE is_ambiguous = TRUE
+                    ORDER BY acronym ASC
+                """
+                cursor.execute(query)
+            else:
+                # Restrict the listing to the requested entity type
+                query = """
+                    SELECT acronym, normalized_name, entity_type, source, created_at
+                    FROM venue_acronyms
+                    WHERE is_ambiguous = TRUE AND entity_type = ?
+                    ORDER BY acronym ASC
+                """
+                cursor.execute(query, (entity_type,))
+
+            matches = cursor.fetchall()
+            detail_logger.debug(f"Found {len(matches)} ambiguous acronym entries")
+
+            # Copy each row into a plain dict holding the selected columns
+            return [
+                {column: match[column] for column in columns}
+                for match in matches
+            ]
@@ -22,6 +22,19 @@ def init_database(db_path: Path) -> None:
     name_type_values = ", ".join(f"'{t.value}'" for t in NameType)
 
     with get_configured_connection(db_path) as conn:
+        # Check if venue_acronyms table exists and has correct schema
+        cursor = conn.cursor()
+        cursor.execute(
+            "SELECT name FROM sqlite_master WHERE type='table' AND name='venue_acronyms'"
+        )
+        if cursor.fetchone():
+            # Table exists, check if it has is_ambiguous column
+            cursor.execute("PRAGMA table_info(venue_acronyms)")
+            columns = {row[1] for row in cursor.fetchall()}
+            if "is_ambiguous" not in columns:
+                # Old schema, drop and recreate
+                cursor.execute("DROP TABLE venue_acronyms")
+                conn.commit()
         conn.executescript(
             f"""
             -- Core journals table (normalized, one entry per unique journal)
@@ -98,6 +111,7 @@ def init_database(db_path: Path) -> None:
                 normalized_name TEXT NOT NULL,
                 entity_type TEXT NOT NULL,
                 source TEXT,
+                is_ambiguous BOOLEAN DEFAULT FALSE,
                 created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                 last_used_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                 PRIMARY KEY (acronym, entity_type),