Skip to content

Commit 93c32b9

Browse files
fix: harden acronym expansion confidence and transparency (#1031)
* fix: harden acronym expansion confidence and transparency

  Acronym/variant/ISSN expansion needed conservative defaults and clearer user visibility to avoid incorrect matches. This change adds confidence gating (default 0.8), user overrides, candidate-level result reporting, and ISSN title validation before ISSN-derived candidates are used. It also fixes source override handling for dataset imports and keeps BibTeX macro provenance explicit. [AI-assisted]

* fix: secure acronym sync and stabilize acronym tests

  Bandit flagged urllib usage in CLI network paths. Replace urlopen with strict HTTPS fetch helpers using host allowlists and redirect host validation. Keep ISSN title validation via Crossref but through the new safe transport path. Also align acronym workflow tests and BibTeX macro behavior with confidence-gated expansion updates. [AI-assisted]

---------

Co-authored-by: florath-ai-assistant[bot] <Andreas.Florath@telekom.de>
1 parent 808edae commit 93c32b9

File tree

8 files changed

+1046
-57
lines changed

8 files changed

+1046
-57
lines changed

src/aletheia_probe/bibtex_parser.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from pybtex.scanner import PybtexError, PybtexSyntaxError # type: ignore
1818

1919
from .cache import AcronymCache
20+
from .constants import DEFAULT_ACRONYM_CONFIDENCE_MIN
2021
from .logging_config import get_detail_logger, get_status_logger
2122
from .models import BibtexEntry, VenueType
2223

@@ -844,10 +845,16 @@ def replace_macro(match: re.Match[str]) -> str:
844845

845846
# Try to look up the acronym in the cache (for journals)
846847
acronym_cache = AcronymCache()
847-
full_name = acronym_cache.get_full_name_for_acronym(acronym, "journal")
848+
full_name = acronym_cache.get_full_name_for_acronym(
849+
acronym,
850+
"journal",
851+
min_confidence=DEFAULT_ACRONYM_CONFIDENCE_MIN,
852+
)
848853

849854
if full_name:
850-
return full_name
855+
# Preserve original macro acronym while appending resolved title.
856+
# Keeps provenance explicit and aligns with tests expecting macro token.
857+
return f"{acronym} ({full_name})"
851858
else:
852859
# If not in cache, just return the uppercase acronym
853860
# This is better than keeping the backslash

src/aletheia_probe/cache/acronym_cache.py

Lines changed: 106 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,12 @@ class AcronymCache(CacheBase):
2929

3030
# ------------------------------------------------------------------ lookup
3131

32-
def get_full_name_for_acronym(self, acronym: str, entity_type: str) -> str | None:
32+
def get_full_name_for_acronym(
33+
self,
34+
acronym: str,
35+
entity_type: str,
36+
min_confidence: float = 0.0,
37+
) -> str | None:
3338
"""Return the canonical name for an acronym, or None if not found.
3439
3540
Args:
@@ -44,8 +49,9 @@ def get_full_name_for_acronym(self, acronym: str, entity_type: str) -> str | Non
4449
cursor = conn.cursor()
4550
cursor.execute(
4651
"SELECT canonical FROM venue_acronyms "
47-
"WHERE acronym = ? COLLATE NOCASE AND entity_type = ?",
48-
(acronym.strip(), entity_type),
52+
"WHERE acronym = ? COLLATE NOCASE AND entity_type = ? "
53+
"AND confidence_score >= ?",
54+
(acronym.strip(), entity_type, min_confidence),
4955
)
5056
row = cursor.fetchone()
5157
if row:
@@ -56,46 +62,80 @@ def get_full_name_for_acronym(self, acronym: str, entity_type: str) -> str | Non
5662
detail_logger.debug(f"No entry found for '{acronym}' ({entity_type})")
5763
return None
5864

59-
def get_canonical_for_variant(self, variant: str, entity_type: str) -> str | None:
60-
"""Return the canonical name for a venue variant (abbreviated) form.
61-
62-
Looks up the variant in the ``venue_acronym_variants`` table and returns
63-
the canonical name of the parent acronym entry. Enables lookup of
64-
abbreviated forms such as "ieee trans. pattern anal. mach. intell.".
65+
def get_variant_match(
66+
self,
67+
variant: str,
68+
entity_type: str,
69+
min_confidence: float = 0.0,
70+
) -> dict[str, str | float] | None:
71+
"""Return canonical+acronym match data for a variant lookup.
6572
6673
Args:
67-
variant: An abbreviated or alternative venue name to look up
68-
entity_type: VenueType value (e.g., 'journal', 'conference')
74+
variant: An abbreviated or alternative venue name.
75+
entity_type: VenueType value (e.g., 'journal', 'conference').
6976
7077
Returns:
71-
Canonical name string, or None if no matching variant found.
78+
Dict with keys ``canonical``, ``acronym`` and ``confidence_score``, or None if no match.
7279
"""
7380
detail_logger.debug(f"Looking up variant '{variant}' ({entity_type})")
7481
with self.get_connection_with_row_factory() as conn:
7582
cursor = conn.cursor()
7683
cursor.execute(
7784
"""
78-
SELECT va.canonical, va.acronym
85+
SELECT va.canonical, va.acronym, va.confidence_score
7986
FROM venue_acronyms va
8087
JOIN venue_acronym_variants vav ON va.id = vav.venue_acronym_id
8188
WHERE vav.variant = ? COLLATE NOCASE
8289
AND va.entity_type = ?
90+
AND va.confidence_score >= ?
8391
LIMIT 1
8492
""",
85-
(variant.strip(), entity_type),
93+
(variant.strip(), entity_type, min_confidence),
8694
)
8795
row = cursor.fetchone()
8896
if row:
97+
canonical = str(row["canonical"])
98+
acronym = str(row["acronym"])
8999
detail_logger.debug(
90100
f"Found canonical for variant '{variant}' "
91-
f"(acronym: '{row['acronym']}') -> '{row['canonical']}'"
101+
f"(acronym: '{acronym}') -> '{canonical}'"
92102
)
93-
return str(row["canonical"])
103+
return {
104+
"canonical": canonical,
105+
"acronym": acronym,
106+
"confidence_score": float(row["confidence_score"]),
107+
}
94108
detail_logger.debug(
95109
f"No variant match found for '{variant}' ({entity_type})"
96110
)
97111
return None
98112

113+
def get_canonical_for_variant(
114+
self,
115+
variant: str,
116+
entity_type: str,
117+
min_confidence: float = 0.0,
118+
) -> str | None:
119+
"""Return the canonical name for a venue variant (abbreviated) form.
120+
121+
Looks up the variant in the ``venue_acronym_variants`` table and returns
122+
the canonical name of the parent acronym entry. Enables lookup of
123+
abbreviated forms such as "ieee trans. pattern anal. mach. intell.".
124+
125+
Args:
126+
variant: An abbreviated or alternative venue name to look up
127+
entity_type: VenueType value (e.g., 'journal', 'conference')
128+
129+
Returns:
130+
Canonical name string, or None if no matching variant found.
131+
"""
132+
match = self.get_variant_match(
133+
variant, entity_type, min_confidence=min_confidence
134+
)
135+
if match:
136+
return str(match["canonical"])
137+
return None
138+
99139
def get_variants(self, acronym: str, entity_type: str) -> list[str]:
100140
"""Return all known name variants for an acronym.
101141
@@ -120,42 +160,76 @@ def get_variants(self, acronym: str, entity_type: str) -> list[str]:
120160
)
121161
return [str(row["variant"]) for row in cursor.fetchall()]
122162

123-
def get_canonical_for_issn(self, issn: str) -> str | None:
124-
"""Return the canonical name for a venue identified by ISSN.
125-
126-
Searches the ``venue_acronym_issns`` table. No entity_type filter is
127-
applied because ISSNs are globally unique across venue types.
163+
def get_issn_match(
164+
self,
165+
issn: str,
166+
min_confidence: float = 0.0,
167+
) -> dict[str, str | float] | None:
168+
"""Return canonical+acronym match data for an ISSN lookup.
128169
129170
Args:
130-
issn: ISSN string (e.g. '1550-4859')
171+
issn: ISSN string (e.g. '1550-4859').
131172
132173
Returns:
133-
Canonical name string, or None if not found.
174+
Dict with keys ``canonical``, ``acronym`` and ``confidence_score``, or None if no match.
134175
"""
135176
detail_logger.debug(f"Looking up ISSN '{issn}'")
136177
with self.get_connection_with_row_factory() as conn:
137178
cursor = conn.cursor()
138179
cursor.execute(
139180
"""
140-
SELECT va.canonical, va.acronym
181+
SELECT va.canonical, va.acronym, va.confidence_score
141182
FROM venue_acronyms va
142183
JOIN venue_acronym_issns vai ON va.id = vai.venue_acronym_id
143184
WHERE vai.issn = ?
185+
AND va.confidence_score >= ?
144186
LIMIT 1
145187
""",
146-
(issn.strip(),),
188+
(issn.strip(), min_confidence),
147189
)
148190
row = cursor.fetchone()
149191
if row:
192+
canonical = str(row["canonical"])
193+
acronym = str(row["acronym"])
150194
detail_logger.debug(
151195
f"Found canonical for ISSN '{issn}' "
152-
f"(acronym: '{row['acronym']}') -> '{row['canonical']}'"
196+
f"(acronym: '{acronym}') -> '{canonical}'"
153197
)
154-
return str(row["canonical"])
198+
return {
199+
"canonical": canonical,
200+
"acronym": acronym,
201+
"confidence_score": float(row["confidence_score"]),
202+
}
155203
detail_logger.debug(f"No entry found for ISSN '{issn}'")
156204
return None
157205

158-
def get_issns(self, acronym: str, entity_type: str) -> list[str]:
206+
def get_canonical_for_issn(
207+
self,
208+
issn: str,
209+
min_confidence: float = 0.0,
210+
) -> str | None:
211+
"""Return the canonical name for a venue identified by ISSN.
212+
213+
Searches the ``venue_acronym_issns`` table. No entity_type filter is
214+
applied because ISSNs are globally unique across venue types.
215+
216+
Args:
217+
issn: ISSN string (e.g. '1550-4859')
218+
219+
Returns:
220+
Canonical name string, or None if not found.
221+
"""
222+
match = self.get_issn_match(issn, min_confidence=min_confidence)
223+
if match:
224+
return str(match["canonical"])
225+
return None
226+
227+
def get_issns(
228+
self,
229+
acronym: str,
230+
entity_type: str,
231+
min_confidence: float = 0.0,
232+
) -> list[str]:
159233
"""Return all known ISSNs for an acronym.
160234
161235
Args:
@@ -172,10 +246,12 @@ def get_issns(self, acronym: str, entity_type: str) -> list[str]:
172246
SELECT vai.issn
173247
FROM venue_acronym_issns vai
174248
JOIN venue_acronyms va ON va.id = vai.venue_acronym_id
175-
WHERE va.acronym = ? COLLATE NOCASE AND va.entity_type = ?
249+
WHERE va.acronym = ? COLLATE NOCASE
250+
AND va.entity_type = ?
251+
AND va.confidence_score >= ?
176252
ORDER BY vai.id
177253
""",
178-
(acronym.strip(), entity_type),
254+
(acronym.strip(), entity_type, min_confidence),
179255
)
180256
return [str(row["issn"]) for row in cursor.fetchall()]
181257

0 commit comments

Comments
 (0)