Skip to content

Commit 3e41559

Browse files
feat: Implement self-learning conference acronym recognition (fixes #70) (#86)
Add a self-learning acronym cache that builds conference/journal acronym mappings over time, based on the user's domain-specific usage.

Database:
- Add a conference_acronyms table to cache.db with acronym, full_name, source

Acronym Storage:
- Extract acronyms from parenthetical references in BibTeX entries
- Extract acronyms from OpenAlex display_name responses
- Warn when overwriting an existing acronym mapping

Acronym Expansion:
- Detect standalone acronyms in input (2-10 chars, mostly uppercase)
- Look up the expansion in the cache and add it as an alias for searching
- Fall back to the expanded name if the initial query yields no results

User Feedback:
- CLI shows a 'Note: Expanded acronym' message
- JSON output includes an acronym_expansion_used field

Users querying ICML will get legitimate results after the system has seen the full name together with its acronym in any BibTeX file or OpenAlex response.

Co-authored-by: florath-ai-assistant[bot] <Andreas.Florath@telekom.de>
1 parent d011abe commit 3e41559

File tree

6 files changed

+330
-0
lines changed

6 files changed

+330
-0
lines changed

src/aletheia_probe/backends/openalex_analyzer.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,9 @@ async def _query_api(self, query_input: QueryInput) -> BackendResult:
9494
response_time=response_time,
9595
)
9696

97+
# Store acronym mapping if display_name contains acronym in parentheses
98+
self._store_acronym_from_openalex(openalex_data)
99+
97100
# Route to appropriate assessment based on publication type
98101
source_type = openalex_data.get("source_type", "").lower()
99102
if source_type == "conference":
@@ -134,6 +137,29 @@ async def _query_api(self, query_input: QueryInput) -> BackendResult:
134137
response_time=response_time,
135138
)
136139

140+
def _store_acronym_from_openalex(self, openalex_data: dict[str, Any]) -> None:
141+
"""Extract and store acronym mapping from OpenAlex display_name.
142+
143+
OpenAlex sometimes includes acronyms in parentheses in the display_name field.
144+
For example: "International Conference on Machine Learning (ICML)"
145+
146+
Args:
147+
openalex_data: Raw data from OpenAlex API
148+
"""
149+
from ..normalizer import InputNormalizer
150+
151+
display_name = openalex_data.get("display_name")
152+
if not display_name:
153+
return
154+
155+
# Use the normalizer's acronym extraction logic
156+
normalizer = InputNormalizer()
157+
acronyms = normalizer._extract_acronyms(display_name)
158+
159+
if acronyms:
160+
# Store the mapping for each extracted acronym
161+
normalizer._store_acronym_mappings_from_text(display_name, acronyms)
162+
137163
def _analyze_journal_patterns(
138164
self, openalex_data: dict[str, Any]
139165
) -> dict[str, Any]:

src/aletheia_probe/cache.py

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,16 @@ def _init_database(self) -> None:
9797
UNIQUE(journal_id, source_id)
9898
);
9999
100+
-- Conference/journal acronym mappings (self-learning cache)
101+
CREATE TABLE IF NOT EXISTS conference_acronyms (
102+
acronym TEXT PRIMARY KEY COLLATE NOCASE,
103+
full_name TEXT NOT NULL,
104+
source TEXT,
105+
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
106+
last_used_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
107+
);
108+
CREATE INDEX IF NOT EXISTS idx_acronyms_full_name ON conference_acronyms(full_name);
109+
100110
-- Source metadata (replaces JSON metadata)
101111
CREATE TABLE IF NOT EXISTS source_metadata (
102112
id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -1088,6 +1098,97 @@ def cleanup_expired_article_retractions(self) -> int:
10881098
conn.commit()
10891099
return cursor.rowcount
10901100

1101+
# Acronym management methods
1102+
1103+
def get_full_name_for_acronym(self, acronym: str) -> str | None:
1104+
"""
1105+
Look up the full name for a conference/journal acronym.
1106+
1107+
Args:
1108+
acronym: The acronym to look up (e.g., 'ICML', 'CVPR')
1109+
1110+
Returns:
1111+
Full name if found in cache, None otherwise
1112+
"""
1113+
with sqlite3.connect(self.db_path) as conn:
1114+
conn.row_factory = sqlite3.Row
1115+
cursor = conn.cursor()
1116+
1117+
cursor.execute(
1118+
"""
1119+
SELECT full_name FROM conference_acronyms
1120+
WHERE acronym = ? COLLATE NOCASE
1121+
""",
1122+
(acronym.strip(),),
1123+
)
1124+
1125+
row = cursor.fetchone()
1126+
if row:
1127+
# Update last_used_at timestamp
1128+
cursor.execute(
1129+
"""
1130+
UPDATE conference_acronyms
1131+
SET last_used_at = CURRENT_TIMESTAMP
1132+
WHERE acronym = ? COLLATE NOCASE
1133+
""",
1134+
(acronym.strip(),),
1135+
)
1136+
conn.commit()
1137+
return str(row["full_name"])
1138+
return None
1139+
1140+
def store_acronym_mapping(
    self, acronym: str, full_name: str, source: str = "unknown"
) -> None:
    """
    Store an acronym to full name mapping in the cache.

    Both values are stripped of surrounding whitespace first; if either is
    blank afterwards nothing is stored. If the acronym already exists with a
    different full_name, logs a warning and overwrites with the new mapping.

    Args:
        acronym: The acronym (e.g., 'ICML')
        full_name: The full conference/journal name
        source: Source of the mapping ('bibtex_extraction', 'openalex_response', 'manual')
    """
    from contextlib import closing

    acronym = acronym.strip()
    full_name = full_name.strip()
    if not acronym or not full_name:
        # A blank key or blank expansion would only pollute the cache.
        return

    # sqlite3's connection context manager only commits or rolls back the
    # transaction; closing() makes sure the connection itself is released.
    with closing(sqlite3.connect(self.db_path)) as conn, conn:
        conn.row_factory = sqlite3.Row

        # Check for an existing mapping so a conflicting overwrite is reported.
        existing = conn.execute(
            """
            SELECT full_name FROM conference_acronyms
            WHERE acronym = ? COLLATE NOCASE
            """,
            (acronym,),
        ).fetchone()
        if existing is not None and existing["full_name"] != full_name:
            # The logger is only needed on this (rare) conflict path.
            from .logging_config import get_status_logger

            get_status_logger().warning(
                f"Acronym '{acronym}' already maps to '{existing['full_name']}', "
                f"overwriting with '{full_name}'"
            )

        # Insert or replace the mapping; committed when the inner connection
        # context exits without an exception.
        conn.execute(
            """
            INSERT OR REPLACE INTO conference_acronyms
            (acronym, full_name, source, created_at, last_used_at)
            VALUES (?, ?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)
            """,
            (acronym, full_name, source),
        )
1191+
10911192

10921193
# Global cache manager instance with factory pattern
10931194
_cache_manager_instance: CacheManager | None = None

src/aletheia_probe/cli.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -380,6 +380,13 @@ async def _async_assess_publication(
380380
label = "Journal"
381381

382382
print(f"{label}: {result.input_query}")
383+
384+
# Show acronym expansion note if applicable
385+
if result.acronym_expansion_used and result.acronym_expanded_from:
386+
print(
387+
f"Note: Expanded acronym '{result.acronym_expanded_from}' using cached mapping"
388+
)
389+
383390
print(f"Assessment: {result.assessment.upper()}")
384391
print(f"Confidence: {result.confidence:.2f}")
385392
print(f"Overall Score: {result.overall_score:.2f}")

src/aletheia_probe/dispatcher.py

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,8 @@ async def assess_journal(self, query_input: QueryInput) -> AssessmentResult:
8888
metadata=None,
8989
reasoning=["No backends available for assessment"],
9090
processing_time=time.time() - start_time,
91+
acronym_expanded_from=query_input.acronym_expanded_from,
92+
acronym_expansion_used=bool(query_input.acronym_expanded_from),
9193
)
9294

9395
self.status_logger.info(
@@ -102,6 +104,46 @@ async def assess_journal(self, query_input: QueryInput) -> AssessmentResult:
102104
query_input, backend_results, time.time() - start_time
103105
)
104106

107+
# Acronym fallback: If initial query yields no confident results and input looks
108+
# like an acronym with a cached expansion, retry with the expanded name
109+
if self._should_try_acronym_fallback(assessment_result, query_input):
110+
from .cache import CacheManager
111+
from .normalizer import InputNormalizer
112+
113+
normalizer = InputNormalizer()
114+
cache = CacheManager()
115+
116+
# Check if input is acronym-like and has expansion
117+
if normalizer._is_standalone_acronym(query_input.raw_input):
118+
expanded_name = cache.get_full_name_for_acronym(query_input.raw_input)
119+
120+
if expanded_name:
121+
self.status_logger.info(
122+
f"No confident results for '{query_input.raw_input}'. "
123+
f"Retrying with expanded name: '{expanded_name}'"
124+
)
125+
126+
# Create new query input with expanded name
127+
from .normalizer import input_normalizer
128+
129+
expanded_query = input_normalizer.normalize(expanded_name)
130+
131+
# Re-query backends with expanded name
132+
retry_results = await self._query_backends(
133+
enabled_backends, expanded_query
134+
)
135+
136+
# Calculate new assessment
137+
retry_assessment = self._calculate_assessment(
138+
expanded_query, retry_results, time.time() - start_time
139+
)
140+
141+
# If retry gave better results, use it and mark acronym expansion
142+
if retry_assessment.confidence > assessment_result.confidence:
143+
retry_assessment.acronym_expanded_from = query_input.raw_input
144+
retry_assessment.acronym_expansion_used = True
145+
return retry_assessment
146+
105147
return assessment_result
106148

107149
def _get_enabled_backends(self) -> list[Backend]:
@@ -256,6 +298,46 @@ async def _query_backend_with_timing(
256298
result_dict["evidence_type"] = backend.get_evidence_type().value
257299
return BackendResult(**result_dict)
258300

301+
def _should_try_acronym_fallback(
    self, assessment_result: AssessmentResult, query_input: QueryInput
) -> bool:
    """Decide whether a retry with the expanded acronym is worthwhile.

    The fallback is never attempted for a query that was already produced by
    acronym expansion. Otherwise it is attempted as soon as any one of the
    following holds for the initial assessment:

    - the assessment is UNKNOWN
    - the confidence is below 0.3
    - none of the backend results has FOUND status

    Args:
        assessment_result: The initial assessment result
        query_input: The original query input

    Returns:
        True if acronym fallback should be attempted
    """
    # Guard: an already-expanded query must not trigger another expansion.
    if query_input.acronym_expanded_from:
        return False

    # Checks run in order of cost; the scan over backend results only happens
    # when the two cheap attribute checks did not already settle the answer.
    return (
        assessment_result.assessment == AssessmentType.UNKNOWN
        or assessment_result.confidence < 0.3
        or not any(
            result.status == BackendStatus.FOUND
            for result in assessment_result.backend_results
        )
    )
340+
259341
def _calculate_assessment(
260342
self,
261343
query_input: QueryInput,
@@ -395,6 +477,8 @@ def _handle_no_results(
395477
metadata=None,
396478
reasoning=reasoning,
397479
processing_time=processing_time,
480+
acronym_expanded_from=query_input.acronym_expanded_from,
481+
acronym_expansion_used=bool(query_input.acronym_expanded_from),
398482
)
399483

400484
def _calculate_backend_scores(
@@ -611,6 +695,8 @@ def _make_final_assessment(
611695
metadata=None,
612696
reasoning=reasoning,
613697
processing_time=processing_time,
698+
acronym_expanded_from=query_input.acronym_expanded_from,
699+
acronym_expansion_used=bool(query_input.acronym_expanded_from),
614700
)
615701

616702

src/aletheia_probe/models.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@ class QueryInput(BaseModel):
2828
default_factory=dict, description="ISSN, DOI, etc."
2929
)
3030
aliases: list[str] = Field(default_factory=list, description="Alternative names")
31+
acronym_expanded_from: str | None = Field(
32+
None, description="Original acronym if expansion was applied"
33+
)
3134

3235

3336
class BackendResult(BaseModel):
@@ -102,6 +105,12 @@ class AssessmentResult(BaseModel):
102105
default_factory=datetime.now, description="Assessment timestamp"
103106
)
104107
processing_time: float = Field(..., description="Total processing time in seconds")
108+
acronym_expanded_from: str | None = Field(
109+
None, description="Original acronym if expansion was applied during assessment"
110+
)
111+
acronym_expansion_used: bool = Field(
112+
False, description="Whether acronym expansion was used to get results"
113+
)
105114

106115

107116
class ConfigBackend(BaseModel):

0 commit comments

Comments
 (0)