Skip to content

Commit 25a12a4

Browse files
Fix conference name normalization to preserve critical acronyms (fixes #76) (#81)
## Problem

The normalizer was removing conference acronyms like (CVPR), (NeurIPS), and (ICCV) from parentheses, making it impossible to match major conferences against databases like OpenAlex. This caused legitimate conferences to show as UNKNOWN instead of LEGITIMATE.

## Solution

### 1. Extract acronyms as aliases (normalizer.py)

- Added `_extract_acronyms()` method to identify conference/journal acronyms
- Uses heuristics: primarily uppercase (≥50%), 2–20 chars, starts uppercase
- Filters out metadata keywords (ISSN, online, invited, etc.)
- Extracts acronyms like CVPR, NeurIPS, ICCV before text cleaning
- Adds extracted acronyms to the aliases list for backend matching

### 2. Enable alias fallback in OpenAlex backend (openalex_analyzer.py)

- If the normalized name is not found, iterates through aliases
- Stops at the first successful match
- Logs which alias was used, for debugging
- Includes the tried aliases in the NOT_FOUND response data

## Results

- "IEEE Conference on CVPR (CVPR)" → LEGITIMATE (was UNKNOWN)
- "International Conference on Computer Vision (ICCV)" → LEGITIMATE (was UNKNOWN)
- All 244 unit tests pass
- Mypy type checking passes

Co-authored-by: florath-ai-assistant[bot] <Andreas.Florath@telekom.de>
1 parent 33bc432 commit 25a12a4

File tree

2 files changed

+94
-0
lines changed

2 files changed

+94
-0
lines changed

src/aletheia_probe/backends/openalex_analyzer.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,21 @@ async def _query_api(self, query_input: QueryInput) -> BackendResult:
5959
journal_name=journal_name, issn=issn, eissn=eissn
6060
)
6161

62+
# If not found with normalized name, try aliases (especially acronyms)
63+
if not openalex_data and query_input.aliases:
64+
self.detail_logger.info(
65+
f"OpenAlex: Normalized name '{journal_name}' not found, trying {len(query_input.aliases)} alias(es)"
66+
)
67+
for alias in query_input.aliases:
68+
openalex_data = await client.enrich_journal_data(
69+
journal_name=alias, issn=issn, eissn=eissn
70+
)
71+
if openalex_data:
72+
self.detail_logger.info(
73+
f"OpenAlex: Found match using alias '{alias}'"
74+
)
75+
break
76+
6277
response_time = time.time() - start_time
6378

6479
if not openalex_data:
@@ -72,6 +87,7 @@ async def _query_api(self, query_input: QueryInput) -> BackendResult:
7287
"searched_for": journal_name,
7388
"issn": issn,
7489
"eissn": eissn,
90+
"aliases_tried": query_input.aliases,
7591
},
7692
sources=["https://api.openalex.org"],
7793
error_message=None,

src/aletheia_probe/normalizer.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,9 @@ def normalize(self, raw_input: str) -> QueryInput:
119119
# Extract identifiers first
120120
identifiers = self._extract_identifiers(raw_input)
121121

122+
# Extract conference/journal acronyms from parentheses before cleaning
123+
extracted_acronyms = self._extract_acronyms(raw_input)
124+
122125
# Clean and normalize the text
123126
normalized = self._clean_text(raw_input)
124127
normalized = self._expand_abbreviations(normalized)
@@ -127,6 +130,18 @@ def normalize(self, raw_input: str) -> QueryInput:
127130
# Generate aliases
128131
aliases = self._generate_aliases(normalized)
129132

133+
# Add extracted acronyms to aliases for better matching
134+
aliases.extend(extracted_acronyms)
135+
136+
# Remove duplicates while preserving order
137+
seen: set[str] = set()
138+
unique_aliases: list[str] = []
139+
for alias in aliases:
140+
if alias not in seen:
141+
seen.add(alias)
142+
unique_aliases.append(alias)
143+
aliases = unique_aliases
144+
130145
return QueryInput(
131146
raw_input=raw_input.strip(),
132147
normalized_name=normalized,
@@ -152,6 +167,69 @@ def _extract_identifiers(self, text: str) -> dict[str, str]:
152167

153168
return identifiers
154169

170+
def _extract_acronyms(self, text: str) -> list[str]:
171+
"""Extract conference/journal acronyms from parentheses for use as aliases.
172+
173+
Conference acronyms are typically uppercase letters, possibly with numbers,
174+
and appear in parentheses (e.g., CVPR, NeurIPS, ICCV, 3DV).
175+
176+
Examples:
177+
"IEEE Conference on Computer Vision (CVPR)" -> ["CVPR"]
178+
"Neural Information Processing Systems (NeurIPS)" -> ["NeurIPS"]
179+
"Conference (ISSN: 1234-5678)" -> [] # Not an acronym
180+
181+
Args:
182+
text: Input text that may contain parenthesized acronyms
183+
184+
Returns:
185+
List of extracted acronyms
186+
"""
187+
acronyms = []
188+
189+
# Find all content within parentheses
190+
pattern = r"\(([^)]+)\)"
191+
matches = re.findall(pattern, text)
192+
193+
for content in matches:
194+
content = content.strip()
195+
196+
# Skip if contains certain keywords that indicate metadata, not acronyms
197+
skip_keywords = [
198+
"issn",
199+
"isbn",
200+
"doi",
201+
"online",
202+
"print",
203+
"invited",
204+
"accepted",
205+
"to appear",
206+
]
207+
if any(keyword in content.lower() for keyword in skip_keywords):
208+
continue
209+
210+
# Check if content looks like a conference/journal acronym:
211+
# - Primarily uppercase letters (allow some lowercase for mixed cases like NeurIPS)
212+
# - May contain numbers (e.g., 3DV, CVPR'23)
213+
# - May contain apostrophes for year indicators (e.g., CVPR'23)
214+
# - May contain hyphens (e.g., AAAI-23)
215+
# - Typically short (2-20 characters)
216+
# - Must start with uppercase letter
217+
# - No spaces, colons, or special punctuation
218+
219+
# Pattern: starts with uppercase, contains mostly uppercase letters/numbers,
220+
# may have apostrophes, hyphens, or few lowercase letters
221+
if re.match(r"^[A-Z][A-Za-z0-9'\-]{1,19}$", content):
222+
# Additional check: should have a good proportion of uppercase letters
223+
# to avoid catching things like "(Online)" or "(Invited)"
224+
# Use 50% threshold to catch "NeurIPS" (57%) while excluding "Online" (17%)
225+
uppercase_count = sum(1 for c in content if c.isupper())
226+
total_alpha = sum(1 for c in content if c.isalpha())
227+
228+
if total_alpha > 0 and (uppercase_count / total_alpha) >= 0.5:
229+
acronyms.append(content)
230+
231+
return acronyms
232+
155233
def _clean_text(self, text: str) -> str:
156234
"""Clean and normalize text using regex patterns."""
157235
# Remove identifiers from text for name normalization

0 commit comments

Comments
 (0)