feat: add normalization contract and dispatcher gating (#1047)

coding-ai-assistant[bot] · florath · web-flow · commit 95d475387f30 · 2026-02-16T07:26:29.000+01:00
* feat: add normalization result contract and dispatcher gate

Introduce a structured NormalizationResult model and enforce dispatcher gating so that a non-OK normalization status returns a no-assessment result before any backend is queried.

Attach the normalization payload to QueryInput for backend consumption, and add model and dispatcher unit tests for the OK and conflict paths.

* refactor: simplify normalization payload contract

Reduce NormalizationResult to a minimal backend-facing structure (original text, venue type, name, acronym, ISSN/eISSN, aliases, input identifiers) and keep normalization assessment/gating internal to the dispatcher.

This removes confidence-like normalization evidence from the shared payload while preserving strict no-assessment behavior on normalization failures.

---------

Co-authored-by: florath-ai-assistant[bot] <Andreas.Florath@telekom.de>
diff --git a/src/aletheia_probe/dispatcher.py b/src/aletheia_probe/dispatcher.py
@@ -19,7 +19,15 @@
 from .enums import AssessmentType, EvidenceType
 from .fallback_chain import QueryFallbackChain
 from .logging_config import get_detail_logger, get_status_logger
-from .models import AssessmentResult, BackendResult, BackendStatus, QueryInput
+from .lookup import VenueLookupService
+from .models import (
+    AssessmentResult,
+    BackendResult,
+    BackendStatus,
+    NormalizationResult,
+    QueryInput,
+    VenueType,
+)
 from .normalizer import InputNormalizer, input_normalizer
 from .openalex import OpenAlexClient
 from .quality_assessment import QualityAssessmentProcessor
@@ -84,6 +92,7 @@ def __init__(self) -> None:
         self.cross_validation_registry = get_cross_validation_registry()
         self.quality_processor = QualityAssessmentProcessor()
         self.journal_cache = JournalCache()
+        self.lookup_service = VenueLookupService(journal_cache=self.journal_cache)
 
     async def assess_journal(self, query_input: QueryInput) -> AssessmentResult:
         """Assess a journal using all enabled backends.
@@ -95,6 +104,18 @@ async def assess_journal(self, query_input: QueryInput) -> AssessmentResult:
             AssessmentResult with aggregated assessment from all backends
         """
         start_time = time.time()
+        (
+            normalization_result,
+            normalization_failure,
+        ) = await self._normalize_for_dispatch(query_input)
+        query_input = self._attach_normalization_to_query(
+            query_input, normalization_result
+        )
+        if normalization_failure:
+            return self._build_normalization_blocked_result(
+                query_input, normalization_failure, start_time
+            )
+
         query_input = await self._enrich_query_identifiers(query_input)
 
         # Get enabled backends from registry
@@ -138,6 +159,110 @@ async def assess_journal(self, query_input: QueryInput) -> AssessmentResult:
             assessment_result, query_input, enabled_backends, start_time
         )
 
+    async def _normalize_for_dispatch(
+        self, query_input: QueryInput
+    ) -> tuple[NormalizationResult, str | None]:
+        """Build minimal normalization payload and evaluate gating failures."""
+        requested_venue_type = (
+            query_input.venue_type
+            if query_input.venue_type != VenueType.UNKNOWN
+            else VenueType.JOURNAL
+        )
+
+        lookup_result = self.lookup_service.lookup(
+            query_input.raw_input,
+            venue_type=requested_venue_type,
+            confidence_min=DEFAULT_ACRONYM_CONFIDENCE_MIN,
+        )
+        primary_name = (
+            lookup_result.normalized_name or query_input.normalized_name or ""
+        ).strip() or None
+        selected_issn = query_input.identifiers.get("issn") or (
+            lookup_result.issns[0] if lookup_result.issns else None
+        )
+        selected_eissn = query_input.identifiers.get("eissn") or (
+            lookup_result.eissns[0] if lookup_result.eissns else None
+        )
+
+        consistency_errors = list(lookup_result.consistency_errors)
+        failure_reason: str | None = None
+        if not primary_name and not (selected_issn or selected_eissn):
+            failure_reason = "Normalization did not resolve a name or identifier"
+
+        input_ids = {value for value in query_input.identifiers.values() if value}
+        if primary_name and input_ids:
+            resolved_ids: set[str] = set()
+            for candidate in lookup_result.candidates:
+                if candidate.normalized_name != primary_name:
+                    continue
+                if candidate.issn:
+                    resolved_ids.add(candidate.issn)
+                if candidate.eissn:
+                    resolved_ids.add(candidate.eissn)
+
+            if resolved_ids and input_ids.isdisjoint(resolved_ids):
+                consistency_errors.append(
+                    "Input mismatch: provided identifier(s) "
+                    f"{sorted(input_ids)} do not match '{primary_name}' "
+                    f"(resolved identifiers: {sorted(resolved_ids)})"
+                )
+
+        if consistency_errors:
+            failure_reason = "; ".join(sorted(set(consistency_errors)))
+
+        normalization_result = NormalizationResult(
+            original_text=lookup_result.raw_input,
+            venue_type=requested_venue_type,
+            name=primary_name,
+            acronym=query_input.acronym_expanded_from,
+            issn=selected_issn,
+            eissn=selected_eissn,
+            aliases=lookup_result.aliases,
+            input_identifiers=dict(query_input.identifiers),
+        )
+        return normalization_result, failure_reason
+
+    def _attach_normalization_to_query(
+        self, query_input: QueryInput, normalization_result: NormalizationResult
+    ) -> QueryInput:
+        """Attach normalization payload and selected fields to query input."""
+        normalized_name = normalization_result.name or query_input.normalized_name
+        merged_identifiers = dict(query_input.identifiers)
+        if normalization_result.issn:
+            merged_identifiers.setdefault("issn", normalization_result.issn)
+        if normalization_result.eissn:
+            merged_identifiers.setdefault("eissn", normalization_result.eissn)
+        return query_input.model_copy(
+            update={
+                "normalized_name": normalized_name,
+                "identifiers": merged_identifiers,
+                "normalization_result": normalization_result,
+            }
+        )
+
+    def _build_normalization_blocked_result(
+        self,
+        query_input: QueryInput,
+        failure_reason: str,
+        start_time: float,
+    ) -> AssessmentResult:
+        """Build a no-assessment result when normalization gate is not OK."""
+        reason = failure_reason or "Normalization failed; no assessment possible"
+        self.status_logger.warning(f"Normalization blocked assessment: {reason}")
+        return AssessmentResult(
+            input_query=query_input.raw_input,
+            assessment=AssessmentType.INSUFFICIENT_DATA,
+            confidence=0.0,
+            overall_score=0.0,
+            backend_results=[],
+            metadata=None,
+            reasoning=[reason],
+            processing_time=time.time() - start_time,
+            acronym_expanded_from=query_input.acronym_expanded_from,
+            acronym_expansion_used=bool(query_input.acronym_expanded_from),
+            venue_type=query_input.venue_type,
+        )
+
     async def _enrich_query_identifiers(self, query_input: QueryInput) -> QueryInput:
         """Enrich query identifiers with reliable ISSN/eISSN from cache/API."""
         if query_input.identifiers.get("issn") or query_input.identifiers.get("eissn"):
diff --git a/src/aletheia_probe/models.py b/src/aletheia_probe/models.py
@@ -49,6 +49,23 @@ class BackendStatus(str, Enum):
     TIMEOUT = "timeout"
 
 
+class NormalizationResult(BaseModel):
+    """Minimal normalization payload passed to backends."""
+
+    original_text: str = Field(..., description="Original query input string")
+    name: str | None = Field(None, description="Normalized venue name")
+    acronym: str | None = Field(
+        None, description="Detected or expanded acronym, if available"
+    )
+    issn: str | None = Field(None, description="Resolved print ISSN")
+    eissn: str | None = Field(None, description="Resolved electronic ISSN")
+    venue_type: VenueType = Field(..., description="Requested/detected venue type")
+    aliases: list[str] = Field(default_factory=list, description="Known aliases")
+    input_identifiers: dict[str, str] = Field(
+        default_factory=dict, description="Identifiers extracted directly from input"
+    )
+
+
 class QueryInput(BaseModel):
     """Input query data for journal assessment."""
 
@@ -68,6 +85,10 @@ class QueryInput(BaseModel):
         default_factory=dict,
         description="Acronym to full name mappings extracted during normalization",
     )
+    normalization_result: NormalizationResult | None = Field(
+        None,
+        description="Structured normalization payload passed to backends",
+    )
 
 
 class BackendResult(BaseModel):
diff --git a/tests/unit/test_dispatcher.py b/tests/unit/test_dispatcher.py
@@ -14,7 +14,9 @@
     AssessmentResult,
     BackendResult,
     BackendStatus,
+    NormalizationResult,
     QueryInput,
+    VenueType,
 )
 
 
@@ -96,8 +98,26 @@ async def test_assess_journal_basic_flow(
         self, dispatcher, sample_query_input, mock_backend
     ):
         """Test basic journal assessment flow."""
-        with patch.object(
-            dispatcher, "_get_enabled_backends", return_value=[mock_backend]
+        with (
+            patch.object(
+                dispatcher, "_get_enabled_backends", return_value=[mock_backend]
+            ),
+            patch.object(
+                dispatcher,
+                "_normalize_for_dispatch",
+                AsyncMock(
+                    return_value=(
+                        NormalizationResult(
+                            original_text=sample_query_input.raw_input,
+                            venue_type=VenueType.JOURNAL,
+                            name="journal of advanced computer science",
+                            issn="1234-5679",
+                            input_identifiers={"issn": "1234-5679"},
+                        ),
+                        None,
+                    )
+                ),
+            ),
         ):
             result = await dispatcher.assess_journal(sample_query_input)
 
@@ -108,6 +128,36 @@ async def test_assess_journal_basic_flow(
             assert result.processing_time > 0
             assert len(result.backend_results) == 1
 
+    @pytest.mark.asyncio
+    async def test_assess_journal_blocks_on_normalization_conflict(
+        self, dispatcher, sample_query_input, mock_backend
+    ):
+        """Do not query backends when normalization status is conflict."""
+        conflict_result = NormalizationResult(
+            original_text=sample_query_input.raw_input,
+            venue_type=VenueType.JOURNAL,
+            name="journal of advanced computer science",
+            issn="1234-5679",
+            input_identifiers={"issn": "1234-5679"},
+        )
+
+        with (
+            patch.object(
+                dispatcher, "_get_enabled_backends", return_value=[mock_backend]
+            ),
+            patch.object(
+                dispatcher,
+                "_normalize_for_dispatch",
+                AsyncMock(return_value=(conflict_result, "identifier mismatch")),
+            ),
+        ):
+            result = await dispatcher.assess_journal(sample_query_input)
+
+        assert result.assessment == AssessmentType.INSUFFICIENT_DATA
+        assert result.backend_results == []
+        assert any("identifier mismatch" in reason for reason in result.reasoning)
+        mock_backend.query_with_timeout.assert_not_called()
+
     @pytest.mark.asyncio
     async def test_assess_journal_no_backends(self, dispatcher, sample_query_input):
         """Test assessment with no enabled backends."""
diff --git a/tests/unit/test_models.py b/tests/unit/test_models.py
@@ -16,6 +16,7 @@
     BibtexEntry,
     ConfigBackend,
     JournalMetadata,
+    NormalizationResult,
     QueryInput,
     VenueType,
 )
@@ -45,6 +46,58 @@ def test_create_full_query_input(self):
         assert query.identifiers["issn"] == "1234-5679"
         assert "Test Science Journal" in query.aliases
 
+    def test_query_input_with_normalization_result(self):
+        """Test attaching normalization payload to QueryInput."""
+        normalization_result = NormalizationResult(
+            original_text="Nature 0028-0836",
+            venue_type=VenueType.JOURNAL,
+            name="nature",
+            issn="0028-0836",
+            input_identifiers={"issn": "0028-0836"},
+        )
+        query = QueryInput(
+            raw_input="Nature 0028-0836",
+            normalized_name="nature",
+            identifiers={"issn": "0028-0836"},
+            normalization_result=normalization_result,
+        )
+        assert query.normalization_result is not None
+        assert query.normalization_result.name == "nature"
+
+
+class TestNormalizationResult:
+    """Tests for normalization contract models."""
+
+    def test_create_normalization_result(self):
+        """Create minimal normalization payload for backend consumption."""
+        result = NormalizationResult(
+            original_text="Nature",
+            venue_type=VenueType.JOURNAL,
+            name="nature",
+            aliases=[],
+            acronym=None,
+            input_identifiers={},
+            issn="0028-0836",
+            eissn="1476-4687",
+        )
+        assert result.name == "nature"
+        assert result.issn == "0028-0836"
+        assert result.eissn == "1476-4687"
+
+    def test_create_partial_normalization_result(self):
+        """Missing fields should be represented as None in minimal payload."""
+        result = NormalizationResult(
+            original_text="Unknown Venue",
+            venue_type=VenueType.JOURNAL,
+            name=None,
+            acronym=None,
+            issn=None,
+            eissn=None,
+        )
+        assert result.name is None
+        assert result.issn is None
+        assert result.eissn is None
+
 
 class TestBackendResult:
     """Tests for BackendResult model."""