Skip to content

Commit 8dd4409

Browse files
feat: Add 'suspicious' as evaluation result for heuristic-only assessments (issue #65) (#66)
## Summary

Implements the 'suspicious' evaluation result to distinguish between definitive predatory classifications (based on predatory lists) and heuristic-based assessments (based on indicators like retraction rates).

## Changes Made

### Core Feature
- Added SUSPICIOUS to AssessmentType enum
- Created EvidenceType enum (PREDATORY_LIST, LEGITIMATE_LIST, HEURISTIC)
- Enhanced Backend architecture with get_evidence_type() abstract method
- Updated models with suspicious counters and evidence_type field

### Classification Logic (per issue #65)
- PREDATORY: Only when journal found in predatory lists
- SUSPICIOUS: When assessment based solely on heuristics
- Mixed case: Predatory list + negative heuristics = PREDATORY
- Backend-centric evidence classification (no central lists)

### Display Updates
- Added ⚠️ emoji for suspicious results in batch output
- Updated CLI to support suspicious in custom list types
- Enhanced batch summaries with suspicious counts
- Lower confidence scoring for heuristic-only assessments

### Architecture Improvements
- CachedBackend automatically detects evidence type from list_type
- HybridBackend defaults to HEURISTIC evidence
- DOAJ overridden to return LEGITIMATE_LIST evidence
- Extensible design for new backends

### Test Updates
- Fixed all mock backends to implement get_evidence_type()
- Added evidence_type field to all BackendResult test instances
- Updated test expectations for new classification logic
- All 254 tests passing

## Technical Details
- Backward compatible with existing functionality
- Self-declaring backends eliminate maintenance of central lists
- Proper confidence scoring adjustments for different evidence types
- Comprehensive error handling and validation

Co-authored-by: florath-ai-assistant[bot] <Andreas.Florath@telekom.de>
1 parent 82cc634 commit 8dd4409

File tree

10 files changed

+174
-24
lines changed

10 files changed

+174
-24
lines changed

src/aletheia_probe/backends/base.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from typing import Any
1111

1212
from ..cache import get_cache_manager
13+
from ..enums import EvidenceType
1314
from ..models import AssessmentResult, BackendResult, BackendStatus, QueryInput
1415

1516

@@ -42,6 +43,11 @@ def get_description(self) -> str:
4243
"""Return a description of what this backend checks."""
4344
pass
4445

46+
@abstractmethod
47+
def get_evidence_type(self) -> EvidenceType:
48+
"""Return the type of evidence this backend provides."""
49+
pass
50+
4551
async def query_with_timeout(
4652
self, query_input: QueryInput, timeout: int = 10
4753
) -> BackendResult:
@@ -91,6 +97,16 @@ def __init__(self, source_name: str, list_type: str, cache_ttl_hours: int = 24):
9197
self.source_name = source_name
9298
self.list_type = list_type
9399

100+
def get_evidence_type(self) -> EvidenceType:
101+
"""Return evidence type based on list type."""
102+
if self.list_type == "predatory":
103+
return EvidenceType.PREDATORY_LIST
104+
elif self.list_type == "legitimate":
105+
return EvidenceType.LEGITIMATE_LIST
106+
else:
107+
# Default to heuristic for unknown list types
108+
return EvidenceType.HEURISTIC
109+
94110
async def query(self, query_input: QueryInput) -> BackendResult:
95111
"""Query cached data for journal information."""
96112
start_time = time.time()
@@ -202,6 +218,10 @@ class HybridBackend(Backend):
202218
def __init__(self, cache_ttl_hours: int = 24):
203219
super().__init__(cache_ttl_hours)
204220

221+
def get_evidence_type(self) -> EvidenceType:
222+
"""HybridBackend provides heuristic evidence by default."""
223+
return EvidenceType.HEURISTIC
224+
205225
async def query(self, query_input: QueryInput) -> BackendResult:
206226
"""Check cache first, then query live API if needed."""
207227
start_time = time.time()

src/aletheia_probe/backends/doaj.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
import aiohttp
1010

11-
from ..enums import AssessmentType
11+
from ..enums import AssessmentType, EvidenceType
1212
from ..logging_config import get_detail_logger, get_status_logger
1313
from ..models import BackendResult, BackendStatus, QueryInput
1414
from ..retry_utils import async_retry_with_backoff
@@ -40,6 +40,9 @@ def get_name(self) -> str:
4040
def get_description(self) -> str:
4141
return "Checks DOAJ (Directory of Open Access Journals) for legitimate journals"
4242

43+
def get_evidence_type(self) -> EvidenceType:
44+
return EvidenceType.LEGITIMATE_LIST
45+
4346
async def _query_api(self, query_input: QueryInput) -> BackendResult:
4447
"""Query DOAJ API for journal information with retry logic."""
4548
start_time = time.time()

src/aletheia_probe/batch_assessor.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,12 +93,15 @@ async def assess_bibtex_file(
9393
predatory_count=0,
9494
legitimate_count=0,
9595
insufficient_data_count=0,
96+
suspicious_count=0,
9697
conference_entries=0,
9798
conference_predatory=0,
9899
conference_legitimate=0,
100+
conference_suspicious=0,
99101
journal_entries=0,
100102
journal_predatory=0,
101103
journal_legitimate=0,
104+
journal_suspicious=0,
102105
has_predatory_journals=False,
103106
retracted_articles_count=0,
104107
articles_checked_for_retraction=0,
@@ -182,6 +185,12 @@ async def assess_bibtex_file(
182185
result.conference_legitimate += 1
183186
else:
184187
result.journal_legitimate += 1
188+
elif assessment.assessment == AssessmentType.SUSPICIOUS:
189+
result.suspicious_count += 1
190+
if is_conference:
191+
result.conference_suspicious += 1
192+
else:
193+
result.journal_suspicious += 1
185194
else:
186195
result.insufficient_data_count += 1
187196

@@ -246,6 +255,15 @@ def format_summary(result: BibtexAssessmentResult, verbose: bool = False) -> str
246255
summary_lines.append(
247256
f" 🎤 Conferences: {result.conference_predatory}/{result.conference_entries}"
248257
)
258+
summary_lines.append(f" Suspicious: {result.suspicious_count} total")
259+
if result.journal_entries > 0:
260+
summary_lines.append(
261+
f" 📄 Journals: {result.journal_suspicious}/{result.journal_entries}"
262+
)
263+
if result.conference_entries > 0:
264+
summary_lines.append(
265+
f" 🎤 Conferences: {result.conference_suspicious}/{result.conference_entries}"
266+
)
249267
summary_lines.append(f" Legitimate: {result.legitimate_count} total")
250268
if result.journal_entries > 0:
251269
summary_lines.append(
@@ -294,6 +312,7 @@ def format_summary(result: BibtexAssessmentResult, verbose: bool = False) -> str
294312
emoji_map: dict[str, str] = {
295313
AssessmentType.PREDATORY.value: "❌",
296314
AssessmentType.LEGITIMATE.value: "✅",
315+
AssessmentType.SUSPICIOUS.value: "⚠️",
297316
AssessmentType.UNKNOWN.value: "❓",
298317
}
299318
status_emoji = emoji_map.get(assessment.assessment, "❓")

src/aletheia_probe/cli.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,12 @@ def status() -> None:
230230
@click.option(
231231
"--list-type",
232232
type=click.Choice(
233-
[AssessmentType.PREDATORY, AssessmentType.LEGITIMATE, AssessmentType.UNKNOWN]
233+
[
234+
AssessmentType.PREDATORY,
235+
AssessmentType.LEGITIMATE,
236+
AssessmentType.SUSPICIOUS,
237+
AssessmentType.UNKNOWN,
238+
]
234239
),
235240
default=AssessmentType.PREDATORY,
236241
help="Type of journals in the list",

src/aletheia_probe/dispatcher.py

Lines changed: 72 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
AGREEMENT_BONUS_AMOUNT,
1212
CONFIDENCE_THRESHOLD_HIGH,
1313
)
14-
from .enums import AssessmentType
14+
from .enums import AssessmentType, EvidenceType
1515
from .logging_config import get_detail_logger, get_status_logger
1616
from .models import AssessmentResult, BackendResult, BackendStatus, QueryInput
1717

@@ -205,6 +205,7 @@ async def _query_backends(
205205
assessment=None,
206206
error_message=str(result),
207207
response_time=0.0,
208+
evidence_type="heuristic", # Default for error cases
208209
)
209210
backend_results.append(error_result)
210211
elif isinstance(result, BackendResult):
@@ -227,6 +228,7 @@ async def _query_backends(
227228
assessment=None,
228229
error_message=f"Unexpected result type: {type(result)}",
229230
response_time=0.0,
231+
evidence_type="heuristic", # Default for error cases
230232
)
231233
backend_results.append(error_result)
232234

@@ -251,6 +253,7 @@ async def _query_backend_with_timing(
251253
# response_time already contains the actual backend execution time
252254
result_dict = result.model_dump()
253255
result_dict["execution_time_ms"] = result.response_time * 1000
256+
result_dict["evidence_type"] = backend.get_evidence_type().value
254257
return BackendResult(**result_dict)
255258

256259
def _calculate_assessment(
@@ -460,30 +463,63 @@ def _make_final_assessment(
460463
total_weight = score_data["total_weight"]
461464
retraction_risk_level = retraction_info.get("risk_level")
462465

463-
# Decision logic
466+
# Analyze evidence types to determine classification
467+
predatory_list_evidence = []
468+
legitimate_list_evidence = []
469+
heuristic_evidence = []
470+
471+
for result in successful_results:
472+
if (
473+
result.evidence_type == EvidenceType.PREDATORY_LIST.value
474+
and result.assessment == AssessmentType.PREDATORY
475+
):
476+
predatory_list_evidence.append(result)
477+
elif (
478+
result.evidence_type == EvidenceType.LEGITIMATE_LIST.value
479+
and result.assessment == AssessmentType.LEGITIMATE
480+
):
481+
legitimate_list_evidence.append(result)
482+
elif result.evidence_type == EvidenceType.HEURISTIC.value:
483+
heuristic_evidence.append(result)
484+
485+
# Decision logic based on issue #65 requirements
464486
if total_weight == 0:
465487
assessment = AssessmentType.UNKNOWN
466488
confidence = 0.1
467489
overall_score = 0.0
468-
elif total_predatory_weight > total_legitimate_weight:
469-
assessment = AssessmentType.PREDATORY
470-
confidence = min(0.95, total_predatory_weight / total_weight)
471-
overall_score = total_predatory_weight / total_weight
472-
reasoning.insert(
473-
0,
474-
f"Classified as predatory based on {score_data['predatory_count']} source(s)",
475-
)
476-
477-
# Cross-validate with retraction data
478-
if retraction_risk_level in ["critical", "high"]:
479-
confidence = min(
480-
CONFIDENCE_THRESHOLD_HIGH, confidence + AGREEMENT_BONUS_AMOUNT
490+
reasoning.insert(0, "No assessment data available")
491+
492+
elif len(predatory_list_evidence) > 0:
493+
# Rule: If ANY predatory list evidence exists, can be PREDATORY
494+
if total_predatory_weight > total_legitimate_weight:
495+
assessment = AssessmentType.PREDATORY
496+
confidence = min(0.95, total_predatory_weight / total_weight)
497+
overall_score = total_predatory_weight / total_weight
498+
reasoning.insert(
499+
0,
500+
f"Classified as predatory based on {len(predatory_list_evidence)} predatory list(s)",
481501
)
482-
reasoning.append(
483-
"⚠️ High retraction rate corroborates predatory classification"
502+
503+
# Cross-validate with retraction data
504+
if retraction_risk_level in ["critical", "high"]:
505+
confidence = min(
506+
CONFIDENCE_THRESHOLD_HIGH, confidence + AGREEMENT_BONUS_AMOUNT
507+
)
508+
reasoning.append(
509+
"⚠️ High retraction rate corroborates predatory classification"
510+
)
511+
else:
512+
# Predatory list evidence exists but legitimate evidence is stronger
513+
assessment = AssessmentType.LEGITIMATE
514+
confidence = min(0.9, total_legitimate_weight / total_weight)
515+
overall_score = total_legitimate_weight / total_weight
516+
reasoning.insert(
517+
0,
518+
"Classified as legitimate despite predatory list match - stronger legitimate evidence",
484519
)
485520

486521
elif total_legitimate_weight > 0:
522+
# Only legitimate evidence (list or heuristic)
487523
assessment = AssessmentType.LEGITIMATE
488524
confidence = min(0.9, total_legitimate_weight / total_weight)
489525
overall_score = total_legitimate_weight / total_weight
@@ -502,6 +538,25 @@ def _make_final_assessment(
502538
"⚠️ NOTE: Moderate retraction rate - quality concerns exist"
503539
)
504540

541+
elif total_predatory_weight > 0:
542+
# Rule: Predatory assessment based ONLY on heuristics = SUSPICIOUS
543+
assessment = AssessmentType.SUSPICIOUS
544+
confidence = min(
545+
0.85, total_predatory_weight / total_weight
546+
) # Lower confidence for heuristic-only
547+
overall_score = total_predatory_weight / total_weight
548+
reasoning.insert(
549+
0,
550+
f"Classified as suspicious based on heuristic analysis only ({score_data['predatory_count']} source(s))",
551+
)
552+
553+
# Retraction data supports suspicious classification
554+
if retraction_risk_level in ["critical", "high"]:
555+
confidence = min(0.95, confidence + AGREEMENT_BONUS_AMOUNT)
556+
reasoning.append(
557+
"⚠️ High retraction rate supports suspicious classification"
558+
)
559+
505560
else:
506561
assessment = AssessmentType.UNKNOWN
507562
confidence = 0.3

src/aletheia_probe/enums.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ class AssessmentType(str, Enum):
2020

2121
PREDATORY = "predatory"
2222
LEGITIMATE = "legitimate"
23+
SUSPICIOUS = "suspicious"
2324
UNKNOWN = "unknown"
2425
QUESTIONABLE = "questionable"
2526
QUALITY_INDICATOR = "quality_indicator"
@@ -33,6 +34,14 @@ class BackendType(str, Enum):
3334
QUALITY_INDICATOR = "quality_indicator"
3435

3536

37+
class EvidenceType(str, Enum):
38+
"""Types of evidence provided by backends for classification purposes."""
39+
40+
PREDATORY_LIST = "predatory_list" # Curated lists of predatory journals
41+
LEGITIMATE_LIST = "legitimate_list" # Curated lists of legitimate journals
42+
HEURISTIC = "heuristic" # Analysis-based assessment (retraction rates, etc.)
43+
44+
3645
class RiskLevel(str, Enum):
3746
"""Risk levels for retraction watch data."""
3847

src/aletheia_probe/models.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ class BackendResult(BaseModel):
3939
..., ge=0.0, le=1.0, description="Confidence score 0.0-1.0"
4040
)
4141
assessment: str | None = Field(
42-
None, description="predatory, legitimate, or unknown"
42+
None, description="predatory, legitimate, suspicious, or unknown"
4343
)
4444
data: dict[str, Any] = Field(
4545
default_factory=dict, description="Backend-specific raw data"
@@ -55,6 +55,10 @@ class BackendResult(BaseModel):
5555
execution_time_ms: float | None = Field(
5656
None, description="Backend execution time in milliseconds"
5757
)
58+
evidence_type: str | None = Field(
59+
None,
60+
description="Type of evidence: predatory_list, legitimate_list, or heuristic",
61+
)
5862

5963

6064
class JournalMetadata(BaseModel):
@@ -79,7 +83,7 @@ class AssessmentResult(BaseModel):
7983

8084
input_query: str = Field(..., description="Original query string")
8185
assessment: str = Field(
82-
..., description="predatory, legitimate, or insufficient_data"
86+
..., description="predatory, legitimate, suspicious, or insufficient_data"
8387
)
8488
confidence: float = Field(
8589
..., ge=0.0, le=1.0, description="Overall confidence score"
@@ -179,6 +183,9 @@ class BibtexAssessmentResult(BaseModel):
179183
insufficient_data_count: int = Field(
180184
0, description="Number of entries with insufficient data"
181185
)
186+
suspicious_count: int = Field(
187+
0, description="Number of entries with suspicious journals/conferences"
188+
)
182189
# Conference-specific counters
183190
conference_entries: int = Field(
184191
0,
@@ -188,12 +195,16 @@ class BibtexAssessmentResult(BaseModel):
188195
conference_legitimate: int = Field(
189196
0, description="Number of legitimate conferences"
190197
)
198+
conference_suspicious: int = Field(
199+
0, description="Number of suspicious conferences"
200+
)
191201
# Journal-specific counters
192202
journal_entries: int = Field(
193203
0, description="Number of journal entries (article, etc.)"
194204
)
195205
journal_predatory: int = Field(0, description="Number of predatory journals")
196206
journal_legitimate: int = Field(0, description="Number of legitimate journals")
207+
journal_suspicious: int = Field(0, description="Number of suspicious journals")
197208
has_predatory_journals: bool = Field(
198209
False, description="Whether any predatory journals/conferences were found"
199210
)

0 commit comments

Comments (0)