Merge pull request #24 from Imaging-Plaza/bug/affiliation-hallucination

caviri · web-flow · commit cb23439e35ab · 2025-11-12T14:54:12.000+01:00
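Fix affiliation hallucination: remove hard-coded example data from the user prompt and stop duplicating organizations during enrichment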
diff --git a/src/agents/user_prompts.py b/src/agents/user_prompts.py
@@ -132,39 +132,46 @@ def get_general_user_agent_prompt(username: str, user_data: str):
     User Profile Data:
     {user_data}
 
-    CRITICAL: The README content contains explicit position information. Look for this exact text:
-    "Currently, I am working as a **Data Engineer** at the **Swiss Data Science Center** at **EPFL**."
-
     Please provide a detailed analysis in JSON format with the following fields:
-    - "relatedToOrganization": List of organizations the user is affiliated with (e.g., ["EPFL", "Swiss Data Science Center", "ETH Zürich"])
-    - "relatedToOrganizationJustification": List of justifications for each organization (e.g., ["Works at SDSC which is jointly established by EPFL and ETH Zürich", "Profile shows @epfl.ch email"])
-    - "discipline": List of scientific disciplines (e.g., ["Biology", "Computer Science", "Data Science"])
+    - "relatedToOrganization": List of organizations the user is affiliated with
+    - "relatedToOrganizationJustification": List of justifications for each organization
+    - "discipline": List of scientific disciplines
     - "disciplineJustification": List of justifications for each discipline
-    - "position": List of professional positions/roles (e.g., ["Data Engineer", "Research Scientist", "Software Developer"])
+    - "position": List of professional positions/roles
     - "positionJustification": List of justifications for each position
 
-    IMPORTANT: Extract organization and position information from ALL available sources:
+    IMPORTANT: Extract organization and position information ONLY from the actual data provided:
     - Company field: "{user_data.get('company', 'N/A')}"
     - Bio content: "{user_data.get('bio', 'N/A')}"
     - README content: "{user_data.get('readme_content', 'N/A')[:500]}..." (truncated)
     - Organization affiliations: {user_data.get('organizations', [])}
     - ORCID activities: {user_data.get('orcid_activities', 'N/A')}
 
-    Look for phrases like:
-    - "I am working as a [POSITION]"
-    - "Currently working as [POSITION]"
-    - "Data Engineer", "Research Scientist", "Software Developer", etc.
-    - Job titles in README content
-    - Current employment status
-
-    The README explicitly states: "Currently, I am working as a **Data Engineer**" - this should be extracted as position: ["Data Engineer"]
-
-    ORGANIZATION EXTRACTION RULES:
+    EXTRACTION GUIDELINES:
+    
+    **For Positions:**
+    - Look for explicit statements about current or past roles in the bio, company field, or README
+    - Look for phrases like "I am working as", "Currently working as", "Software Engineer at", etc.
+    - ONLY extract positions that are EXPLICITLY mentioned in the data
+    - DO NOT infer or assume positions that are not stated
+    
+    **For Organizations:**
     - Look for company/employer information in the bio, company field, and README
     - Check GitHub organizations the user is a member of (institutions, universities, companies)
-    - Include both primary organizations (e.g., "EPFL") and sub-units (e.g., "Swiss Data Science Center")
-    - For each organization, provide a clear justification explaining the evidence
-    - Add EPFL to the list if the user is affiliated with any EPFL lab, center, or has @epfl.ch email
+    - Include both primary organizations (e.g., "EPFL") and sub-units (e.g., "Swiss Data Science Center") ONLY if mentioned
+    - Add EPFL to the list ONLY if the user explicitly mentions affiliation with an EPFL lab/center or has @epfl.ch email
+    - DO NOT add organizations that are not explicitly mentioned or clearly indicated
+    
+    **For Disciplines:**
+    - Infer from the user's bio, projects, repositories, and stated roles
+    - Base on technical skills, research areas, or explicit statements
+    
+    **Critical Rules:**
+    - For each field, provide a clear justification that quotes or references the actual source data
+    - If a field cannot be determined from the available data, return an empty list []
+    - DO NOT hallucinate or fabricate information
+    - DO NOT use example data as if it were real
+    - ONLY extract information that is present in the provided user data
 
     Return valid JSON only with all SIX fields populated (relatedToOrganization, relatedToOrganizationJustification, discipline, disciplineJustification, position, positionJustification).
     """
diff --git a/src/analysis/organization.py b/src/analysis/organization.py
@@ -315,28 +315,9 @@ async def run_organization_enrichment(self):
         # organization_enrichment is an OrganizationEnrichmentResult, not a dict
         enriched_orgs = organization_enrichment.organizations  # Direct attribute access
 
-        # Safely handle relatedToOrganization list
-        related_orgs = getattr(self.data, "relatedToOrganization", None)
-        if related_orgs is None:
-            related_orgs = []
-            self.data.relatedToOrganization = related_orgs
-        for org in enriched_orgs:
-            legal_name = (
-                org.legalName
-            )  # Direct attribute access, org is already Organization
-            if legal_name:
-                related_orgs.append(legal_name)
-
-        # Merge enriched organizations into relatedToOrganization
-        # Combine string names and Organization objects
-        current_orgs = getattr(self.data, "relatedToOrganization", None) or []
-        if not isinstance(current_orgs, list):
-            current_orgs = []
-        
-        # Add enriched Organization objects
-        combined_orgs = list(current_orgs)  # Copy existing
-        combined_orgs.extend(enriched_orgs)  # Add Organization objects
-        self.data.relatedToOrganization = combined_orgs
+        # Replace relatedToOrganization with enriched Organization objects only
+        # Don't add both org name strings and Organization objects - just objects
+        self.data.relatedToOrganization = list(enriched_orgs)
 
         # For organization profiles, preserve LLM's EPFL assessment (which has full context)
         # Only update EPFL values if they weren't set by LLM analysis
diff --git a/src/analysis/repositories.py b/src/analysis/repositories.py
@@ -160,22 +160,9 @@ async def run_organization_enrichment(self):
                 organization_enrichment.organizations
             )  # Direct attribute access
 
-            # Replace (not append) organization lists with enriched versions
-            # Build list of organization names for relatedToOrganizations
-            related_orgs = []
-            for org in enriched_orgs:
-                legal_name = org.legalName
-                if legal_name:
-                    related_orgs.append(legal_name)
-            
-            # Merge enriched organizations into relatedToOrganizations
-            # Combine string names and Organization objects
-            combined_orgs = []
-            if related_orgs:
-                combined_orgs.extend(related_orgs)
-            if enriched_orgs:
-                combined_orgs.extend(enriched_orgs)
-            self.data.relatedToOrganizations = combined_orgs if combined_orgs else None
+            # Replace relatedToOrganizations with enriched Organization objects only
+            # Don't add both org name strings and Organization objects - just objects
+            self.data.relatedToOrganizations = list(enriched_orgs) if enriched_orgs else None
 
             # These values are overwritten only if provided by the enrichment
             if organization_enrichment.relatedToEPFL is not None:
diff --git a/src/analysis/user.py b/src/analysis/user.py
@@ -53,7 +53,10 @@ def run_github_parsing(self):
             githubHandle=user_data_dict.get("login"),
             githubUserMetadata=github_metadata,
             # Enrichment fields (will be populated by analysis steps)
-            relatedToOrganization=user_data_dict.get("organizations", []) or [],
+            # NOTE: Don't pre-populate relatedToOrganization with GitHub orgs here
+            # Let LLM analysis extract them from all sources (bio, README, GitHub orgs, etc.)
+            # This prevents duplication when enrichment adds Organization objects later
+            relatedToOrganization=[],
             relatedToOrganizationJustification=[],
             discipline=[],
             disciplineJustification=[],
@@ -238,29 +241,10 @@ async def run_organization_enrichment(self):
         # organization_enrichment is an OrganizationEnrichmentResult, not a dict
         enriched_orgs = organization_enrichment.organizations  # Direct attribute access
 
-        # Safely handle relatedToOrganizations list
-        related_orgs = getattr(self.data, "relatedToOrganization", None)
-        if related_orgs is None:
-            related_orgs = []
-            self.data.relatedToOrganization = related_orgs
-        for org in enriched_orgs:
-            legal_name = (
-                org.legalName
-            )  # Direct attribute access, org is already Organization
-            if legal_name:
-                related_orgs.append(legal_name)
-
-        # Safely handle relatedToOrganizationsROR list
-        # Merge enriched organizations into relatedToOrganization
-        # Combine string names and Organization objects
-        current_orgs = getattr(self.data, "relatedToOrganization", None) or []
-        if not isinstance(current_orgs, list):
-            current_orgs = []
-        
-        # Add enriched Organization objects
-        combined_orgs = list(current_orgs)  # Copy existing
-        combined_orgs.extend(enriched_orgs)  # Add Organization objects
-        self.data.relatedToOrganization = combined_orgs
+        # Replace the relatedToOrganization list with enriched Organization objects
+        # This prevents duplication - we don't add both strings and objects
+        # The LLM analysis already populated org name strings; now we replace them with full objects
+        self.data.relatedToOrganization = list(enriched_orgs)
 
         # For user profiles, preserve any existing EPFL assessment
         # Only update EPFL values if they weren't already set
diff --git a/src/data_models/__init__.py b/src/data_models/__init__.py
@@ -232,3 +232,31 @@
 delattr(_orgllm_module, "Optional")
 delattr(_orgllm_module, "Union")
 delattr(_orgllm_module, "Any")
+
+# Rebuild AcademicCatalogRelation to resolve List typing import
+_catalog_relation_module = sys.modules[AcademicCatalogRelation.__module__]
+_catalog_relation_module.List = List
+_catalog_relation_module.Dict = Dict
+_catalog_relation_module.Optional = Optional
+_catalog_relation_module.Union = Union
+_catalog_relation_module.Any = Any
+AcademicCatalogRelation.model_rebuild()
+delattr(_catalog_relation_module, "List")
+delattr(_catalog_relation_module, "Dict")
+delattr(_catalog_relation_module, "Optional")
+delattr(_catalog_relation_module, "Union")
+delattr(_catalog_relation_module, "Any")
+
+# Rebuild AcademicCatalogEnrichmentResult to resolve List and other typing imports
+_catalog_module = sys.modules[AcademicCatalogEnrichmentResult.__module__]
+_catalog_module.List = List
+_catalog_module.Dict = Dict
+_catalog_module.Optional = Optional
+_catalog_module.Union = Union
+_catalog_module.Any = Any
+AcademicCatalogEnrichmentResult.model_rebuild()
+delattr(_catalog_module, "List")
+delattr(_catalog_module, "Dict")
+delattr(_catalog_module, "Optional")
+delattr(_catalog_module, "Union")
+delattr(_catalog_module, "Any")
diff --git a/src/data_models/academic_catalog.py b/src/data_models/academic_catalog.py
@@ -8,7 +8,7 @@
 from __future__ import annotations
 
 from enum import Enum
-from typing import Any, Optional, Union
+from typing import Any, List, Optional, Union
 
 from pydantic import BaseModel, Field