Skip to content

Commit cb23439

Browse files
authored
Merge pull request #24 from Imaging-Plaza/bug/affiliation-hallucination
bug solved
2 parents 999d59a + 404bc59 commit cb23439

6 files changed

Lines changed: 71 additions & 84 deletions

File tree

src/agents/user_prompts.py

Lines changed: 28 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -132,39 +132,46 @@ def get_general_user_agent_prompt(username: str, user_data: str):
132132
User Profile Data:
133133
{user_data}
134134
135-
CRITICAL: The README content contains explicit position information. Look for this exact text:
136-
"Currently, I am working as a **Data Engineer** at the **Swiss Data Science Center** at **EPFL**."
137-
138135
Please provide a detailed analysis in JSON format with the following fields:
139-
- "relatedToOrganization": List of organizations the user is affiliated with (e.g., ["EPFL", "Swiss Data Science Center", "ETH Zürich"])
140-
- "relatedToOrganizationJustification": List of justifications for each organization (e.g., ["Works at SDSC which is jointly established by EPFL and ETH Zürich", "Profile shows @epfl.ch email"])
141-
- "discipline": List of scientific disciplines (e.g., ["Biology", "Computer Science", "Data Science"])
136+
- "relatedToOrganization": List of organizations the user is affiliated with
137+
- "relatedToOrganizationJustification": List of justifications for each organization
138+
- "discipline": List of scientific disciplines
142139
- "disciplineJustification": List of justifications for each discipline
143-
- "position": List of professional positions/roles (e.g., ["Data Engineer", "Research Scientist", "Software Developer"])
140+
- "position": List of professional positions/roles
144141
- "positionJustification": List of justifications for each position
145142
146-
IMPORTANT: Extract organization and position information from ALL available sources:
143+
IMPORTANT: Extract organization and position information ONLY from the actual data provided:
147144
- Company field: "{user_data.get('company', 'N/A')}"
148145
- Bio content: "{user_data.get('bio', 'N/A')}"
149146
- README content: "{user_data.get('readme_content', 'N/A')[:500]}..." (truncated)
150147
- Organization affiliations: {user_data.get('organizations', [])}
151148
- ORCID activities: {user_data.get('orcid_activities', 'N/A')}
152149
153-
Look for phrases like:
154-
- "I am working as a [POSITION]"
155-
- "Currently working as [POSITION]"
156-
- "Data Engineer", "Research Scientist", "Software Developer", etc.
157-
- Job titles in README content
158-
- Current employment status
159-
160-
The README explicitly states: "Currently, I am working as a **Data Engineer**" - this should be extracted as position: ["Data Engineer"]
161-
162-
ORGANIZATION EXTRACTION RULES:
150+
EXTRACTION GUIDELINES:
151+
152+
**For Positions:**
153+
- Look for explicit statements about current or past roles in the bio, company field, or README
154+
- Look for phrases like "I am working as", "Currently working as", "Software Engineer at", etc.
155+
- ONLY extract positions that are EXPLICITLY mentioned in the data
156+
- DO NOT infer or assume positions that are not stated
157+
158+
**For Organizations:**
163159
- Look for company/employer information in the bio, company field, and README
164160
- Check GitHub organizations the user is a member of (institutions, universities, companies)
165-
- Include both primary organizations (e.g., "EPFL") and sub-units (e.g., "Swiss Data Science Center")
166-
- For each organization, provide a clear justification explaining the evidence
167-
- Add EPFL to the list if the user is affiliated with any EPFL lab, center, or has @epfl.ch email
161+
- Include both primary organizations (e.g., "EPFL") and sub-units (e.g., "Swiss Data Science Center") ONLY if mentioned
162+
- Add EPFL to the list ONLY if the user explicitly mentions affiliation with an EPFL lab/center or has @epfl.ch email
163+
- DO NOT add organizations that are not explicitly mentioned or clearly indicated
164+
165+
**For Disciplines:**
166+
- Infer from the user's bio, projects, repositories, and stated roles
167+
- Base on technical skills, research areas, or explicit statements
168+
169+
**Critical Rules:**
170+
- For each field, provide a clear justification that quotes or references the actual source data
171+
- If a field cannot be determined from the available data, return an empty list []
172+
- DO NOT hallucinate or fabricate information
173+
- DO NOT use example data as if it were real
174+
- ONLY extract information that is present in the provided user data
168175
169176
Return valid JSON only with all SIX fields populated (relatedToOrganization, relatedToOrganizationJustification, discipline, disciplineJustification, position, positionJustification).
170177
"""

src/analysis/organization.py

Lines changed: 3 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -315,28 +315,9 @@ async def run_organization_enrichment(self):
315315
# organization_enrichment is an OrganizationEnrichmentResult, not a dict
316316
enriched_orgs = organization_enrichment.organizations # Direct attribute access
317317

318-
# Safely handle relatedToOrganization list
319-
related_orgs = getattr(self.data, "relatedToOrganization", None)
320-
if related_orgs is None:
321-
related_orgs = []
322-
self.data.relatedToOrganization = related_orgs
323-
for org in enriched_orgs:
324-
legal_name = (
325-
org.legalName
326-
) # Direct attribute access, org is already Organization
327-
if legal_name:
328-
related_orgs.append(legal_name)
329-
330-
# Merge enriched organizations into relatedToOrganization
331-
# Combine string names and Organization objects
332-
current_orgs = getattr(self.data, "relatedToOrganization", None) or []
333-
if not isinstance(current_orgs, list):
334-
current_orgs = []
335-
336-
# Add enriched Organization objects
337-
combined_orgs = list(current_orgs) # Copy existing
338-
combined_orgs.extend(enriched_orgs) # Add Organization objects
339-
self.data.relatedToOrganization = combined_orgs
318+
# Replace relatedToOrganization with enriched Organization objects only
319+
# Don't add both org name strings and Organization objects - just objects
320+
self.data.relatedToOrganization = list(enriched_orgs)
340321

341322
# For organization profiles, preserve LLM's EPFL assessment (which has full context)
342323
# Only update EPFL values if they weren't set by LLM analysis

src/analysis/repositories.py

Lines changed: 3 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -160,22 +160,9 @@ async def run_organization_enrichment(self):
160160
organization_enrichment.organizations
161161
) # Direct attribute access
162162

163-
# Replace (not append) organization lists with enriched versions
164-
# Build list of organization names for relatedToOrganizations
165-
related_orgs = []
166-
for org in enriched_orgs:
167-
legal_name = org.legalName
168-
if legal_name:
169-
related_orgs.append(legal_name)
170-
171-
# Merge enriched organizations into relatedToOrganizations
172-
# Combine string names and Organization objects
173-
combined_orgs = []
174-
if related_orgs:
175-
combined_orgs.extend(related_orgs)
176-
if enriched_orgs:
177-
combined_orgs.extend(enriched_orgs)
178-
self.data.relatedToOrganizations = combined_orgs if combined_orgs else None
163+
# Replace relatedToOrganizations with enriched Organization objects only
164+
# Don't add both org name strings and Organization objects - just objects
165+
self.data.relatedToOrganizations = list(enriched_orgs) if enriched_orgs else None
179166

180167
# These values are overwritten only if provided by the enrichment
181168
if organization_enrichment.relatedToEPFL is not None:

src/analysis/user.py

Lines changed: 8 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,10 @@ def run_github_parsing(self):
5353
githubHandle=user_data_dict.get("login"),
5454
githubUserMetadata=github_metadata,
5555
# Enrichment fields (will be populated by analysis steps)
56-
relatedToOrganization=user_data_dict.get("organizations", []) or [],
56+
# NOTE: Don't pre-populate relatedToOrganization with GitHub orgs here
57+
# Let LLM analysis extract them from all sources (bio, README, GitHub orgs, etc.)
58+
# This prevents duplication when enrichment adds Organization objects later
59+
relatedToOrganization=[],
5760
relatedToOrganizationJustification=[],
5861
discipline=[],
5962
disciplineJustification=[],
@@ -238,29 +241,10 @@ async def run_organization_enrichment(self):
238241
# organization_enrichment is an OrganizationEnrichmentResult, not a dict
239242
enriched_orgs = organization_enrichment.organizations # Direct attribute access
240243

241-
# Safely handle relatedToOrganizations list
242-
related_orgs = getattr(self.data, "relatedToOrganization", None)
243-
if related_orgs is None:
244-
related_orgs = []
245-
self.data.relatedToOrganization = related_orgs
246-
for org in enriched_orgs:
247-
legal_name = (
248-
org.legalName
249-
) # Direct attribute access, org is already Organization
250-
if legal_name:
251-
related_orgs.append(legal_name)
252-
253-
# Safely handle relatedToOrganizationsROR list
254-
# Merge enriched organizations into relatedToOrganization
255-
# Combine string names and Organization objects
256-
current_orgs = getattr(self.data, "relatedToOrganization", None) or []
257-
if not isinstance(current_orgs, list):
258-
current_orgs = []
259-
260-
# Add enriched Organization objects
261-
combined_orgs = list(current_orgs) # Copy existing
262-
combined_orgs.extend(enriched_orgs) # Add Organization objects
263-
self.data.relatedToOrganization = combined_orgs
244+
# Replace the relatedToOrganization list with enriched Organization objects
245+
# This prevents duplication - we don't add both strings and objects
246+
# The LLM analysis already populated org name strings, now we replace them with full objects
247+
self.data.relatedToOrganization = list(enriched_orgs)
264248

265249
# For user profiles, preserve any existing EPFL assessment
266250
# Only update EPFL values if they weren't already set

src/data_models/__init__.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -232,3 +232,31 @@
232232
delattr(_orgllm_module, "Optional")
233233
delattr(_orgllm_module, "Union")
234234
delattr(_orgllm_module, "Any")
235+
236+
# Rebuild AcademicCatalogRelation to resolve List typing import
237+
_catalog_relation_module = sys.modules[AcademicCatalogRelation.__module__]
238+
_catalog_relation_module.List = List
239+
_catalog_relation_module.Dict = Dict
240+
_catalog_relation_module.Optional = Optional
241+
_catalog_relation_module.Union = Union
242+
_catalog_relation_module.Any = Any
243+
AcademicCatalogRelation.model_rebuild()
244+
delattr(_catalog_relation_module, "List")
245+
delattr(_catalog_relation_module, "Dict")
246+
delattr(_catalog_relation_module, "Optional")
247+
delattr(_catalog_relation_module, "Union")
248+
delattr(_catalog_relation_module, "Any")
249+
250+
# Rebuild AcademicCatalogEnrichmentResult to resolve List and other typing imports
251+
_catalog_module = sys.modules[AcademicCatalogEnrichmentResult.__module__]
252+
_catalog_module.List = List
253+
_catalog_module.Dict = Dict
254+
_catalog_module.Optional = Optional
255+
_catalog_module.Union = Union
256+
_catalog_module.Any = Any
257+
AcademicCatalogEnrichmentResult.model_rebuild()
258+
delattr(_catalog_module, "List")
259+
delattr(_catalog_module, "Dict")
260+
delattr(_catalog_module, "Optional")
261+
delattr(_catalog_module, "Union")
262+
delattr(_catalog_module, "Any")

src/data_models/academic_catalog.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from __future__ import annotations
99

1010
from enum import Enum
11-
from typing import Any, Optional, Union
11+
from typing import Any, List, Optional, Union
1212

1313
from pydantic import BaseModel, Field
1414

0 commit comments

Comments
 (0)