Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
84a594e
feat: add human review and sampling pages for evaluation process
RonShakutai Mar 7, 2026
f214416
feat: add default values for presidioEntities and llmEntities in Enti…
RonShakutai Mar 8, 2026
0054d54
Feature/evaluation - Dataset Interface (#1893)
RonShakutai Mar 8, 2026
e9ee97c
refactor: clean up imports and improve code formatting across multipl…
RonShakutai Mar 8, 2026
a1936fd
Ronshakutai/feature/evaluation sampling (#1894)
RonShakutai Mar 8, 2026
877f9a4
ruff fixes
RonShakutai Mar 8, 2026
de286e1
Feature/evaluation - LLM AS A JUDGE INTEGRATION (#1900)
RonShakutai Mar 9, 2026
e48284f
Feedback adjustments
RonShakutai Mar 9, 2026
244b2ae
refactor: update entity handling and UI components for improved datas…
RonShakutai Mar 10, 2026
82639bd
refactor: update dataset identifiers and enhance entity status displa…
RonShakutai Mar 10, 2026
be7293e
refactor: enhance dataset handling and UI components for improved ent…
RonShakutai Mar 10, 2026
b32af0d
Presidio default running
RonShakutai Mar 10, 2026
7329a6c
refactor: update presidio-analyzer dependency to include transformers…
RonShakutai Mar 10, 2026
57e6370
fix bugs
RonShakutai Mar 10, 2026
bb4a71a
feat: add API utility functions and mock data for evaluation runs and…
RonShakutai Mar 11, 2026
daaa5a4
feat: enhance save_final_entities to include Presidio analysis result…
RonShakutai Mar 11, 2026
6d5f129
Omri's feedback & Coby PRD alignment
RonShakutai Mar 11, 2026
0089d88
refactor: remove duplicate dataset entry and adjust button placement …
RonShakutai Mar 11, 2026
252445a
refactor: remove Presidio configuration notice from Setup component
RonShakutai Mar 11, 2026
7a93e30
refactor: remove outdated comments and unused recognizer from custom_…
RonShakutai Mar 11, 2026
b5d6fa8
comparing configs
RonShakutai Mar 11, 2026
8be74aa
bug fixes, feedback support, and Irina's suggestions
RonShakutai Mar 13, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions evaluation/ai-assistant/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Override root gitignore's lib/ rule for src/app/lib/
!src/app/lib/

# Dependencies
node_modules/

# Build output
dist/

# Environment / IDE
.env
.env.*
.vscode/
.idea/

# OS
.DS_Store
Thumbs.db

# Python backend
backend/.venv/
backend/__pycache__/
backend/**/__pycache__/

# Persisted dataset copies
backend/data/

*.pyc
*.pyo

# Vite
*.local
19 changes: 19 additions & 0 deletions evaluation/ai-assistant/backend/datasets.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
[
{
"id": "example-dataset",
"filename": "example_pii_dataset.csv",
"name": "Example Dataset",
"description": "3 synthetic PII records covering healthcare, finance, and lab-result scenarios.",
"path": "data/example_pii_dataset.csv",
"stored_path": "data/Example_Dataset_example-dataset.csv",
"format": "csv",
"record_count": 3,
"has_entities": false,
"has_final_entities": true,
"ran_configs": [
"default_spacy"
],
"text_column": "text",
"entities_column": null
}
]
97 changes: 97 additions & 0 deletions evaluation/ai-assistant/backend/llm_service.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
"""LLM Judge service using Azure OpenAI via LangExtract."""

from __future__ import annotations

import logging
from typing import Optional

from models import Entity

logger = logging.getLogger(__name__)

# Lazy-loaded recognizer singleton
_recognizer = None


class LLMServiceError(Exception):
    """Base error for any failure raised by the LLM judge service."""


def configure(
    azure_endpoint: str,
    api_key: Optional[str] = None,
    deployment_name: str = "gpt-4o",
    api_version: str = "2024-02-15-preview",
) -> dict:
    """Create the module-level Azure OpenAI LangExtract recognizer.

    :param azure_endpoint: Azure OpenAI endpoint URL.
    :param api_key: API key (or None for managed identity).
    :param deployment_name: Azure deployment / model name.
    :param api_version: Azure OpenAI API version.
    :returns: Status dict.
    :raises LLMServiceError: if the optional dependencies are missing or
        recognizer construction fails.
    """
    global _recognizer

    # Import lazily so the rest of the service works without the optional deps.
    try:
        from presidio_analyzer.predefined_recognizers.third_party.azure_openai_langextract_recognizer import (  # noqa: E501
            AzureOpenAILangExtractRecognizer,
        )
    except ImportError as err:
        raise LLMServiceError(
            "langextract or presidio-analyzer is not installed. "
            "Run: pip install langextract presidio-analyzer"
        ) from err

    try:
        recognizer = AzureOpenAILangExtractRecognizer(
            model_id=deployment_name,
            azure_endpoint=azure_endpoint,
            api_key=api_key,
            api_version=api_version,
        )
    except Exception as err:
        # A failed reconfigure also drops any previously working recognizer.
        _recognizer = None
        raise LLMServiceError(f"Failed to initialise recognizer: {err}") from err

    _recognizer = recognizer
    logger.info(
        "LLM Judge configured: endpoint=%s deployment=%s",
        azure_endpoint,
        deployment_name,
    )
    return {"status": "configured", "deployment": deployment_name}


def is_configured() -> bool:
    """Report whether configure() has successfully built a recognizer."""
    ready = _recognizer is not None
    return ready


def disconnect() -> None:
    """Drop the active recognizer; configure() must run again before use."""
    global _recognizer
    # Discard the singleton so is_configured() reports False again.
    _recognizer = None
    logger.info("LLM Judge disconnected")


def analyze_text(text: str) -> list[Entity]:
    """Run the LLM recognizer on a single text and return Entity objects.

    :param text: Raw input text to analyze.
    :returns: One Entity per recognizer result, with the matched substring
        sliced from *text* and the score rounded to 4 decimal places.
    :raises LLMServiceError: if configure() has not been called successfully.
    """
    if _recognizer is None:
        raise LLMServiceError(
            "LLM service not configured. Call /api/llm/configure first."
        )

    # entities=None requests every entity type the recognizer supports.
    results = _recognizer.analyze(text=text, entities=None)

    return [
        Entity(
            text=text[r.start:r.end],
            entity_type=r.entity_type,
            start=r.start,
            end=r.end,
            score=round(r.score, 4),
        )
        for r in results
    ]
44 changes: 44 additions & 0 deletions evaluation/ai-assistant/backend/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import logging

# Configure root logging before any router/service module is imported so
# they all inherit the INFO level.
logging.basicConfig(level=logging.INFO)

# Eagerly import presidio_analyzer so the module is fully initialised before
# concurrent requests trigger lazy imports from different threads (which would
# hit a circular-import race in the presidio_analyzer package).
try:
import presidio_analyzer # noqa: F401
except ImportError:
pass # Optional dependency – endpoints will return clear errors if missing

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from routers import (
decision,
evaluation,
llm,
presidio_service,
review,
upload,
)

# FastAPI application object; served behind CORS for the local dev frontend.
app = FastAPI(title="Presidio Evaluation Flow API", version="0.1.0")

# Allow the Vite dev server (default port 5173) to call this API from the
# browser.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["http://localhost:5173"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Feature routers: dataset upload, human review, evaluation runs,
# decision endpoints, LLM judge, and Presidio analysis.
app.include_router(upload.router)
app.include_router(review.router)
app.include_router(evaluation.router)
app.include_router(decision.router)
app.include_router(llm.router)
app.include_router(presidio_service.router)


@app.get("/api/health")
async def health():
    """Liveness probe: always reports the service as healthy."""
    payload = {"status": "ok"}
    return payload
113 changes: 113 additions & 0 deletions evaluation/ai-assistant/backend/mock_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
"""Mock data for evaluation / decision stages only."""

from datetime import datetime

from models import (
Entity,
EntityMiss,
EvaluationMetrics,
EvaluationRun,
MissType,
RiskLevel,
)

# Three mock runs showing an improving tuning trajectory (baseline -> v1.2).
# Counts are internally consistent: TP + FP + FN + TN == sample_size, and
# precision / recall / f1 match the confusion-matrix counts (2 dp rounding).
# NOTE(review): timestamps are naive datetimes — presumably local time;
# confirm before comparing across timezones.
EVALUATION_RUNS: list[EvaluationRun] = [
    EvaluationRun(
        id="run-001",
        timestamp=datetime(2025, 2, 20, 10, 30),
        sample_size=500,
        config_version="baseline-v1.0",
        metrics=EvaluationMetrics(
            precision=0.87,
            recall=0.73,
            f1_score=0.79,
            true_positives=245,
            false_positives=36,
            false_negatives=91,
            true_negatives=128,
        ),
    ),
    EvaluationRun(
        id="run-002",
        timestamp=datetime(2025, 2, 22, 14, 15),
        sample_size=500,
        config_version="tuned-v1.1",
        metrics=EvaluationMetrics(
            precision=0.91,
            recall=0.81,
            f1_score=0.86,
            true_positives=272,
            false_positives=27,
            false_negatives=64,
            true_negatives=137,
        ),
    ),
    EvaluationRun(
        id="run-003",
        timestamp=datetime(2025, 2, 25, 9, 0),
        sample_size=500,
        config_version="tuned-v1.2",
        metrics=EvaluationMetrics(
            precision=0.94,
            recall=0.88,
            f1_score=0.91,
            true_positives=296,
            false_positives=19,
            false_negatives=40,
            true_negatives=145,
        ),
    ),
]

# Representative misses for the decision-stage UI. Each start/end pair is the
# half-open character span of the missed entity inside record_text, i.e.
# record_text[start:end] == missed_entity.text.
# Fixed: rec-002 and rec-005 spans were off by one (56-68 -> 55-67 for
# "POL-8821-USA"; 97-105 -> 96-104 for "diabetes").
ENTITY_MISSES: list[EntityMiss] = [
    EntityMiss(
        record_id="rec-004",
        record_text=(
            "Credit card ending in 4532 was used for "
            "transaction. Customer: alice.wong@company.com."
        ),
        missed_entity=Entity(
            text="4532", entity_type="CREDIT_CARD", start=22, end=26, score=0.65
        ),
        miss_type=MissType.false_negative,
        entity_type="CREDIT_CARD",
        risk_level=RiskLevel.high,
    ),
    EntityMiss(
        record_id="rec-002",
        record_text=(
            "Dr. Sarah Johnson reviewed the case. "
            "Insurance Policy: POL-8821-USA."
        ),
        missed_entity=Entity(
            text="POL-8821-USA", entity_type="INSURANCE_POLICY", start=55, end=67
        ),
        miss_type=MissType.false_negative,
        entity_type="INSURANCE_POLICY",
        risk_level=RiskLevel.medium,
    ),
    EntityMiss(
        record_id="rec-005",
        record_text=(
            "Prescription for Robert Chen: Medication ABC-123, dosage 50mg. "
            "Doctor notes indicate history of diabetes."
        ),
        missed_entity=Entity(
            text="diabetes", entity_type="MEDICAL_CONDITION", start=96, end=104
        ),
        miss_type=MissType.false_negative,
        entity_type="MEDICAL_CONDITION",
        risk_level=RiskLevel.high,
    ),
    EntityMiss(
        record_id="rec-003",
        record_text=(
            "Employee ID: EMP-8821, Jane Doe, "
            "started 2023-06-01. Salary: $85,000."
        ),
        missed_entity=Entity(text="$85,000", entity_type="SALARY", start=61, end=68),
        miss_type=MissType.false_negative,
        entity_type="SALARY",
        risk_level=RiskLevel.medium,
    ),
]
Loading
Loading