Skip to content
Closed
Show file tree
Hide file tree
Changes from 24 commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
794e23e
feat: add dependencies and skeletton for detection tools
JCHAVEROT May 5, 2026
d8503b1
feat: add gliner detection tool
JCHAVEROT May 5, 2026
11205df
feat: add openai-filter detection tool
JCHAVEROT May 5, 2026
ec9126f
feat: add presidio detection tool
JCHAVEROT May 5, 2026
9797115
feat: add first version of LLM with dspy as detection tool
JCHAVEROT May 5, 2026
8be4dc8
chores: update imported functions
JCHAVEROT May 5, 2026
58b8572
Merge remote-tracking branch 'me/feat/agent-framework-integration' in…
JCHAVEROT May 7, 2026
66b0c77
Merge branch 'feat/agent-framework-integration', remote-tracking bran…
JCHAVEROT May 12, 2026
f1060f9
Merge remote-tracking branch 'origin/master' into feat/agent-framewor…
JCHAVEROT May 12, 2026
5814294
Merge branch 'feat/agent-framework-integration' into feat/detection-t…
JCHAVEROT May 12, 2026
d175fd4
Merge remote-tracking branch 'me/feat/agent-framework-integration' in…
JCHAVEROT May 13, 2026
6a91ebb
refactor: create a file gathering default configurations
JCHAVEROT May 13, 2026
3eae01f
tests: add unit tests on detection engines to be later replaced
JCHAVEROT May 15, 2026
df24d81
refactor: extract DSPy llm so that it can be reused by sanitizer later
JCHAVEROT May 15, 2026
825ff24
refactor: extract prompts to clean llm_engine.py
JCHAVEROT May 15, 2026
53e384b
review: apply changes
JCHAVEROT May 18, 2026
a050c18
Merge branch 'feat/agent-framework-integration' into feat/detection-t…
JCHAVEROT May 26, 2026
fd784c6
Merge remote-tracking branch 'me/feat/agent-framework-integration' in…
JCHAVEROT May 27, 2026
39d6a58
Merge remote-tracking branch 'me/feat/agent-framework-integration' in…
JCHAVEROT May 27, 2026
ec8001e
review: fix llm_engine in case the same fragment appear multiple times
JCHAVEROT May 27, 2026
46bc5ab
review: fix openai_filter_engine as passed entity_types list should a…
JCHAVEROT May 27, 2026
fd49131
tests: update dict keys
JCHAVEROT Jun 3, 2026
4c34eab
feat: download spacy language model when missing
JCHAVEROT Jun 3, 2026
07848bb
Merge remote-tracking branch 'me/feat/agent-framework-integration' in…
JCHAVEROT Jun 3, 2026
f2ae78f
Merge remote-tracking branch 'me/feat/agent-framework-integration' in…
JCHAVEROT Jun 9, 2026
1d9c58e
review: fix types and other small changes
JCHAVEROT Jun 10, 2026
9e0b74d
fix: add device mapping for gliner engine
JCHAVEROT Jun 10, 2026
e5723a5
fix: corrections in tests, dependencies, and fix runtime errors
JCHAVEROT Jun 10, 2026
52f4e8c
feat: add a model registry to have efficient caching in privacy pipeline
JCHAVEROT Jun 10, 2026
a9b21bc
[Multi-Agent Privacy] Agent framework integration (#285)
JCHAVEROT Jun 11, 2026
8f16202
review: create an enum for the detection engine type
JCHAVEROT Jun 11, 2026
570b013
Various typing fixes (#320)
fabnemEPFL Jun 12, 2026
e3890d1
Bump version from 1.2.3 to 1.2.4
fabnemEPFL Jun 12, 2026
6281137
Fix torch import crash in Docker images (#322)
JCHAVEROT Jun 12, 2026
03f3426
RAG CLI: fix empty answers and improve UX (#323)
JCHAVEROT Jun 20, 2026
88ee637
Merge remote-tracking branch 'origin/master' into v2
JCHAVEROT Jun 22, 2026
42dbcdf
Merge remote-tracking branch 'origin/v2' into feat/detection-toolkit
JCHAVEROT Jun 23, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,17 @@ privacy = [
"mmore[llm]",
"langgraph>=1.0",
"langgraph-checkpoint-sqlite>=2.0",
"gliner",
"presidio-analyzer",
"presidio-anonymizer",
"spacy",
"dspy>=2.6",
]

privacy-openai-filter = [
"mmore[privacy]",
"transformers>=5",
"peft>=0.18",
]

# --- Composite + variant extras ---
Expand Down Expand Up @@ -173,6 +184,16 @@ conflicts = [
{ extra = "cpu" },
{ extra = "cu126" },
],
[
# privacy-openai-filter needs transformers>=5, which conflicts with
# marker-pdf in `process`/`all` (requires transformers<5)
{ extra = "privacy-openai-filter" },
{ extra = "process" },
],
[
{ extra = "privacy-openai-filter" },
{ extra = "all" },
],
]

override-dependencies = ["anthropic==0.73.0"]
Expand Down
22 changes: 22 additions & 0 deletions src/mmore/privacy/detection/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
"""PII detection engines exposed as agent tools."""
Comment thread
JCHAVEROT marked this conversation as resolved.

from .base import DetectionEngine, PIISpan
from .config import DetectionConfig
from .gliner_engine import GLiNEREngine, detect_pii_gliner
from .llm_engine import LLMDetectionEngine, detect_pii_llm
from .openai_filter_engine import OpenAIFilterEngine, detect_pii_openai_filter
from .presidio_engine import PresidioEngine, detect_pii_presidio

# Public API of the detection package: the base types, the config schema,
# each engine class, and the matching ``detect_pii_*`` function imported above.
__all__ = [
    "DetectionConfig",
    "DetectionEngine",
    "GLiNEREngine",
    "LLMDetectionEngine",
    "OpenAIFilterEngine",
    "PIISpan",
    "PresidioEngine",
    "detect_pii_gliner",
    "detect_pii_llm",
    "detect_pii_openai_filter",
    "detect_pii_presidio",
]
28 changes: 28 additions & 0 deletions src/mmore/privacy/detection/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
"""PII detection interface.

Each engine implements ``DetectionEngine.detect`` and returns a list of
``PIISpan`` records. Engines are independently registered as agent tools so a
sanitizer agent can resolve them by name from YAML.
"""

from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import List


@dataclass
class PIISpan:
    """One PII occurrence located inside a piece of text.

    Engines return a list of these records from ``detect``.
    """

    # Offset in the text where the occurrence begins.
    start: int
    # Offset in the text where the occurrence ends
    # (presumably exclusive, as with Python slices; confirm per engine).
    end: int
    # Entity type reported by the engine, e.g. "PERSON".
    label: str
    # Confidence reported by the engine for this occurrence.
    score: float


class DetectionEngine(ABC):
    """Common interface that every PII detection backend implements.

    Subclasses must override ``detect``; the class cannot be instantiated
    until they do.
    """

    @abstractmethod
    def detect(self, text: str) -> List[PIISpan]:
        """Find every PII span in ``text``.

        Args:
            text: The text to scan.

        Returns:
            One ``PIISpan`` per detected occurrence.
        """
17 changes: 17 additions & 0 deletions src/mmore/privacy/detection/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
"""Configuration dataclass for the PII detection toolkit."""

from dataclasses import dataclass, field
from typing import List, Optional

from ...rag.llm import LLMConfig
from .defaults import DEFAULT_CONFIDENCE_THRESHOLD


@dataclass
class DetectionConfig:
"""Schema for the ``privacy.detection`` block of a YAML config."""

engine: str

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the field engine seems to be used only in assertion tests so it sounds useless

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

unless it's meant to be saved

Copy link
Copy Markdown
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it is not used yet in this PR (it most likely will be in the PR where we wire the privacy layer into mmore's RAG pipeline), but I think we want to keep it: later, this parameter will let the user choose a specific detection engine (instead of falling back to the default one or having the Analyzer agent infer one for the task)

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

then it could make sense to have an enum with the supported detection engines

Copy link
Copy Markdown
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done in 8f16202

entity_types: List[str] = field(default_factory=list)
confidence_threshold: float = DEFAULT_CONFIDENCE_THRESHOLD
llm: Optional[LLMConfig] = None
55 changes: 55 additions & 0 deletions src/mmore/privacy/detection/defaults.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
"""Shared defaults for the PII detection engines."""

from ...rag.llm import LLMConfig

# Default language code; presumably consumed by the language-aware engines
# (e.g. Presidio/spaCy) — confirm against callers.
DEFAULT_LANGUAGE = "en"

# Default model identifiers, one per detection engine.
DEFAULT_GLINER_MODEL = "nvidia/gliner-PII"
DEFAULT_OPENAI_FILTER_MODEL = "openai/privacy-filter"
DEFAULT_PRESIDIO_SPACY_MODEL = "en_core_web_lg"

# Default LLM settings; presumably the fallback for the LLM engine when no
# ``llm`` block is configured — confirm against llm_engine.py.
DEFAULT_LLM_CONFIG = LLMConfig(
    llm_name="Qwen/Qwen2.5-3B-Instruct",
    max_new_tokens=512,
)

# Default for ``DetectionConfig.confidence_threshold`` and for the
# ``confidence_threshold`` argument of ``GLiNEREngine``.
DEFAULT_CONFIDENCE_THRESHOLD = 0.7

# Entity types used by ``GLiNEREngine`` when the caller supplies none.
# TODO: Later add new labels to the list
DEFAULT_LABELS = [
    "PERSON",
    "PHONE",
    "EMAIL",
    "MRN",
    "DATE",
    "LOCATION",
    "SSN",
    "INSURANCE_ID",
]

# Extra clinical recognizers described as plain data. Each entry has:
#   "entity":   the label the patterns report,
#   "patterns": 3-tuples — presumably (pattern name, regex, base score);
#               confirm against presidio_engine.py,
#   "context":  words that presumably raise the score when found near a
#               match — confirm against presidio_engine.py.
# TODO: Later add new patterns to the list
PRESIDIO_CLINICAL_PATTERNS = [
    {
        "entity": "MRN",
        "patterns": [
            ("mrn_with_prefix", r"\bMRN[\s:#]*\d{6,10}\b", 0.9),
            # NOTE(review): 0.4 is below DEFAULT_CONFIDENCE_THRESHOLD (0.7), so
            # a bare 8-digit match presumably only survives with a context
            # boost — confirm this is intended.
            ("mrn_bare_8_digits", r"\b\d{8}\b", 0.4),
        ],
        "context": ["mrn", "medical record", "record number", "patient id"],
    },
    {
        # NOTE(review): DEFAULT_LABELS lists "DATE", not "HOSPITAL_DATE" —
        # confirm the engine maps or filters this label as intended.
        "entity": "HOSPITAL_DATE",
        "patterns": [
            ("iso_date", r"\b\d{4}-\d{2}-\d{2}\b", 0.6),
            ("us_date", r"\b\d{1,2}/\d{1,2}/\d{4}\b", 0.6),
        ],
        "context": ["admission", "discharge", "appointment", "hospital", "clinic"],
    },
    {
        "entity": "INSURANCE_ID",
        "patterns": [
            ("insurance_alnum", r"\b[A-Z]{2,3}\d{6,12}\b", 0.7),
        ],
        "context": ["insurance", "policy", "member id", "subscriber"],
    },
]
109 changes: 109 additions & 0 deletions src/mmore/privacy/detection/gliner_engine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
"""GLiNER-based PII detection engine."""

import logging
import threading
from typing import Any, Dict, List, Optional, Sequence

from typing_extensions import Self

from ..agents.registry import register_tool
from .base import DetectionEngine, PIISpan
from .config import DetectionConfig
from .defaults import (
DEFAULT_CONFIDENCE_THRESHOLD,
DEFAULT_GLINER_MODEL,
DEFAULT_LABELS,
)

logger = logging.getLogger(__name__)

_model_cache: Dict[str, Any] = {}
Comment thread
fabnemEPFL marked this conversation as resolved.
Outdated
_model_cache_lock = threading.Lock()


def _load_gliner_model(model_name: str) -> Any:
    """Load the GLiNER model identified by ``model_name``.

    The ``gliner`` import lives inside the function, so the package is only
    imported when a model is actually loaded.
    """
    from gliner import GLiNER

    loaded = GLiNER.from_pretrained(model_name)
    return loaded


def _get_or_load_model(model_name: str) -> Any:
    """Return the cached model for ``model_name``, loading it on first use.

    The cache is checked once without the lock and, on a miss, checked again
    under ``_model_cache_lock`` so that concurrent callers asking for the same
    model trigger a single load. The lock stays held while the model loads.
    """
    model = _model_cache.get(model_name)
    if model is None:
        with _model_cache_lock:
            # Another thread may have filled the entry while we waited.
            model = _model_cache.get(model_name)
            if model is None:
                model = _load_gliner_model(model_name)
                _model_cache[model_name] = model
    return model


def clear_gliner_cache() -> None:
    """Empty the module-level GLiNER model cache.

    The cache lock is held while the entries are removed; the next model
    access loads the model again.
    """
    with _model_cache_lock:
        _model_cache.clear()


class GLiNEREngine(DetectionEngine):
    """Detect PII spans with a GLiNER model.

    Each instance carries its own ``entity_types`` and
    ``confidence_threshold``; models with the same ``model_name`` are shared
    through the module-level ``_model_cache``.
    """

    def __init__(
        self,
        model_name: str = DEFAULT_GLINER_MODEL,
        entity_types: Optional[Sequence[str]] = None,
        confidence_threshold: float = DEFAULT_CONFIDENCE_THRESHOLD,
    ):
        """Store the engine settings; no model is loaded here.

        Args:
            model_name: Identifier used to load the GLiNER model on first
                access to ``model``.
            entity_types: Labels to detect. ``None`` or an empty sequence
                falls back to ``DEFAULT_LABELS``.
            confidence_threshold: Value forwarded as ``threshold`` to the
                model's ``predict_entities``.
        """
        self._model_name = model_name
        # Always copy, so neither the caller's sequence nor DEFAULT_LABELS is
        # shared with this instance.
        self._entity_types: List[str] = (
            list(entity_types) if entity_types else list(DEFAULT_LABELS)
        )
        self._confidence_threshold = confidence_threshold

    @classmethod
    def from_config(cls, config: DetectionConfig) -> Self:
        """Build an engine from a ``DetectionConfig``.

        Only ``entity_types`` and ``confidence_threshold`` are read from the
        config; the model name stays at its default.
        """
        return cls(
            entity_types=config.entity_types or None,
            confidence_threshold=config.confidence_threshold,
        )

    @property
    def model(self) -> Any:
        """Return the GLiNER model, loading and caching it on first access."""
        return _get_or_load_model(self._model_name)

    def detect(self, text: str) -> List[PIISpan]:
        """Return the PII spans the GLiNER model finds in ``text``.

        Empty or whitespace-only text yields an empty list without touching
        the model.
        """
        # Nothing to detect: skip the (potentially slow) model load and
        # inference entirely.
        if not text or not text.strip():
            return []

        raw = self.model.predict_entities(
            text=text,
            labels=self._entity_types,
            threshold=self._confidence_threshold,
            multi_label=False,
        )
        # Normalise the model's dict records into typed PIISpan objects.
        return [
            PIISpan(
                start=int(r["start"]),
                end=int(r["end"]),
                label=str(r["label"]),
                score=float(r["score"]),
            )
            for r in raw
        ]


@register_tool("detect_pii_gliner")
def detect_pii_gliner(text: str) -> List[PIISpan]:
    """Run a default-configured ``GLiNEREngine`` over ``text``.

    A new engine object is built on every call, with the default model,
    labels and threshold.

    Agents that need per-config behavior should be wired by setup code that
    builds its own engine and registers that engine's ``detect`` under a
    distinct tool name, e.g.::

        engine = GLiNEREngine.from_config(detection_cfg)
        register_tool("detect_pii_gliner_custom")(engine.detect)
    """
    default_engine = GLiNEREngine()
    return default_engine.detect(text)
Loading
Loading