@@ -215,3 +215,10 @@ recognizers:
    type: predefined
    enabled: false
    config_path: presidio-analyzer/presidio_analyzer/conf/langextract_config_ollama.yaml
Collaborator:
Can we remove OllamaLangExtractRecognizer?

BasicLangExtractRecognizer already supports Ollama through provider configuration, so the dedicated Ollama recognizer seems redundant now.

Should we also adjust the e2e tests? https://github.com/microsoft/presidio/blob/main/e2e-tests/tests/test_package_e2e_integration_flows.py#L68
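For reference, a provider-based Ollama setup through BasicLangExtractRecognizer could look roughly like this sketch (the model id and the model_url kwarg name are illustrative assumptions, not values from this PR):

model:
  model_id: "gemma2:2b"        # any local Ollama model; illustrative
  provider:
    name: "ollama"
    kwargs:
      model_url: "http://localhost:11434"  # assumed kwarg for the Ollama endpoint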


  - name: BasicLangExtractRecognizer
    supported_languages:
      - en
    type: predefined
    enabled: false
    config_path: presidio_analyzer/conf/langextract_config_basic.yaml
@@ -0,0 +1,49 @@
# Configurable LangExtract Configuration
# Supports multiple LLM providers via LangExtract's ModelConfig

lm_recognizer:
  supported_entities:
    - PERSON
    - EMAIL_ADDRESS
    - PHONE_NUMBER
    - US_SSN
    - LOCATION
    - ORGANIZATION
    - DATE_TIME
    - CREDIT_CARD
    - IP_ADDRESS
    - URL

  labels_to_ignore:
    - payment_status
    - metadata
    - annotation

  enable_generic_consolidation: true
  min_score: 0.5

langextract:
  prompt_file: "presidio_analyzer/conf/langextract_prompts/default_pii_phi_prompt.j2"
  examples_file: "presidio_analyzer/conf/langextract_prompts/default_pii_phi_examples.yaml"

model:
  model_id: "gpt-4o"
  provider:
    name: "openai"
    kwargs:
      base_url: "https://api.openai.com/v1"
      # api_key: "API_KEY_GOES_HERE" or set env LANGEXTRACT_API_KEY

entity_mappings:
  person: PERSON
  name: PERSON
  email: EMAIL_ADDRESS
  phone: PHONE_NUMBER
  ssn: US_SSN
  location: LOCATION
  address: LOCATION
  organization: ORGANIZATION
  date: DATE_TIME
  credit_card: CREDIT_CARD
  ip_address: IP_ADDRESS
  url: URL
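As a usage sketch, wiring this config into an analysis call might look like the following (the recognizer ships disabled in the registry above, so it is added explicitly here; the sample text is illustrative):

from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.predefined_recognizers import BasicLangExtractRecognizer

# Uses conf/langextract_config_basic.yaml unless a config_path is passed.
recognizer = BasicLangExtractRecognizer()

analyzer = AnalyzerEngine()
analyzer.registry.add_recognizer(recognizer)

results = analyzer.analyze(
    text="Contact John Doe at john.doe@example.com",
    language="en",
)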
2 changes: 2 additions & 0 deletions presidio-analyzer/presidio_analyzer/llm_utils/__init__.py
@@ -26,6 +26,7 @@
    extract_lm_config,
    get_supported_entities,
    lx,
    lx_factory,
)
from .prompt_loader import load_file_from_conf, load_prompt_file, render_jinja_template

@@ -52,6 +53,7 @@
"extract_lm_config",
"get_supported_entities",
"lx",
"lx_factory",
"load_file_from_conf",
"load_prompt_file",
"render_jinja_template",
@@ -8,11 +8,14 @@

try:
    import langextract as lx
    import langextract.factory as lx_factory
except ImportError:
    lx = None
    lx_factory = None

__all__ = [
    "lx",
    "lx_factory",
    "check_langextract_available",
    "extract_lm_config",
    "get_supported_entities",
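The try/except above keeps langextract an optional dependency: both names resolve to None when the package is missing. A caller-side guard could look like this sketch (the install hint is an assumption, not from this PR):

from presidio_analyzer.llm_utils import lx, lx_factory

# Both are None when langextract is not installed (see the try/except above),
# so fail fast before constructing any LangExtract-backed recognizer.
if lx is None or lx_factory is None:
    raise ImportError(
        "langextract is required for LangExtract recognizers; "
        "install it first, e.g. `pip install langextract`."
    )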
@@ -93,6 +93,7 @@
)
from .third_party.langextract_recognizer import LangExtractRecognizer
from .third_party.ollama_langextract_recognizer import OllamaLangExtractRecognizer
from .third_party.basic_langextract_recognizer import BasicLangExtractRecognizer

PREDEFINED_RECOGNIZERS = [
"PhoneRecognizer",
@@ -164,5 +165,6 @@
"LangExtractRecognizer",
"AzureOpenAILangExtractRecognizer",
"OllamaLangExtractRecognizer",
"BasicLangExtractRecognizer",
"KrPassportRecognizer",
]
@@ -0,0 +1,71 @@
import logging
import os
from pathlib import Path
from typing import Optional

from presidio_analyzer.llm_utils import lx, lx_factory
from presidio_analyzer.predefined_recognizers.third_party.\
    langextract_recognizer import LangExtractRecognizer

logger = logging.getLogger("presidio-analyzer")


class BasicLangExtractRecognizer(LangExtractRecognizer):
    """Basic LangExtract recognizer using a configurable backend."""

    DEFAULT_CONFIG_PATH = (
        Path(__file__).parent.parent.parent / "conf" / "langextract_config_basic.yaml"
    )

    def __init__(
        self,
        config_path: Optional[str] = None,
        supported_language: str = "en",
        context: Optional[list] = None,
    ):
        """Initialize Basic LangExtract recognizer.

        :param config_path: Path to configuration file (optional).
        :param supported_language: Language this recognizer supports
            (optional, default: "en").
        :param context: List of context words
            (optional, currently not used by LLM recognizers).
        """
        actual_config_path = (
            config_path if config_path else str(self.DEFAULT_CONFIG_PATH)
        )

        super().__init__(
Collaborator:
Should we support extract_params in BasicLangExtractRecognizer? Those parameters are needed for different scenarios; for example, with Ollama and a small LLM we would need max_char_buffer.

OllamaLangExtractRecognizer passes parameters like max_char_buffer, timeout, num_ctx, max_workers, language_model_params, and extraction_passes to the parent class, but BasicLangExtractRecognizer doesn't support these yet.

I have thought about something like this:

# Extract optional parameters from config
extract_params = {}
if "max_char_buffer" in model_config:
    extract_params["extract"] = {"max_char_buffer": model_config["max_char_buffer"]}

lang_model_params = {}
for key in ["timeout", "num_ctx"]:
    if key in model_config:
        lang_model_params[key] = model_config[key]
if lang_model_params:
    extract_params["language_model"] = lang_model_params

super().__init__(
    config_path=actual_config_path,
    name="Basic LangExtract PII",
    supported_language=supported_language,
    extract_params=extract_params or None,
)

            config_path=actual_config_path,
            name="Basic LangExtract PII",
            supported_language=supported_language,
        )

        model_config = self.config.get("model", {})
        provider_config = model_config.get("provider", {})
        self.model_id = model_config.get("model_id")
        self.provider = provider_config.get("name")
Collaborator:
Should we add validation here with descriptive error messages?
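For instance, something along these lines (a sketch only; actual_config_path is the variable computed earlier in __init__):

if not self.model_id:
    raise ValueError(
        f"Missing 'model.model_id' in LangExtract config: {actual_config_path}"
    )
if not self.provider:
    raise ValueError(
        f"Missing 'model.provider.name' in LangExtract config: {actual_config_path}"
    )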

        self.provider_kwargs = provider_config.get("kwargs", {})
        if not self.model_id:
            raise ValueError("Configuration must contain 'model_id'")
        if not self.provider:
            raise ValueError("Configuration must contain 'provider'")

        self.fence_output = model_config.get(
            "fence_output", "openai" in self.provider.lower()
        )
        self.use_schema_constraints = model_config.get("use_schema_constraints", False)

        if (
            "api_key" not in self.provider_kwargs
            and "LANGEXTRACT_API_KEY" in os.environ
        ):
            self.provider_kwargs["api_key"] = os.environ["LANGEXTRACT_API_KEY"]

        self.lx_model_config = lx_factory.ModelConfig(
            model_id=self.model_id,
            provider=self.provider,
            provider_kwargs=self.provider_kwargs,
        )

    def _get_provider_params(self):
Collaborator:
This method could also be removed from the parent and from AzureOpenAILangExtractRecognizer, though I'd keep the abstraction, I guess.

"""Return Azure OpenAI-specific params."""
Collaborator:
Please fix the docstring; it still says Azure OpenAI.

        return {
            "config": self.lx_model_config,
            "fence_output": self.fence_output,
            "use_schema_constraints": self.use_schema_constraints,
        }
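For context, these params are presumably forwarded by the parent into langextract; a rough sketch of that call (parameter names follow langextract's public extract() API, and prompt/examples stand in for whatever the recognizer loads from its prompt and examples files):

import langextract as lx

result = lx.extract(
    text_or_documents=text,
    prompt_description=prompt,
    examples=examples,
    config=self.lx_model_config,  # langextract.factory.ModelConfig built above
    fence_output=self.fence_output,
    use_schema_constraints=self.use_schema_constraints,
)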
@@ -1,6 +1,7 @@
from __future__ import annotations

import logging
import os
import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
@@ -57,11 +58,13 @@ def __init__(
            conf_file=conf_file, registry_configuration=registry_configuration
        )

-        self.configuration = (
-            ConfigurationValidator.validate_recognizer_registry_configuration(
-                self.configuration
-            )
-        )
+        if os.environ.get("PRESIDIO_ENABLE_SCHEMA_VALIDATION", "").lower() == "true":
+            self.configuration = (
+                ConfigurationValidator.validate_recognizer_registry_configuration(
+                    self.configuration
+                )
+            )

        self.nlp_engine = nlp_engine

    def create_recognizer_registry(self) -> RecognizerRegistry:
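As a usage note, the schema validation above is now opt-in; enabling it before building the engine is a one-liner (sketch):

import os

# Opt in to recognizer-registry schema validation (off by default):
os.environ["PRESIDIO_ENABLE_SCHEMA_VALIDATION"] = "true"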