# Add a configurable LangExtract recognizer for use with any provider. #1815
New config file, `presidio_analyzer/conf/langextract_config_basic.yaml` (the path follows from `DEFAULT_CONFIG_PATH` in the recognizer below; the nesting shown here is reconstructed from how the code reads the config):

```yaml
# Configurable LangExtract Configuration
# Supports multiple LLM providers via LangExtract's ModelConfig

lm_recognizer:
  supported_entities:
    - PERSON
    - EMAIL_ADDRESS
    - PHONE_NUMBER
    - US_SSN
    - LOCATION
    - ORGANIZATION
    - DATE_TIME
    - CREDIT_CARD
    - IP_ADDRESS
    - URL

  labels_to_ignore:
    - payment_status
    - metadata
    - annotation

  enable_generic_consolidation: true
  min_score: 0.5

langextract:
  prompt_file: "presidio_analyzer/conf/langextract_prompts/default_pii_phi_prompt.j2"
  examples_file: "presidio_analyzer/conf/langextract_prompts/default_pii_phi_examples.yaml"

model:
  model_id: "gpt-4o"
  provider:
    name: "openai"
    kwargs:
      base_url: "https://api.openai.com/v1"
      # api_key: "API_KEY_GOES_HERE"  # or set env LANGEXTRACT_API_KEY

entity_mappings:
  person: PERSON
  name: PERSON
  email: EMAIL_ADDRESS
  phone: PHONE_NUMBER
  ssn: US_SSN
  location: LOCATION
  address: LOCATION
  organization: ORGANIZATION
  date: DATE_TIME
  credit_card: CREDIT_CARD
  ip_address: IP_ADDRESS
  url: URL
```
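For context, a minimal usage sketch of how this config would be consumed. The import path for `BasicLangExtractRecognizer` is an assumption here, since the diff does not show where the new module lives:

```python
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry

# Assumed import path; the PR diff does not name the new module's location.
from presidio_analyzer.predefined_recognizers.third_party.basic_langextract_recognizer import (
    BasicLangExtractRecognizer,
)

# Loads langextract_config_basic.yaml by default; pass config_path to override.
# Requires the langextract extra installed and LANGEXTRACT_API_KEY set (or
# api_key in the config's provider kwargs).
recognizer = BasicLangExtractRecognizer()

registry = RecognizerRegistry()
registry.add_recognizer(recognizer)

analyzer = AnalyzerEngine(registry=registry)
results = analyzer.analyze(
    text="My name is John and my email is john@example.com",
    language="en",
)
```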
The new recognizer module (new file; the diff does not name its path, but it plausibly sits next to the other third-party recognizers):

```python
import logging
import os
from pathlib import Path
from typing import Optional

from presidio_analyzer.llm_utils import lx, lx_factory
from presidio_analyzer.predefined_recognizers.third_party.langextract_recognizer import (
    LangExtractRecognizer,
)

logger = logging.getLogger("presidio-analyzer")


class BasicLangExtractRecognizer(LangExtractRecognizer):
    """Basic LangExtract recognizer using a configurable backend."""

    DEFAULT_CONFIG_PATH = (
        Path(__file__).parent.parent.parent / "conf" / "langextract_config_basic.yaml"
    )

    def __init__(
        self,
        config_path: Optional[str] = None,
        supported_language: str = "en",
        context: Optional[list] = None,
    ):
        """Initialize the Basic LangExtract recognizer.

        :param config_path: Path to configuration file (optional).
        :param supported_language: Language this recognizer supports
            (optional, default: "en").
        :param context: List of context words
            (optional, currently not used by LLM recognizers).
        """
        actual_config_path = (
            config_path if config_path else str(self.DEFAULT_CONFIG_PATH)
        )
```
> **Collaborator:** Should we support `extract_params` in `BasicLangExtractRecognizer`? `OllamaLangExtractRecognizer` passes parameters like `max_char_buffer`, `timeout`, `num_ctx`, `max_workers`, `language_model_params`, and `extraction_passes` to the parent class, but `BasicLangExtractRecognizer` doesn't support these yet. I have thought about something like this:
>
> ```python
> # Extract optional parameters from config
> extract_params = {}
> if "max_char_buffer" in model_config:
>     extract_params["extract"] = {"max_char_buffer": model_config["max_char_buffer"]}
>
> lang_model_params = {}
> for key in ["timeout", "num_ctx"]:
>     if key in model_config:
>         lang_model_params[key] = model_config[key]
> if lang_model_params:
>     extract_params["language_model"] = lang_model_params
>
> super().__init__(
>     config_path=actual_config_path,
>     name="Basic LangExtract PII",
>     supported_language=supported_language,
>     extract_params=extract_params or None,
> )
> ```
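If that suggestion were adopted, the `model` section of the YAML would presumably grow matching keys, along these lines (hypothetical keys, not part of this PR's config file):

```yaml
model:
  model_id: "gpt-4o"
  max_char_buffer: 1000   # chunk size forwarded to the extract call (hypothetical)
  timeout: 120            # forwarded as a language-model parameter (hypothetical)
  num_ctx: 4096           # Ollama-style context window (hypothetical)
```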
The `__init__` continues:

```python
        super().__init__(
            config_path=actual_config_path,
            name="Basic LangExtract PII",
            supported_language=supported_language,
        )

        model_config = self.config.get("model", {})
        provider_config = model_config.get("provider", {})
        self.model_id = model_config.get("model_id")
        self.provider = provider_config.get("name")
```
> **Collaborator:** Should we add validation here with descriptive error messages?
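One way to make those messages more descriptive than the `ValueError`s in the diff below (an illustrative sketch only, not part of this PR):

```python
# Hypothetical, more descriptive validation; actual_config_path is the
# variable defined earlier in __init__.
if not self.model_id:
    raise ValueError(
        f"LangExtract config at '{actual_config_path}' must define "
        "'model.model_id' (e.g. 'gpt-4o')."
    )
if not self.provider:
    raise ValueError(
        f"LangExtract config at '{actual_config_path}' must define "
        "'model.provider.name' (e.g. 'openai' or 'ollama')."
    )
```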
```python
        self.provider_kwargs = provider_config.get("kwargs", {})
        if not self.model_id:
            raise ValueError("Configuration must contain 'model_id'")
        if not self.provider:
            raise ValueError("Configuration must contain 'provider'")

        # Fence JSON output by default for OpenAI-style providers.
        self.fence_output = model_config.get(
            "fence_output", "openai" in self.provider.lower()
        )
        self.use_schema_constraints = model_config.get("use_schema_constraints", False)

        # Fall back to the LANGEXTRACT_API_KEY environment variable.
        if "api_key" not in self.provider_kwargs and "LANGEXTRACT_API_KEY" in os.environ:
            self.provider_kwargs["api_key"] = os.environ["LANGEXTRACT_API_KEY"]

        self.lx_model_config = lx_factory.ModelConfig(
            model_id=self.model_id,
            provider=self.provider,
            provider_kwargs=self.provider_kwargs,
        )

    def _get_provider_params(self):
```
> **Collaborator:** This method can also be removed from the parent and from the `AzureOpenAILangExtractRecognizer` recognizer... but keep the abstraction, I guess.
| """Return Azure OpenAI-specific params.""" | ||
|
> **Collaborator:** Please fix the docstring (it still says "Azure OpenAI-specific", but this recognizer is provider-agnostic).
```python
        return {
            "config": self.lx_model_config,
            "fence_output": self.fence_output,
            "use_schema_constraints": self.use_schema_constraints,
        }
```
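For orientation, the parent `LangExtractRecognizer` presumably forwards these provider params into LangExtract's extract call, roughly as sketched below. This is not the actual parent implementation (it is not shown in this diff); `text`, `prompt`, and `examples` are placeholder names, while `lx.extract` and `lx_factory.ModelConfig` are the LangExtract APIs the diff already imports:

```python
# Hedged sketch of how _get_provider_params() likely feeds LangExtract.
result = lx.extract(
    text_or_documents=text,            # text under analysis (placeholder)
    prompt_description=prompt,         # rendered from langextract.prompt_file
    examples=examples,                 # loaded from langextract.examples_file
    config=self.lx_model_config,       # provider + model_id + provider kwargs
    fence_output=self.fence_output,    # True for providers that fence JSON output
    use_schema_constraints=self.use_schema_constraints,
)
```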
> **Collaborator:** Can we remove `OllamaLangExtractRecognizer`? `BasicLangExtractRecognizer` already supports Ollama through provider configuration, so the dedicated Ollama recognizer seems redundant now. Should we also adjust the e2e tests? https://github.com/microsoft/presidio/blob/main/e2e-tests/tests/test_package_e2e_integration_flows.py#L68
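To illustrate that point, running against a local Ollama server through the basic recognizer would presumably look like the config below. The model name and endpoint are illustrative assumptions, and LangExtract's Ollama provider may name the endpoint parameter differently (e.g. `model_url` rather than `base_url`):

```yaml
model:
  model_id: "gemma2:2b"                    # illustrative local model
  provider:
    name: "ollama"
    kwargs:
      base_url: "http://localhost:11434"   # default Ollama endpoint (assumed kwarg name)
  fence_output: false
  use_schema_constraints: false
```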