Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions presidio-analyzer/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,12 @@ RUN pip install poetry==2.3.2 \
&& poetry install --no-root --only=main -E server \
&& rm -rf $(poetry config cache-dir)

# install nlp models specified in NLP_CONF_FILE
# install nlp models specified in NLP_CONF_FILE or via nlp_configuration in ANALYZER_CONF_FILE
COPY ./install_nlp_models.py /app/

RUN poetry run python install_nlp_models.py --conf_file ${NLP_CONF_FILE}
RUN poetry run python install_nlp_models.py \
--conf_file ${NLP_CONF_FILE} \
--analyzer_conf_file ${ANALYZER_CONF_FILE}

COPY . /app/

Expand Down
11 changes: 9 additions & 2 deletions presidio-analyzer/Dockerfile.stanza
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,17 @@ RUN apt-get update \
COPY ./pyproject.toml /app/

RUN pip install poetry==2.3.2 && poetry install --no-root --only=main -E server -E stanza
# install nlp models specified in NLP_CONF_FILE
# install nlp models specified in NLP_CONF_FILE or via nlp_configuration in ANALYZER_CONF_FILE
COPY ./install_nlp_models.py /app/

RUN poetry run python install_nlp_models.py --conf_file ${NLP_CONF_FILE}
# Set Stanza resource directory to a location that will be owned by the presidio user.
# Without this, stanza.download() writes to /root/.cache/stanza/ which is inaccessible
# at runtime when the container runs as user 1001 (presidio).
ENV STANZA_RESOURCES_DIR=/app/stanza_resources

RUN poetry run python install_nlp_models.py \
--conf_file ${NLP_CONF_FILE} \
--analyzer_conf_file ${ANALYZER_CONF_FILE}

COPY . /app/

Expand Down
6 changes: 4 additions & 2 deletions presidio-analyzer/Dockerfile.transformers
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,12 @@ RUN apt-get update && apt-get install -y curl && rm -rf /var/lib/apt/lists/*
COPY ./pyproject.toml /app/
RUN pip install poetry==2.3.2 && poetry install --no-root --only=main -E server -E transformers

# install nlp models specified in NLP_CONF_FILE
# install nlp models specified in NLP_CONF_FILE or via nlp_configuration in ANALYZER_CONF_FILE
COPY ./install_nlp_models.py /app/

RUN poetry run python install_nlp_models.py --conf_file ${NLP_CONF_FILE}
RUN poetry run python install_nlp_models.py \
--conf_file ${NLP_CONF_FILE} \
--analyzer_conf_file ${ANALYZER_CONF_FILE}

COPY . /app/

Expand Down
8 changes: 5 additions & 3 deletions presidio-analyzer/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,11 @@ def __init__(self):
self.logger.setLevel(os.environ.get("LOG_LEVEL", self.logger.level))
self.app = Flask(__name__)

analyzer_conf_file = os.environ.get("ANALYZER_CONF_FILE")
nlp_engine_conf_file = os.environ.get("NLP_CONF_FILE")
recognizer_registry_conf_file = os.environ.get("RECOGNIZER_REGISTRY_CONF_FILE")
analyzer_conf_file = os.environ.get("ANALYZER_CONF_FILE") or None
nlp_engine_conf_file = os.environ.get("NLP_CONF_FILE") or None
recognizer_registry_conf_file = (
os.environ.get("RECOGNIZER_REGISTRY_CONF_FILE") or None
)

self.logger.info("Starting analyzer engine")
self.engine: AnalyzerEngine = AnalyzerEngineProvider(
Expand Down
74 changes: 66 additions & 8 deletions presidio-analyzer/install_nlp_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import argparse
import logging
from typing import Dict, Union
from typing import Dict, Optional, Union

import yaml
from spacy.cli import download as spacy_download
Expand All @@ -26,15 +26,58 @@
logger.addHandler(logging.StreamHandler())


def install_models(
    conf_file: str, analyzer_conf_file: Optional[str] = None
) -> None:
    """Download the NLP models described by the given configuration files.

    If *analyzer_conf_file* is supplied and its yaml content contains an
    ``nlp_configuration`` section (unified analyzer conf format), the models
    defined there are installed and *conf_file* is ignored. This lets a single
    unified ``ANALYZER_CONF_FILE`` drive both build-time model downloads and
    runtime configuration without a separate ``NLP_CONF_FILE``.

    If *analyzer_conf_file* is absent, unreadable as a section, or lacks
    ``nlp_configuration``, *conf_file* is read as a plain NLP configuration
    (top-level ``nlp_engine_name`` and ``models`` keys) — the original,
    backward-compatible behaviour.

    :param conf_file: Path to a plain NLP configuration yaml file.
    :param analyzer_conf_file: Optional path to a unified analyzer conf file
        that may contain an ``nlp_configuration`` section.
    :raises OSError: If a referenced configuration file cannot be read.
    """
    # The unified ANALYZER_CONF_FILE, when present, takes precedence.
    if analyzer_conf_file:
        try:
            with open(analyzer_conf_file) as conf_stream:
                analyzer_config = yaml.safe_load(conf_stream)
        except OSError as err:
            raise OSError(
                f"Could not read analyzer conf file '{analyzer_conf_file}'"
            ) from err
        # Membership test (not truthiness of the value) mirrors the unified
        # conf contract: the section's presence selects this path.
        if analyzer_config and "nlp_configuration" in analyzer_config:
            logger.info(
                "Using nlp_configuration from analyzer conf file: %s",
                analyzer_conf_file,
            )
            _install_models_from_nlp_config(analyzer_config["nlp_configuration"])
            return

    # Backward-compatible fallback: plain NLP conf file.
    try:
        with open(conf_file) as conf_stream:
            nlp_configuration = yaml.safe_load(conf_stream)
    except OSError as err:
        raise OSError(f"Could not read NLP conf file '{conf_file}'") from err
    _install_models_from_nlp_config(nlp_configuration)


def _install_models_from_nlp_config(nlp_configuration: dict) -> None:
"""Download all models described in an nlp_configuration dict.

:param nlp_configuration: Dict with at least ``nlp_engine_name`` and
``models`` keys (i.e. the content of a plain NLP conf file, or the
value of the ``nlp_configuration`` key in a unified analyzer conf).
"""
logger.info(f"Installing models from configuration: {nlp_configuration}")

if "nlp_engine_name" not in nlp_configuration:
Expand Down Expand Up @@ -105,6 +148,21 @@ def _install_transformers_spacy_models(model_name: Dict[str, str]) -> None:
default="presidio_analyzer/conf/default.yaml",
help="Location of nlp configuration yaml file. Default: conf/default.yaml",
)
parser.add_argument(
"--analyzer_conf_file",
required=False,
default=None,
help=(
"Optional path to a unified analyzer conf file (ANALYZER_CONF_FILE). "
"When this file contains an nlp_configuration section, models from "
"that section are downloaded and --conf_file is ignored. "
"Use this when ANALYZER_CONF_FILE is the single source of truth for "
"both NLP and recognizer-registry configuration."
),
)
args = parser.parse_args()

install_models(conf_file=args.conf_file)
install_models(
conf_file=args.conf_file,
analyzer_conf_file=args.analyzer_conf_file,
)
42 changes: 29 additions & 13 deletions presidio-analyzer/presidio_analyzer/analyzer_engine_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,15 +107,15 @@ def _load_recognizer_registry(
supported_languages: List[str],
nlp_engine: NlpEngine,
) -> RecognizerRegistry:
if self.recognizer_registry_conf_file:
logger.info(
f"Reading recognizer registry "
f"configuration from {self.recognizer_registry_conf_file}"
)
provider = RecognizerRegistryProvider(
conf_file=self.recognizer_registry_conf_file, nlp_engine=nlp_engine
)
elif "recognizer_registry" in self.configuration:
"""Load recognizer registry.

Inline ``recognizer_registry`` section in the analyzer conf takes
priority over a separately provided per-section file so that a unified
ANALYZER_CONF_FILE is self-contained and is not silently overridden by
a per-section file that was baked into the image as a Dockerfile default.
A per-section file is only used when no inline section is present.
"""
if "recognizer_registry" in self.configuration:
registry_configuration = self.configuration["recognizer_registry"]
provider = RecognizerRegistryProvider(
registry_configuration={
Expand All @@ -124,6 +124,14 @@ def _load_recognizer_registry(
},
nlp_engine=nlp_engine,
)
elif self.recognizer_registry_conf_file:
logger.info(
f"Reading recognizer registry "
f"configuration from {self.recognizer_registry_conf_file}"
)
provider = RecognizerRegistryProvider(
conf_file=self.recognizer_registry_conf_file, nlp_engine=nlp_engine
)
else:
logger.warning(
"configuration file is missing for 'recognizer_registry'. "
Expand All @@ -142,12 +150,20 @@ def _load_recognizer_registry(
return registry

def _load_nlp_engine(self) -> NlpEngine:
if self.nlp_engine_conf_file:
logger.info(f"Reading nlp configuration from {self.nlp_engine_conf_file}")
provider = NlpEngineProvider(conf_file=self.nlp_engine_conf_file)
elif "nlp_configuration" in self.configuration:
"""Load NLP engine.

Inline ``nlp_configuration`` section in the analyzer conf takes
priority over a separately provided per-section file so that a unified
ANALYZER_CONF_FILE is self-contained and is not silently overridden by
a per-section file that was baked into the image as a Dockerfile default.
A per-section file is only used when no inline section is present.
"""
if "nlp_configuration" in self.configuration:
nlp_configuration = self.configuration["nlp_configuration"]
provider = NlpEngineProvider(nlp_configuration=nlp_configuration)
elif self.nlp_engine_conf_file:
logger.info(f"Reading nlp configuration from {self.nlp_engine_conf_file}")
provider = NlpEngineProvider(conf_file=self.nlp_engine_conf_file)
else:
logger.warning(
"configuration file is missing for 'nlp_configuration'."
Expand Down
35 changes: 35 additions & 0 deletions presidio-analyzer/tests/test_analyzer_engine_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -514,6 +514,41 @@ def test_analyzer_engine_provider_create_engine_with_all_params():
assert len(engine.supported_languages) > 0


def test_analyzer_engine_provider_inline_sections_take_priority_over_per_section_files():
    """Verify inline conf sections beat separately supplied per-section files.

    The nlp_configuration / recognizer_registry sections embedded in the
    analyzer conf file must take priority over the per-section files passed
    alongside it. This is what lets a single unified ANALYZER_CONF_FILE drive
    both NLP and registry configuration without being silently overridden by
    Dockerfile-baked-in defaults for NLP_CONF_FILE and
    RECOGNIZER_REGISTRY_CONF_FILE.
    """
    # test_analyzer_engine.yaml carries both an nlp_configuration and a
    # recognizer_registry section; the other two files are the competing
    # per-section configs that should lose.
    analyzer_yaml, nlp_yaml, registry_yaml = get_full_paths(
        "conf/test_analyzer_engine.yaml",
        "conf/default.yaml",
        "conf/test_recognizer_registry.yaml",
    )

    engine = AnalyzerEngineProvider(
        analyzer_engine_conf_file=analyzer_yaml,
        nlp_engine_conf_file=nlp_yaml,
        recognizer_registry_conf_file=registry_yaml,
    ).create_engine()

    # Supported languages come from the analyzer yaml (de, en, es) rather
    # than the registry-only file, proving the inline sections prevailed.
    for expected_language in ("de", "en", "es"):
        assert expected_language in engine.supported_languages

    # The inline recognizer_registry yields more than the 6 recognizers
    # defined in test_recognizer_registry.yaml, confirming it won.
    assert len(engine.registry.recognizers) > 6


def test_analyzer_engine_provider_multiple_languages_support():
"""Test analyzer engine with multiple language support."""
analyzer_yaml, _, _ = get_full_paths("conf/test_analyzer_engine.yaml")
Expand Down