Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions presidio-analyzer/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,12 @@ RUN pip install poetry==2.3.2 \
&& poetry install --no-root --only=main -E server \
&& rm -rf $(poetry config cache-dir)

# install nlp models specified in NLP_CONF_FILE
# install nlp models specified in NLP_CONF_FILE or via nlp_configuration in ANALYZER_CONF_FILE
COPY ./install_nlp_models.py /app/

RUN poetry run python install_nlp_models.py --conf_file ${NLP_CONF_FILE}
RUN poetry run python install_nlp_models.py \
--conf_file ${NLP_CONF_FILE} \
--analyzer_conf_file ${ANALYZER_CONF_FILE}

COPY . /app/

Expand Down
11 changes: 9 additions & 2 deletions presidio-analyzer/Dockerfile.stanza
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,17 @@ RUN apt-get update \
COPY ./pyproject.toml /app/

RUN pip install poetry==2.3.2 && poetry install --no-root --only=main -E server -E stanza
# install nlp models specified in NLP_CONF_FILE
# install nlp models specified in NLP_CONF_FILE or via nlp_configuration in ANALYZER_CONF_FILE
COPY ./install_nlp_models.py /app/

RUN poetry run python install_nlp_models.py --conf_file ${NLP_CONF_FILE}
# Set Stanza resource directory to a location that will be owned by the presidio user.
# Without this, stanza.download() writes to /root/.cache/stanza/ which is inaccessible
# at runtime when the container runs as user 1001 (presidio).
ENV STANZA_RESOURCES_DIR=/app/stanza_resources

RUN poetry run python install_nlp_models.py \
--conf_file ${NLP_CONF_FILE} \
--analyzer_conf_file ${ANALYZER_CONF_FILE}

COPY . /app/

Expand Down
6 changes: 4 additions & 2 deletions presidio-analyzer/Dockerfile.transformers
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,12 @@ RUN apt-get update && apt-get install -y curl && rm -rf /var/lib/apt/lists/*
COPY ./pyproject.toml /app/
RUN pip install poetry==2.3.2 && poetry install --no-root --only=main -E server -E transformers

# install nlp models specified in NLP_CONF_FILE
# install nlp models specified in NLP_CONF_FILE or via nlp_configuration in ANALYZER_CONF_FILE
COPY ./install_nlp_models.py /app/

RUN poetry run python install_nlp_models.py --conf_file ${NLP_CONF_FILE}
RUN poetry run python install_nlp_models.py \
--conf_file ${NLP_CONF_FILE} \
--analyzer_conf_file ${ANALYZER_CONF_FILE}

COPY . /app/

Expand Down
8 changes: 5 additions & 3 deletions presidio-analyzer/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,11 @@ def __init__(self):
self.logger.setLevel(os.environ.get("LOG_LEVEL", self.logger.level))
self.app = Flask(__name__)

analyzer_conf_file = os.environ.get("ANALYZER_CONF_FILE")
nlp_engine_conf_file = os.environ.get("NLP_CONF_FILE")
recognizer_registry_conf_file = os.environ.get("RECOGNIZER_REGISTRY_CONF_FILE")
analyzer_conf_file = os.environ.get("ANALYZER_CONF_FILE") or None
nlp_engine_conf_file = os.environ.get("NLP_CONF_FILE") or None
recognizer_registry_conf_file = (
os.environ.get("RECOGNIZER_REGISTRY_CONF_FILE") or None
)

self.logger.info("Starting analyzer engine")
self.engine: AnalyzerEngine = AnalyzerEngineProvider(
Expand Down
74 changes: 66 additions & 8 deletions presidio-analyzer/install_nlp_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import argparse
import logging
from typing import Dict, Union
from typing import Dict, Optional, Union

import yaml
from spacy.cli import download as spacy_download
Expand All @@ -26,15 +26,58 @@
logger.addHandler(logging.StreamHandler())


def install_models(
    conf_file: str, analyzer_conf_file: Optional[str] = None
) -> None:
    """Download the NLP models described by the given configuration files.

    If *analyzer_conf_file* is supplied and its yaml content contains an
    ``nlp_configuration`` section (unified analyzer conf format), the models
    defined there are installed and *conf_file* is ignored. This lets a single
    unified ``ANALYZER_CONF_FILE`` drive both build-time model downloads and
    runtime configuration without a separate ``NLP_CONF_FILE``.

    If *analyzer_conf_file* is absent, unreadable as a section, or lacks
    ``nlp_configuration``, *conf_file* is read as a plain NLP configuration
    (top-level ``nlp_engine_name`` and ``models`` keys) — the original,
    backward-compatible behaviour.

    :param conf_file: Path to a plain NLP configuration yaml file.
    :param analyzer_conf_file: Optional path to a unified analyzer conf file
        that may contain an ``nlp_configuration`` section.
    :raises OSError: If a referenced configuration file cannot be read.
    """
    # The unified ANALYZER_CONF_FILE, when present, takes precedence.
    if analyzer_conf_file:
        try:
            with open(analyzer_conf_file) as conf_stream:
                analyzer_config = yaml.safe_load(conf_stream)
        except OSError as err:
            raise OSError(
                f"Could not read analyzer conf file '{analyzer_conf_file}'"
            ) from err
        # Membership test (not truthiness of the value) mirrors the unified
        # conf contract: the section's presence selects this path.
        if analyzer_config and "nlp_configuration" in analyzer_config:
            logger.info(
                "Using nlp_configuration from analyzer conf file: %s",
                analyzer_conf_file,
            )
            _install_models_from_nlp_config(analyzer_config["nlp_configuration"])
            return

    # Backward-compatible fallback: plain NLP conf file.
    try:
        with open(conf_file) as conf_stream:
            nlp_configuration = yaml.safe_load(conf_stream)
    except OSError as err:
        raise OSError(f"Could not read NLP conf file '{conf_file}'") from err
    _install_models_from_nlp_config(nlp_configuration)


def _install_models_from_nlp_config(nlp_configuration: dict) -> None:
"""Download all models described in an nlp_configuration dict.

:param nlp_configuration: Dict with at least ``nlp_engine_name`` and
``models`` keys (i.e. the content of a plain NLP conf file, or the
value of the ``nlp_configuration`` key in a unified analyzer conf).
"""
logger.info(f"Installing models from configuration: {nlp_configuration}")

if "nlp_engine_name" not in nlp_configuration:
Expand Down Expand Up @@ -105,6 +148,21 @@ def _install_transformers_spacy_models(model_name: Dict[str, str]) -> None:
default="presidio_analyzer/conf/default.yaml",
help="Location of nlp configuration yaml file. Default: conf/default.yaml",
)
parser.add_argument(
"--analyzer_conf_file",
required=False,
default=None,
help=(
"Optional path to a unified analyzer conf file (ANALYZER_CONF_FILE). "
"When this file contains an nlp_configuration section, models from "
"that section are downloaded and --conf_file is ignored. "
"Use this when ANALYZER_CONF_FILE is the single source of truth for "
"both NLP and recognizer-registry configuration."
),
)
args = parser.parse_args()

install_models(conf_file=args.conf_file)
install_models(
conf_file=args.conf_file,
analyzer_conf_file=args.analyzer_conf_file,
)
42 changes: 29 additions & 13 deletions presidio-analyzer/presidio_analyzer/analyzer_engine_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,15 +107,15 @@ def _load_recognizer_registry(
supported_languages: List[str],
nlp_engine: NlpEngine,
) -> RecognizerRegistry:
if self.recognizer_registry_conf_file:
logger.info(
f"Reading recognizer registry "
f"configuration from {self.recognizer_registry_conf_file}"
)
provider = RecognizerRegistryProvider(
conf_file=self.recognizer_registry_conf_file, nlp_engine=nlp_engine
)
elif "recognizer_registry" in self.configuration:
"""Load recognizer registry.

Inline ``recognizer_registry`` section in the analyzer conf takes
priority over a separately provided per-section file so that a unified
ANALYZER_CONF_FILE is self-contained and is not silently overridden by
a per-section file that was baked into the image as a Dockerfile default.
A per-section file is only used when no inline section is present.
"""
if "recognizer_registry" in self.configuration:
registry_configuration = self.configuration["recognizer_registry"]
provider = RecognizerRegistryProvider(
registry_configuration={
Expand All @@ -124,6 +124,14 @@ def _load_recognizer_registry(
},
nlp_engine=nlp_engine,
)
elif self.recognizer_registry_conf_file:
logger.info(
f"Reading recognizer registry "
f"configuration from {self.recognizer_registry_conf_file}"
)
provider = RecognizerRegistryProvider(
conf_file=self.recognizer_registry_conf_file, nlp_engine=nlp_engine
)
else:
logger.warning(
"configuration file is missing for 'recognizer_registry'. "
Expand All @@ -142,12 +150,20 @@ def _load_recognizer_registry(
return registry

def _load_nlp_engine(self) -> NlpEngine:
if self.nlp_engine_conf_file:
logger.info(f"Reading nlp configuration from {self.nlp_engine_conf_file}")
provider = NlpEngineProvider(conf_file=self.nlp_engine_conf_file)
elif "nlp_configuration" in self.configuration:
"""Load NLP engine.

Inline ``nlp_configuration`` section in the analyzer conf takes
priority over a separately provided per-section file so that a unified
ANALYZER_CONF_FILE is self-contained and is not silently overridden by
a per-section file that was baked into the image as a Dockerfile default.
A per-section file is only used when no inline section is present.
"""
if "nlp_configuration" in self.configuration:
nlp_configuration = self.configuration["nlp_configuration"]
provider = NlpEngineProvider(nlp_configuration=nlp_configuration)
elif self.nlp_engine_conf_file:
logger.info(f"Reading nlp configuration from {self.nlp_engine_conf_file}")
provider = NlpEngineProvider(conf_file=self.nlp_engine_conf_file)
else:
logger.warning(
"configuration file is missing for 'nlp_configuration'."
Expand Down
35 changes: 35 additions & 0 deletions presidio-analyzer/tests/test_analyzer_engine_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -514,6 +514,41 @@ def test_analyzer_engine_provider_create_engine_with_all_params():
assert len(engine.supported_languages) > 0


def test_analyzer_engine_provider_inline_sections_take_priority_over_per_section_files():
    """Verify inline conf sections beat separately supplied per-section files.

    The nlp_configuration / recognizer_registry sections embedded in the
    analyzer conf file must take priority over the per-section files passed
    alongside it. This is what lets a single unified ANALYZER_CONF_FILE drive
    both NLP and registry configuration without being silently overridden by
    Dockerfile-baked-in defaults for NLP_CONF_FILE and
    RECOGNIZER_REGISTRY_CONF_FILE.
    """
    # test_analyzer_engine.yaml carries both an nlp_configuration and a
    # recognizer_registry section; the other two files are the competing
    # per-section configs that should lose.
    analyzer_yaml, nlp_yaml, registry_yaml = get_full_paths(
        "conf/test_analyzer_engine.yaml",
        "conf/default.yaml",
        "conf/test_recognizer_registry.yaml",
    )

    engine = AnalyzerEngineProvider(
        analyzer_engine_conf_file=analyzer_yaml,
        nlp_engine_conf_file=nlp_yaml,
        recognizer_registry_conf_file=registry_yaml,
    ).create_engine()

    # Supported languages come from the analyzer yaml (de, en, es) rather
    # than the registry-only file, proving the inline sections prevailed.
    for expected_language in ("de", "en", "es"):
        assert expected_language in engine.supported_languages

    # The inline recognizer_registry yields more than the 6 recognizers
    # defined in test_recognizer_registry.yaml, confirming it won.
    assert len(engine.registry.recognizers) > 6


def test_analyzer_engine_provider_multiple_languages_support():
"""Test analyzer engine with multiple language support."""
analyzer_yaml, _, _ = get_full_paths("conf/test_analyzer_engine.yaml")
Expand Down