Skip to content

Refactor nlp_engine_provider.py to improve configuration handling and… #1559

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 18 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
b431cff
Refactor nlp_engine_provider.py to improve configuration handling and…
ShakutaiGit Mar 25, 2025
ee9feb9
Update NLP configuration to use default Spacy model settings
ShakutaiGit Mar 25, 2025
d7d465f
Add built-in default NLP configuration and improve error handling for…
ShakutaiGit Mar 25, 2025
715a98d
Enhance NLP configuration validation by adding a dedicated method and…
ShakutaiGit Mar 30, 2025
ad4ea4d
Merge branch 'main' into fix-1556-config-file-missing
ShakutaiGit Mar 30, 2025
368d32c
Add NER model configuration to test_stanza.yaml
ShakutaiGit Mar 30, 2025
2978fd5
Fix formatting in nlp_engine_provider.py and remove trailing whitespace
ShakutaiGit Mar 30, 2025
ce7b846
Merge branch 'main' into fix-1556-config-file-missing
omri374 Apr 3, 2025
8447862
Merge branch 'main' into fix-1556-config-file-missing
omri374 Apr 3, 2025
00d0d5a
Merge branch 'main' into fix-1556-config-file-missing
omri374 Apr 21, 2025
1ff6918
CR fixes
ShakutaiGit Apr 21, 2025
4f8479d
Merge branch 'fix-1556-config-file-missing' of https://github.com/mic…
ShakutaiGit Apr 21, 2025
ca9a0f4
CR fixes
ShakutaiGit Apr 27, 2025
6c82cfb
linting fix
ShakutaiGit Apr 27, 2025
6e05990
Merge branch 'main' of https://github.com/microsoft/presidio into fix…
ShakutaiGit Apr 27, 2025
46cea85
linting
ShakutaiGit Apr 27, 2025
073c2ba
Add missing newline at end of file in nlp_engine_provider.py
ShakutaiGit Apr 27, 2025
cd45852
Fix missing newline at end of file in nlp_engine_provider.py
ShakutaiGit Apr 27, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,13 @@
TransformersNlpEngine,
)

logger = logging.getLogger("presidio-analyzer")
# Built-in NLP configuration used as a fallback when the default
# configuration file cannot be found: a spaCy engine with a single English
# model and an empty NER model configuration.
# NOTE(review): the fallback path assigns this dict object itself (no copy),
# so it should be treated as read-only.
DEFAULT_BUILTIN_CONFIG = {
    "nlp_engine_name": "spacy",
    "models": [
        {
            "lang_code": "en",
            "model_name": "en_core_web_lg",
        },
    ],
    "ner_model_configuration": {},
}

logger = logging.getLogger("presidio-analyzer")

class NlpEngineProvider:
"""Create different NLP engines from configuration.
Expand Down Expand Up @@ -52,17 +57,21 @@ def __init__(
raise ValueError(
"Either conf_file or nlp_configuration should be provided, not both."
)

if nlp_configuration:
elif nlp_configuration:
self.nlp_configuration = nlp_configuration

if conf_file:
elif conf_file:
self.nlp_configuration = self._read_nlp_conf(conf_file)

if conf_file is None and nlp_configuration is None:
else:
conf_file = self._get_full_conf_path()
logger.debug(f"Reading default conf file from {conf_file}")
self.nlp_configuration = self._read_nlp_conf(conf_file)
try:
self.nlp_configuration = self._read_nlp_conf(conf_file)
except FileNotFoundError:
logger.warning(
f"Default config file '{conf_file}' not found. "
f"Falling back to built-in default: {DEFAULT_BUILTIN_CONFIG}"
)
self.nlp_configuration = DEFAULT_BUILTIN_CONFIG

def create_engine(self) -> NlpEngine:
"""Create an NLP engine instance."""
Expand Down Expand Up @@ -108,26 +117,15 @@ def create_engine(self) -> NlpEngine:

@staticmethod
def _read_nlp_conf(conf_file: Union[Path, str]) -> dict:
"""Read the nlp configuration from a provided yaml file."""
"""Read and validate the NLP configuration from a provided YAML file."""

if not Path(conf_file).exists():
nlp_configuration = {
"nlp_engine_name": "spacy",
"models": [{"lang_code": "en", "model_name": "en_core_web_lg"}],
}
logger.warning(
f"configuration file {conf_file} not found. "
f"Using default config: {nlp_configuration}."
)
raise FileNotFoundError(f"Configuration file {conf_file} not found.")

else:
with open(conf_file) as file:
nlp_configuration = yaml.safe_load(file)
with open(conf_file) as file:
nlp_configuration = yaml.safe_load(file)

if "ner_model_configuration" not in nlp_configuration:
logger.warning(
"configuration file is missing 'ner_model_configuration'. Using default"
)
NlpEngineProvider._validate_yaml_config_format(nlp_configuration)

return nlp_configuration

Expand All @@ -137,3 +135,44 @@ def _get_full_conf_path(
) -> Path:
"""Return a Path to the default conf file."""
return Path(Path(__file__).parent.parent, "conf", default_conf_file)

@staticmethod
def _validate_yaml_config_format(nlp_configuration: Dict) -> None:
    """Validate the structure of an NLP configuration loaded from YAML.

    The configuration must be a mapping containing ``nlp_engine_name`` and
    ``models``. A missing or empty ``ner_model_configuration`` is tolerated
    (with a warning) only when the languages listed under
    ``supported_languages`` and ``recognizer_registry.supported_languages``
    are absent or English-only; otherwise it is reported as an error.

    :param nlp_configuration: The parsed configuration to validate.
    :raises ValueError: If the configuration is not a mapping, if a required
        key is missing, or if ``ner_model_configuration`` is missing while
        non-English languages are requested.
    """
    logger = logging.getLogger("presidio-analyzer")

    # A non-mapping value (e.g. None, which is what a YAML loader returns for
    # an empty document) is a configuration problem, not a TypeError.
    if not isinstance(nlp_configuration, dict):
        raise ValueError(
            "Configuration file is empty or is not a mapping; "
            "it is missing 'nlp_engine_name' and 'models'."
        )

    for key in ("nlp_engine_name", "models"):
        if key not in nlp_configuration:
            raise ValueError(f"Configuration file is missing '{key}'.")

    # Nothing more to check once a non-empty NER configuration is present.
    if nlp_configuration.get("ner_model_configuration"):
        return

    def _lang_codes(value) -> set:
        """Return the lower-cased language codes held by ``value``."""
        if not value:
            return set()
        # A bare scalar (``supported_languages: en``) is a single language,
        # not an iterable of characters.
        if isinstance(value, str):
            value = [value]
        return {str(lang).lower() for lang in value}

    # ``recognizer_registry:`` with no value parses to None; treat that (and
    # any other non-mapping value) as "no registry languages declared".
    registry = nlp_configuration.get("recognizer_registry")
    if not isinstance(registry, dict):
        registry = {}

    cfg_langs = _lang_codes(nlp_configuration.get("supported_languages"))
    recog_langs = _lang_codes(registry.get("supported_languages"))

    requested_langs = cfg_langs | recog_langs
    english_only = not requested_langs or requested_langs == {"en"}

    if english_only:
        logger.warning(
            "ner_model_configuration is missing, "
            "Default English configuration will be used."
        )
        return

    raise ValueError(
        "Configuration file is missing 'ner_model_configuration', "
        "which is required when requested languages are not only English. "
        f"Detected languages: {sorted(requested_langs)}"
    )
226 changes: 224 additions & 2 deletions presidio-analyzer/tests/test_nlp_engine_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import pytest
import spacy
import shutil

from presidio_analyzer.nlp_engine import (
SpacyNlpEngine,
Expand All @@ -11,6 +12,10 @@
)
from presidio_analyzer.nlp_engine.transformers_nlp_engine import TransformersNlpEngine

def _write_yaml(tmp_path, content: str, name: str = "config.yaml") -> Path:
    """Create ``name`` inside ``tmp_path`` holding ``content``; return its path."""
    target = tmp_path.joinpath(name)
    target.write_text(content)
    return target

@pytest.fixture(scope="module")
def mock_he_model():
Expand All @@ -21,6 +26,8 @@ def mock_he_model():
"""
he = spacy.blank("he")
he.to_disk("he_test")
yield
shutil.rmtree("he_test", ignore_errors=True)


@pytest.fixture(scope="module")
Expand All @@ -32,6 +39,8 @@ def mock_bn_model():
"""
bn = spacy.blank("bn")
bn.to_disk("bn_test")
yield
shutil.rmtree("bn_test", ignore_errors=True)


@pytest.fixture(scope="session")
Expand Down Expand Up @@ -172,8 +181,7 @@ def test_when_both_conf_and_config_then_fail(mocker):


def test_when_labels_to_ignore_not_define_in_conf_file_default_into_empty_set(mocker):
conf_file = "conf/spacy_multilingual.yaml"

conf_file = (Path(__file__).parent.parent/ "presidio_analyzer"/ "conf"/ "spacy_multilingual.yaml")
engine = NlpEngineProvider(conf_file=conf_file).create_engine()
assert len(engine.ner_model_configuration.labels_to_ignore) == 0

Expand Down Expand Up @@ -246,3 +254,217 @@ def test_nlp_engine_provider_init_through_nlp_engine_configuration():
engine = NlpEngineProvider().create_engine()
assert isinstance(engine, SpacyNlpEngine)
assert engine.engine_name == "spacy"

def test_create_engine_missing_ner_model_configuration_english_only():
    """An English-only dict config without NER settings yields a spaCy engine."""
    english_model = {"lang_code": "en", "model_name": "en_core_web_lg"}
    nlp_configuration = {"nlp_engine_name": "spacy", "models": [english_model]}

    engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()

    assert isinstance(engine, SpacyNlpEngine)
    assert "en" in engine.nlp
    assert isinstance(engine.nlp["en"], spacy.lang.en.English)


def test_create_engine_missing_ner_model_configuration_non_english(mocker):
    """A German-only dict config without NER settings yields a spaCy engine."""
    # Replace the model download hook with a stub that returns None.
    mocker.patch(
        "presidio_analyzer.nlp_engine.SpacyNlpEngine._download_spacy_model_if_needed",
        return_value=None,
    )
    german_model = {"lang_code": "de", "model_name": "de_core_news_md"}
    nlp_configuration = {"nlp_engine_name": "spacy", "models": [german_model]}

    engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()

    assert isinstance(engine, SpacyNlpEngine)
    assert "de" in engine.nlp
    assert isinstance(engine.nlp["de"], spacy.lang.de.German)


def test_create_engine_missing_ner_model_configuration_mixed_languages(mocker):
    """An English + German dict config yields an engine with both languages."""
    # Replace the model download hook with a stub that returns None.
    mocker.patch(
        "presidio_analyzer.nlp_engine.SpacyNlpEngine._download_spacy_model_if_needed",
        return_value=None,
    )
    model_entries = [
        {"lang_code": "en", "model_name": "en_core_web_lg"},
        {"lang_code": "de", "model_name": "de_core_news_md"},
    ]
    nlp_configuration = {"nlp_engine_name": "spacy", "models": model_entries}

    engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()

    assert isinstance(engine, SpacyNlpEngine)
    assert set(engine.nlp.keys()) == {"en", "de"}


def test_create_engine_missing_ner_model_configuration_empty_models():
    """Creating an engine from a config with an empty model list fails."""
    # Neither models nor ner_model_configuration are supplied.
    nlp_configuration = {"nlp_engine_name": "spacy", "models": []}
    provider = NlpEngineProvider(nlp_configuration=nlp_configuration)

    with pytest.raises(ValueError) as excinfo:
        provider.create_engine()

    expected = "Configuration should include nlp_engine_name and models"
    assert expected in str(excinfo.value)


def test_read_nlp_conf_file_invalid(tmp_path, caplog):
    """Reading a YAML file without ``ner_model_configuration`` logs a warning.

    The file is otherwise well formed, so the parsed configuration is still
    returned.
    """
    # ``model_name`` must be nested under the list item; otherwise it becomes
    # a top-level key and the model entry is incomplete.
    yaml_content = """
nlp_engine_name: spacy
models:
  - lang_code: en
    model_name: en_core_web_lg
"""
    yaml_file = tmp_path / "invalid.yaml"
    yaml_file.write_text(yaml_content)

    with caplog.at_level("WARNING"):
        config = NlpEngineProvider._read_nlp_conf(str(yaml_file))

    assert "ner_model_configuration is missing" in caplog.text
    assert config["nlp_engine_name"] == "spacy"
    yaml_file.unlink()


def test_supported_languages_only_en_warns_and_creates(tmp_path, caplog, mocker):
    """English-only ``supported_languages`` without NER config warns and works."""
    # Replace the model download hook with a stub that returns None.
    mocker.patch(
        "presidio_analyzer.nlp_engine.SpacyNlpEngine._download_spacy_model_if_needed",
        return_value=None,
    )
    # Nesting matters: ``model_name`` belongs to the list item and ``- en`` to
    # ``supported_languages``.
    yaml_content = """
nlp_engine_name: spacy
models:
  - lang_code: en
    model_name: en_core_web_lg
supported_languages:
  - en
"""
    conf_file = _write_yaml(tmp_path, yaml_content)

    # The configuration file is read and validated when the provider is
    # constructed, so construction must happen inside the capture context
    # for the asserted warning to be reliably recorded.
    with caplog.at_level("WARNING"):
        provider = NlpEngineProvider(conf_file=str(conf_file))
        engine = provider.create_engine()

    assert "ner_model_configuration is missing" in caplog.text
    assert isinstance(engine, SpacyNlpEngine)
    assert "en" in engine.nlp
    conf_file.unlink()


def test_supported_languages_non_en_raises(tmp_path, mocker):
    """Non-English ``supported_languages`` without NER config is rejected."""
    # Nesting matters: ``model_name`` belongs to the list item and ``- de`` to
    # ``supported_languages``.
    yaml_content = """
nlp_engine_name: spacy
models:
  - lang_code: de
    model_name: de_core_news_md
supported_languages:
  - de
"""
    conf_file = _write_yaml(tmp_path, yaml_content)
    # Replace the model download hook with a stub that returns None.
    mocker.patch(
        "presidio_analyzer.nlp_engine.SpacyNlpEngine._download_spacy_model_if_needed",
        return_value=None,
    )

    with pytest.raises(ValueError) as excinfo:
        NlpEngineProvider(conf_file=str(conf_file)).create_engine()

    assert "missing 'ner_model_configuration'" in str(excinfo.value)
    conf_file.unlink()



def test_recognizer_registry_only_en_warns_and_creates(tmp_path, caplog, mocker):
    """English-only registry languages without NER config warn and work."""
    # Replace the model download hook with a stub that returns None.
    mocker.patch(
        "presidio_analyzer.nlp_engine.SpacyNlpEngine._download_spacy_model_if_needed",
        return_value=None,
    )
    # ``supported_languages`` must be nested under ``recognizer_registry``;
    # flattened, the registry would parse as null.
    yaml_content = """
nlp_engine_name: spacy
models:
  - lang_code: en
    model_name: en_core_web_lg
recognizer_registry:
  supported_languages:
    - en
"""
    conf_file = _write_yaml(tmp_path, yaml_content, "recog.yaml")

    # The configuration file is read and validated when the provider is
    # constructed, so construction must happen inside the capture context
    # for the asserted warning to be reliably recorded.
    with caplog.at_level("WARNING"):
        provider = NlpEngineProvider(conf_file=str(conf_file))
        engine = provider.create_engine()

    assert "ner_model_configuration is missing" in caplog.text
    assert isinstance(engine, SpacyNlpEngine)
    conf_file.unlink()



def test_recognizer_registry_non_en_raises(tmp_path, mocker):
    """Non-English registry languages without NER config are rejected."""
    # ``supported_languages`` must be nested under ``recognizer_registry``;
    # flattened, the registry would parse as null and the language list would
    # become a top-level key, so a different code path would be exercised.
    yaml_content = """
nlp_engine_name: spacy
models:
  - lang_code: en
    model_name: en_core_web_lg
recognizer_registry:
  supported_languages:
    - fr
"""
    conf_file = _write_yaml(tmp_path, yaml_content, "recog2.yaml")
    # Replace the model download hook with a stub that returns None.
    mocker.patch(
        "presidio_analyzer.nlp_engine.SpacyNlpEngine._download_spacy_model_if_needed",
        return_value=None,
    )

    with pytest.raises(ValueError) as excinfo:
        NlpEngineProvider(conf_file=str(conf_file)).create_engine()

    assert "missing 'ner_model_configuration'" in str(excinfo.value)
    conf_file.unlink()



def test_mixed_supported_and_recognizer_non_en_raises(tmp_path, mocker):
    """Languages from both sections are merged before the English-only check."""
    # Two distinct ``supported_languages`` lists: one top-level (``en``) and
    # one nested under ``recognizer_registry`` (``de``). Without the nesting
    # the two keys would collide at the top level.
    yaml_content = """
nlp_engine_name: spacy
models:
  - lang_code: en
    model_name: en_core_web_lg
supported_languages:
  - en
recognizer_registry:
  supported_languages:
    - de
"""
    conf_file = _write_yaml(tmp_path, yaml_content, "mixed.yaml")
    # Replace the model download hook with a stub that returns None.
    mocker.patch(
        "presidio_analyzer.nlp_engine.SpacyNlpEngine._download_spacy_model_if_needed",
        return_value=None,
    )

    with pytest.raises(ValueError) as excinfo:
        NlpEngineProvider(conf_file=str(conf_file)).create_engine()

    assert "Detected languages: ['de', 'en']" in str(excinfo.value)
    conf_file.unlink()



def test_no_supported_or_recognizer_defaults_to_english(tmp_path, caplog, mocker):
    """With no language lists and no NER config, a warning is logged."""
    # Replace the model download hook with a stub that returns None.
    mocker.patch(
        "presidio_analyzer.nlp_engine.SpacyNlpEngine._download_spacy_model_if_needed",
        return_value=None,
    )
    # ``model_name`` must be nested under the list item to form one complete
    # model entry.
    yaml_content = """
nlp_engine_name: spacy
models:
  - lang_code: en
    model_name: en_core_web_lg
"""
    conf_file = _write_yaml(tmp_path, yaml_content, "none.yaml")

    # The configuration file is read and validated when the provider is
    # constructed, so construction must happen inside the capture context
    # for the asserted warning to be reliably recorded.
    with caplog.at_level("WARNING"):
        provider = NlpEngineProvider(conf_file=str(conf_file))
        engine = provider.create_engine()

    assert "ner_model_configuration is missing" in caplog.text
    assert isinstance(engine, SpacyNlpEngine)
    assert "en" in engine.nlp
    conf_file.unlink()

Loading