Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion presidio-analyzer/install_nlp_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def install_models(conf_file: str) -> None:


def _download_model(engine_name: str, model_name: Union[str, Dict[str, str]]) -> None:
if engine_name == "spacy":
if engine_name == "spacy" or engine_name == "slim":
spacy_download(model_name)
elif engine_name == "stanza":
if stanza:
Expand Down
120 changes: 120 additions & 0 deletions presidio-analyzer/presidio_analyzer/conf/slim.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
# Analyzer configuration for the "slim" NLP engine (English only).
# The NLP engine is given the `en_core_web_sm` model; NER-based detection is
# delegated to GLiNERRecognizer, and the remaining entries are predefined
# recognizers, several of which are switched off with `enabled: false`.
supported_languages:
- en
# NOTE(review): a threshold of 0 presumably means no result is dropped for a
# low score - confirm against the analyzer's filtering logic.
default_score_threshold: 0
nlp_configuration:
nlp_engine_name: slim
models:
- lang_code: en
model_name: en_core_web_sm

recognizer_registry:
# Optional override, intentionally left commented out.
# 26 == 2 + 8 + 16, i.e. re.IGNORECASE | re.MULTILINE | re.DOTALL in Python's
# `re` module - presumably how the registry interprets it; confirm before enabling.
# global_regex_flags: 26
recognizers:
# GLiNERRecognizer replaces SpacyRecognizer for NER-based entity detection.
# The slim NLP engine provides only tokenization and lemmatization;
# entity extraction is handled by self-contained recognizers like GLiNER.
- name: GLiNERRecognizer
type: predefined
enabled: true

# Entries without an `enabled` key rely on the registry's default
# (presumably enabled - confirm against the registry loader).
# This entry uses the mapping form of supported_languages, which attaches
# per-language context words to the recognizer.
- name: CreditCardRecognizer
supported_languages:
- language: en
context: [credit, card, visa, mastercard, cc, amex, discover, jcb, diners, maestro, instapayment]
type: predefined

- name: UsBankRecognizer
type: predefined

- name: UsLicenseRecognizer
type: predefined

- name: UsItinRecognizer
type: predefined

- name: UsPassportRecognizer
type: predefined

- name: UsSsnRecognizer
type: predefined

- name: NhsRecognizer
type: predefined

# The region-prefixed recognizers below (Uk*, Sg*, Au*, In*) are explicitly
# disabled in this configuration.
- name: UkNinoRecognizer
type: predefined
enabled: false

- name: UkPassportRecognizer
type: predefined
enabled: false

- name: SgFinRecognizer
type: predefined
enabled: false

- name: AuAbnRecognizer
type: predefined
enabled: false

- name: AuAcnRecognizer
type: predefined
enabled: false

- name: AuTfnRecognizer
type: predefined
enabled: false

- name: AuMedicareRecognizer
type: predefined
enabled: false

- name: InPanRecognizer
type: predefined
enabled: false

# Plain-list form of supported_languages (language code only, no context words).
- name: InAadhaarRecognizer
supported_languages:
- en
type: predefined
enabled: false

- name: InVehicleRegistrationRecognizer
type: predefined
enabled: false

- name: InPassportRecognizer
type: predefined
enabled: false

# Recognizers with no region prefix in their name; none set `enabled`.
- name: CryptoRecognizer
type: predefined

- name: DateRecognizer
type: predefined

- name: EmailRecognizer
type: predefined

- name: IbanRecognizer
type: predefined

- name: IpRecognizer
type: predefined

- name: MedicalLicenseRecognizer
type: predefined

- name: PhoneRecognizer
type: predefined

- name: UrlRecognizer
type: predefined

# Two further In* recognizers, also explicitly disabled.
- name: InVoterRecognizer
type: predefined
enabled: false

- name: InGstinRecognizer
type: predefined
enabled: false
5 changes: 5 additions & 0 deletions presidio-analyzer/presidio_analyzer/conf/slim_nlp.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Stand-alone NLP engine configuration selecting the "slim" engine.
# It declares a single model entry: `en_core_web_sm` for English (`en`).
nlp_engine_name: slim
models:
-
lang_code: en
model_name: en_core_web_sm
2 changes: 2 additions & 0 deletions presidio-analyzer/presidio_analyzer/nlp_engine/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from .ner_model_configuration import NerModelConfiguration
from .nlp_artifacts import NlpArtifacts
from .nlp_engine import NlpEngine
from .slim_spacy_nlp_engine import SlimSpacyNlpEngine
from .spacy_nlp_engine import SpacyNlpEngine
from .stanza_nlp_engine import StanzaNlpEngine
from .transformers_nlp_engine import TransformersNlpEngine
Expand All @@ -15,6 +16,7 @@
"NerModelConfiguration",
"NlpArtifacts",
"NlpEngine",
"SlimSpacyNlpEngine",
"SpacyNlpEngine",
"StanzaNlpEngine",
"NlpEngineProvider",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from presidio_analyzer.nlp_engine import (
NerModelConfiguration,
NlpEngine,
SlimSpacyNlpEngine,
SpacyNlpEngine,
StanzaNlpEngine,
TransformersNlpEngine,
Expand All @@ -19,7 +20,7 @@
class NlpEngineProvider:
"""Create different NLP engines from configuration.

:param nlp_engines: List of available NLP engines.
:param nlp_engines: List of available NLP engines
Default: (SpacyNlpEngine, StanzaNlpEngine,
TransformersNlpEngine, SlimSpacyNlpEngine)
:param nlp_configuration: Dict containing nlp configuration
:example: configuration:
Expand All @@ -40,7 +41,12 @@ def __init__(
nlp_configuration: Optional[Dict] = None,
):
if nlp_engines is None:
nlp_engines = (SpacyNlpEngine, StanzaNlpEngine, TransformersNlpEngine)
nlp_engines = (
SpacyNlpEngine,
StanzaNlpEngine,
TransformersNlpEngine,
SlimSpacyNlpEngine,
)

self.nlp_engines = {
engine.engine_name: engine for engine in nlp_engines if engine.is_available
Expand Down Expand Up @@ -76,10 +82,9 @@ def _read_nlp_conf(conf_file: Union[Path, str]) -> Dict:
with open(conf_file) as file:
return yaml.safe_load(file)


@staticmethod
def _get_full_conf_path(
default_conf_file: Union[Path, str] = "default.yaml"
default_conf_file: Union[Path, str] = "default.yaml",
) -> Path:
"""Return a Path to the default conf file."""
return Path(Path(__file__).parent, "../conf", default_conf_file)
Expand All @@ -97,15 +102,24 @@ def create_engine(self) -> NlpEngine:
nlp_engine_class = self.nlp_engines[nlp_engine_name]
nlp_models = self.nlp_configuration["models"]

ner_model_configuration = self.nlp_configuration.get("ner_model_configuration")
if ner_model_configuration:
ner_model_configuration = NerModelConfiguration.from_dict(
ner_model_configuration
if nlp_engine_name == SlimSpacyNlpEngine.engine_name:
generic_tokenizer = self.nlp_configuration.get("generic_tokenizer")
engine = nlp_engine_class(
models=nlp_models, generic_tokenizer=generic_tokenizer
)
else:
ner_model_configuration = self.nlp_configuration.get(
"ner_model_configuration"
)
if ner_model_configuration:
ner_model_configuration = NerModelConfiguration.from_dict(
ner_model_configuration
)

engine = nlp_engine_class(
models=nlp_models, ner_model_configuration=ner_model_configuration
)

engine = nlp_engine_class(
models=nlp_models, ner_model_configuration=ner_model_configuration
)
engine.load()
logger.info(
f"Created NLP engine: {engine.engine_name}. "
Expand Down
Loading
Loading