-
Notifications
You must be signed in to change notification settings - Fork 998
Expand file tree
/
Copy pathnlp_engine_provider.py
More file actions
128 lines (109 loc) · 4.46 KB
/
nlp_engine_provider.py
File metadata and controls
128 lines (109 loc) · 4.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import logging
from pathlib import Path
from typing import Dict, Optional, Tuple, Union
import yaml
from presidio_analyzer.input_validation import ConfigurationValidator
from presidio_analyzer.nlp_engine import (
NerModelConfiguration,
NlpEngine,
SlimSpacyNlpEngine,
SpacyNlpEngine,
StanzaNlpEngine,
TransformersNlpEngine,
)
# Module-level logger, obtained under the "presidio-analyzer" name.
logger = logging.getLogger("presidio-analyzer")
class NlpEngineProvider:
    """Create different NLP engines from configuration.

    The configuration is taken from exactly one source: the
    ``nlp_configuration`` dict, the YAML file at ``conf_file``, or, when
    neither is given, ``../conf/default.yaml`` relative to this module's
    directory.

    :param nlp_engines: Tuple of NLP engine classes to choose from.
        Default: (SpacyNlpEngine, StanzaNlpEngine, TransformersNlpEngine,
        SlimSpacyNlpEngine). Only classes whose ``is_available`` is truthy
        are registered, keyed by their ``engine_name``.
    :param conf_file: Path to yaml file containing nlp engine configuration.
    :param nlp_configuration: Dict containing nlp configuration
    :example: configuration:
            {
                "nlp_engine_name": "spacy",
                "models": [{"lang_code": "en",
                            "model_name": "en_core_web_lg"
                          }]
            }
    :raises ValueError: If both ``conf_file`` and ``nlp_configuration`` are
        provided, if ``conf_file`` is an empty string, or if
        ``nlp_configuration`` is an empty mapping and no ``conf_file`` is
        given.
    """

    def __init__(
        self,
        nlp_engines: Optional[Tuple] = None,
        conf_file: Optional[Union[Path, str]] = None,
        nlp_configuration: Optional[Dict] = None,
    ):
        # Reject contradictory or empty arguments before doing any work.
        if conf_file and nlp_configuration:
            raise ValueError(
                "Either conf_file or nlp_configuration should be provided, not both."
            )
        if conf_file == "":
            raise ValueError("conf_file is empty")
        if not conf_file and nlp_configuration is not None and not nlp_configuration:
            # An empty (but not None) configuration used to match none of the
            # source branches, leaving self.nlp_configuration unset.
            raise ValueError("nlp_configuration is empty")

        if nlp_engines is None:
            nlp_engines = (
                SpacyNlpEngine,
                StanzaNlpEngine,
                TransformersNlpEngine,
                SlimSpacyNlpEngine,
            )

        # Registry of usable engine classes, keyed by engine name.
        self.nlp_engines = {
            engine.engine_name: engine for engine in nlp_engines if engine.is_available
        }
        logger.debug(
            f"Loaded these available nlp engines: {list(self.nlp_engines.keys())}"
        )

        # Pick exactly one configuration source.
        if nlp_configuration:
            self.nlp_configuration = nlp_configuration
        elif conf_file:
            ConfigurationValidator.validate_file_path(conf_file)
            self.nlp_configuration = self._read_nlp_conf(conf_file)
        else:
            conf_file = self._get_full_conf_path()
            logger.debug(f"Reading default conf file from {conf_file}")
            self.nlp_configuration = self._read_nlp_conf(conf_file)

        # Validate once, whatever the source, so create_engine() can rely on it.
        ConfigurationValidator.validate_nlp_configuration(self.nlp_configuration)

    @staticmethod
    def _read_nlp_conf(conf_file: Union[Path, str]) -> Dict:
        """Read NLP configuration from a YAML file.

        :param conf_file: Path of the YAML file to parse.
        :return: The object produced by ``yaml.safe_load`` for that file.
        """
        # Explicit encoding: do not depend on the platform's locale default.
        with open(conf_file, encoding="utf-8") as file:
            return yaml.safe_load(file)

    @staticmethod
    def _get_full_conf_path(
        default_conf_file: Union[Path, str] = "default.yaml",
    ) -> Path:
        """Return a Path to the default conf file.

        :param default_conf_file: File name to look up inside the ``conf``
            directory that sits beside this module's directory.
        :return: ``<this module's directory>/../conf/<default_conf_file>``.
        """
        return Path(Path(__file__).parent, "../conf", default_conf_file)

    def create_engine(self) -> "NlpEngine":
        """Create and load the NLP engine named in the configuration.

        :return: The engine instance, after its ``load()`` has been called.
        :raises ValueError: If the configured engine name is not among the
            registered (available) engines.
        """
        # Configuration is already validated (ConfigurationValidator) in __init__
        nlp_engine_name = self.nlp_configuration["nlp_engine_name"]
        if nlp_engine_name not in self.nlp_engines:
            raise ValueError(
                f"NLP engine '{nlp_engine_name}' is not available. "
                "Make sure you have all required packages installed"
            )

        nlp_engine_class = self.nlp_engines[nlp_engine_name]
        nlp_models = self.nlp_configuration["models"]

        if nlp_engine_name == SlimSpacyNlpEngine.engine_name:
            # The slim engine is built with a tokenizer setting rather than
            # an NER model configuration.
            generic_tokenizer = self.nlp_configuration.get("generic_tokenizer")
            engine = nlp_engine_class(
                models=nlp_models, generic_tokenizer=generic_tokenizer
            )
        else:
            ner_model_configuration = self.nlp_configuration.get(
                "ner_model_configuration"
            )
            if ner_model_configuration:
                # Convert the raw mapping into a NerModelConfiguration object;
                # a missing/empty value is passed through unchanged.
                ner_model_configuration = NerModelConfiguration.from_dict(
                    ner_model_configuration
                )
            engine = nlp_engine_class(
                models=nlp_models, ner_model_configuration=ner_model_configuration
            )

        engine.load()
        logger.info(
            f"Created NLP engine: {engine.engine_name}. "
            f"Loaded models: {list(engine.nlp.keys())}"
        )
        return engine