import copy
from abc import abstractmethod

from presidio_analyzer import PresidioLogger


class EntityRecognizer:
    MIN_SCORE = 0
    MAX_SCORE = 1.0
    CONTEXT_SIMILARITY_THRESHOLD = 0.65
    CONTEXT_SIMILARITY_FACTOR = 0.35
    MIN_SCORE_WITH_CONTEXT_SIMILARITY = 0.4
    CONTEXT_PREFIX_COUNT = 5
    CONTEXT_SUFFIX_COUNT = 0

    def __init__(self, supported_entities, name=None, supported_language="en",
                 version="0.0.1"):
        """
        An abstract class to be inherited by recognizers which hold the logic
        for recognizing specific PII entities.

        :param supported_entities: the entities supported by this recognizer
               (for example, phone number, address, etc.)
        :param supported_language: the language supported by this recognizer.
               The supported language code is in ISO 639-1 format
        :param name: the name of this recognizer (optional)
        :param version: the recognizer's current version
        """
        self.supported_entities = supported_entities

        if name is None:
            self.name = self.__class__.__name__  # assign class name as name
        else:
            self.name = name

        self.supported_language = supported_language
        self.version = version
        self.is_loaded = False

        self.logger = PresidioLogger()
        self.load()
        self.logger.info("Loaded recognizer: %s", self.name)
        self.is_loaded = True

    @abstractmethod
    def load(self):
        """
        Initialize the recognizer assets if needed
        (e.g. machine learning models)
        """

    @abstractmethod
    def analyze(self, text, entities, nlp_artifacts):
        """
        The core method for analyzing text, assuming the requested entities
        are a subset of this recognizer's supported entity types.

        :param text: The text to be analyzed
        :param entities: The list of entities to be detected
        :param nlp_artifacts: Value of type NlpArtifacts.
               A group of attributes which are the result of
               some NLP process over the matching text
        :return: list of RecognizerResult
        :rtype: [RecognizerResult]
        """
        return None

    def get_supported_entities(self):
        """
        :return: A list of the entities supported by this recognizer
        """
        return self.supported_entities

    def get_supported_language(self):
        """
        :return: The language supported by this recognizer
        """
        return self.supported_language

    def get_version(self):
        """
        :return: The current version of this recognizer
        """
        return self.version

    def to_dict(self):
        """
        :return: A dict containing this recognizer's configuration
        """
        return_dict = {"supported_entities": self.supported_entities,
                       "supported_language": self.supported_language,
                       "name": self.name,
                       "version": self.version}
        return return_dict

    @classmethod
    def from_dict(cls, entity_recognizer_dict):
        """
        :param entity_recognizer_dict: A dict in the format
               produced by to_dict()
        """
        return cls(**entity_recognizer_dict)
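
    # A hedged round-trip sketch (illustrative, names are not part of this
    # file): because from_dict() simply unpacks the dict produced by
    # to_dict(), a recognizer's configuration can be serialized and rebuilt:
    #   rec_dict = recognizer.to_dict()
    #   clone = type(recognizer).from_dict(rec_dict)
    #   assert clone.supported_entities == recognizer.supported_entities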

    def enhance_using_context(self, text, raw_results,
                              nlp_artifacts, recognizer_context_words):
        """
        Using the words surrounding the actual matches, look for specific
        strings that, if found, contribute to the score of the result,
        improving the confidence that the match is indeed of that PII
        entity type.

        :param text: The actual text that was analyzed
        :param raw_results: Recognizer results which didn't take
               context into consideration
        :param nlp_artifacts: The NLP artifacts contain elements
               such as lemmatized tokens for better accuracy of the
               context enhancement process
        :param recognizer_context_words: The context words the current
               recognizer supports (words to look up)
        """
        # create a deep copy of the results object so we can manipulate it
        results = copy.deepcopy(raw_results)

        # Sanity
        if nlp_artifacts is None:
            self.logger.warning('[%s]. NLP artifacts were not provided',
                                self.name)
            return results
        if recognizer_context_words is None or recognizer_context_words == []:
            self.logger.info("recognizer '%s' does not support context "
                             "enhancement", self.name)
            return results

        for result in results:
            # extract lemmatized context from the surroundings of the match
            word = text[result.start:result.end]
            surrounding_words = self.__extract_surrounding_words(
                nlp_artifacts=nlp_artifacts,
                word=word,
                start=result.start)

            supportive_context_word = self.__find_supportive_word_in_context(
                surrounding_words, recognizer_context_words)
            if supportive_context_word != "":
                result.score += self.CONTEXT_SIMILARITY_FACTOR
                result.score = max(
                    result.score,
                    self.MIN_SCORE_WITH_CONTEXT_SIMILARITY)
                result.score = min(
                    result.score,
                    EntityRecognizer.MAX_SCORE)

                # Update the explainability object: context information
                # helped improve the score
                result.analysis_explanation.set_supportive_context_word(
                    supportive_context_word)
                result.analysis_explanation.set_improved_score(result.score)
        return results
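
    # Worked example of the score arithmetic above (illustrative numbers):
    # a raw result with score 0.3 that has a supportive context word gets
    # 0.3 + CONTEXT_SIMILARITY_FACTOR (0.35) = 0.65, which is already above
    # MIN_SCORE_WITH_CONTEXT_SIMILARITY (0.4) and below MAX_SCORE (1.0), so
    # the final score is 0.65.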

    @staticmethod
    def __context_to_keywords(context):
        return context.split(' ')

    def __find_supportive_word_in_context(self,
                                          context_list,
                                          recognizer_context_list):
        """
        A word is considered a supportive context word if one of the
        recognizer's predefined context keywords appears, exactly or as a
        substring, in any of the collected context words.

        :param context_list: words before and after the matched entity
               within a specified window size
        :param recognizer_context_list: a list of words considered as
               context keywords, manually specified by the recognizer's
               author
        """
        word = ""
        # If either list is empty, there is no need to continue
        if context_list is None or recognizer_context_list is None:
            return word

        for predefined_context_word in recognizer_context_list:
            # result == True only if the predefined context word is found
            # exactly or as a substring in any of the collected context
            # words
            result = \
                next((True for keyword in context_list
                      if predefined_context_word in keyword), False)
            if result:
                self.logger.debug("Found context keyword '%s'",
                                  predefined_context_word)
                word = predefined_context_word
                break
        return word
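
    # Example (illustrative): with context_list = ["call", "phones"] and
    # recognizer_context_list = ["phone"], "phone" is returned because it
    # appears as a substring of the collected context word "phones".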

    @staticmethod
    def __add_n_words(index,
                      n_words,
                      lemmas,
                      lemmatized_filtered_keywords,
                      is_backward):
        """
        Prepare a list of context words surrounding the lemma at a given
        index. The words are collected only if they exist in the filtered
        keywords array.

        :param index: index of the lemma whose surrounding words we want
        :param n_words: number of words to take
        :param lemmas: array of lemmas
        :param lemmatized_filtered_keywords: the array of filtered
               lemmas from the original sentence
        :param is_backward: if True, take the preceding words; if False,
               take the following words
        """
        i = index
        context_words = []
        # The entity itself is of no interest to us... however, we want to
        # consider it anyway for cases where it is attached with no spaces
        # to an interesting context word, so we allow it and add 1 to
        # the number of collected words.
        # Collect at most n words (in lower case).
        remaining = n_words + 1
        while 0 <= i < len(lemmas) and remaining > 0:
            lower_lemma = lemmas[i].lower()
            if lower_lemma in lemmatized_filtered_keywords:
                context_words.append(lower_lemma)
                remaining -= 1
            i = i - 1 if is_backward else i + 1
        return context_words
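
    # Example trace (illustrative, assuming all lemmas pass the keyword
    # filter): with lemmas = ["my", "phone", "number", "be", "555"],
    # index = 4 (the match "555"), n_words = 2 and is_backward = True, the
    # loop collects "555", "be" and "number", i.e. the match itself plus
    # the two preceding context words.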

    def __add_n_words_forward(self,
                              index,
                              n_words,
                              lemmas,
                              lemmatized_filtered_keywords):
        return self.__add_n_words(
            index,
            n_words,
            lemmas,
            lemmatized_filtered_keywords,
            False)

    def __add_n_words_backward(self,
                               index,
                               n_words,
                               lemmas,
                               lemmatized_filtered_keywords):
        return self.__add_n_words(
            index,
            n_words,
            lemmas,
            lemmatized_filtered_keywords,
            True)

    @staticmethod
    def find_index_of_match_token(word, start, tokens, tokens_indices):
        found = False
        # We use the known start index of the original word to find the
        # actual token at that index. We are not checking for equivalence,
        # since the token might be just a substring of that word (e.g. for
        # the phone number 555-124564 the first token might be just '555',
        # or for a match like ' rocket' the actual token will just be
        # 'rocket', hence the misalignment of indices).
        # Note: we are iterating over the original tokens (not the
        # lemmatized ones).
        i = -1
        for i, token in enumerate(tokens):
            # Either we found a token at the exact location, or we take a
            # token whose character indices cover the index we are looking
            # for.
            if ((tokens_indices[i] == start) or
                    (start < tokens_indices[i] + len(token))):
                # Found the interesting token, the one around which we take
                # n words; we save the matching index.
                found = True
                break
        if not found:
            raise ValueError("Did not find word '" + word + "' "
                             "in the list of tokens although it "
                             "is expected to be found")
        return i
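
    # Example (illustrative): for the text "call 555-124564" the tokenizer
    # may split the match "555-124564" into several tokens. Given a start
    # offset pointing at "555", the loop returns the index of the first
    # token whose character span covers that offset.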

    def __extract_surrounding_words(self, nlp_artifacts, word, start):
        """
        Extracts the words surrounding a given word.
        The text from which the context is extracted is given in the NLP
        doc.

        :param nlp_artifacts: An abstraction layer which holds different
               items which are the result of an NLP pipeline execution on
               a given text
        :param word: The word to look for context around
        :param start: The start index of the word in the original text
        """
        if not nlp_artifacts.tokens:
            self.logger.info('Skipping context extraction due to '
                             'lack of NLP artifacts')
            # Without NLP artifacts we cannot extract context, so we
            # return a valid, yet empty, context
            return []

        # Get the already prepared words in the given text, in their
        # LEMMATIZED form
        lemmatized_keywords = nlp_artifacts.keywords

        # Since the list of tokens is not necessarily aligned with the
        # actual index of the match, we look for the token index which
        # corresponds to the match
        token_index = EntityRecognizer.find_index_of_match_token(
            word,
            start,
            nlp_artifacts.tokens,
            nlp_artifacts.tokens_indices)

        # The token at token_index belongs to the PII entity; take the
        # preceding n words and the following m words into a context list
        backward_context = \
            self.__add_n_words_backward(token_index,
                                        EntityRecognizer.CONTEXT_PREFIX_COUNT,
                                        nlp_artifacts.lemmas,
                                        lemmatized_keywords)
        forward_context = \
            self.__add_n_words_forward(token_index,
                                       EntityRecognizer.CONTEXT_SUFFIX_COUNT,
                                       nlp_artifacts.lemmas,
                                       lemmatized_keywords)
        context_list = []
        context_list.extend(backward_context)
        context_list.extend(forward_context)
        context_list = list(set(context_list))
        self.logger.debug('Context list is: %s', " ".join(context_list))
        return context_list
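

# A minimal usage sketch. The concrete recognizer below is illustrative
# only; RecognizerResult and its constructor signature
# (entity_type, start, end, score) are assumptions about the surrounding
# presidio-analyzer package and may differ between versions.
if __name__ == "__main__":
    from presidio_analyzer import RecognizerResult

    class TitleRecognizer(EntityRecognizer):
        """Toy recognizer that flags a fixed list of honorific titles."""

        def load(self):
            # No external assets (e.g. ML models) are needed here
            pass

        def analyze(self, text, entities, nlp_artifacts):
            results = []
            if "TITLE" not in entities:
                return results
            for title in ("Mr.", "Mrs.", "Dr."):
                start = text.find(title)
                if start >= 0:
                    results.append(RecognizerResult(
                        "TITLE", start, start + len(title), 0.5))
            return results

    recognizer = TitleRecognizer(supported_entities=["TITLE"])
    print(recognizer.analyze("Dr. Smith called.", ["TITLE"], None))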