-
Notifications
You must be signed in to change notification settings - Fork 1k
feat: add negative_context support to reduce false positives in context-aware PII detection #1969
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 12 commits
919c957
94b30ec
dfa2f6b
27b664b
1ae1105
505dd8a
51447b0
0acd355
48acd01
f66b40e
01d9208
53c2a14
19a8e32
5d7e2e8
817517d
dd25ca5
d6f5216
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,6 +20,8 @@ class ContextAwareEnhancer: | |
| :param min_score_with_context_similarity: Minimum confidence score | ||
| :param context_prefix_count: how many words before the entity to match context | ||
| :param context_suffix_count: how many words after the entity to match context | ||
| :param negative_context_penalty: How much to reduce confidence when negative | ||
| context words are found. Default 0.3. Applied after positive context boost. | ||
| """ | ||
|
|
||
| MIN_SCORE = 0 | ||
|
|
@@ -31,11 +33,13 @@ def __init__( | |
| min_score_with_context_similarity: float, | ||
| context_prefix_count: int, | ||
| context_suffix_count: int, | ||
| negative_context_penalty: float = 0.3, | ||
| ): | ||
| self.context_similarity_factor = context_similarity_factor | ||
| self.min_score_with_context_similarity = min_score_with_context_similarity | ||
| self.context_prefix_count = context_prefix_count | ||
| self.context_suffix_count = context_suffix_count | ||
| self.negative_context_penalty = negative_context_penalty | ||
|
|
||
| @abstractmethod | ||
| def enhance_using_context( | ||
|
|
@@ -45,6 +49,7 @@ def enhance_using_context( | |
| nlp_artifacts: NlpArtifacts, | ||
| recognizers: List[EntityRecognizer], | ||
| context: Optional[List[str]] = None, | ||
| negative_context: Optional[List[str]] = None, | ||
| ) -> List[RecognizerResult]: | ||
|
Comment on lines
31
to
53
|
||
| """ | ||
| Update results in case surrounding words are relevant to the context words. | ||
|
|
@@ -62,5 +67,6 @@ def enhance_using_context( | |
| accuracy of the context enhancement process | ||
| :param recognizers: the list of recognizers | ||
| :param context: list of context words | ||
| :param negative_context: list of negative context words to reduce confidence | ||
| """ | ||
| return raw_results | ||
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -9,6 +9,7 @@ class UsSsnRecognizer(PatternRecognizer): | |||||||||||||||||||||
|
|
||||||||||||||||||||||
| :param patterns: List of patterns to be used by this recognizer | ||||||||||||||||||||||
| :param context: List of context words to increase confidence in detection | ||||||||||||||||||||||
| :param negative_context: List of context words to decrease confidence in detection | ||||||||||||||||||||||
| :param supported_language: Language this recognizer supports | ||||||||||||||||||||||
| :param supported_entity: The entity this recognizer can detect | ||||||||||||||||||||||
| """ | ||||||||||||||||||||||
|
|
@@ -32,20 +33,31 @@ class UsSsnRecognizer(PatternRecognizer): | |||||||||||||||||||||
| "ssid", | ||||||||||||||||||||||
| ] | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
| NEGATIVE_CONTEXT = [ | ||||||||||||||||||||||
| "test", | ||||||||||||||||||||||
| "example", | ||||||||||||||||||||||
| "dummy", | ||||||||||||||||||||||
| ] | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
| def __init__( | ||||||||||||||||||||||
| self, | ||||||||||||||||||||||
| patterns: Optional[List[Pattern]] = None, | ||||||||||||||||||||||
| context: Optional[List[str]] = None, | ||||||||||||||||||||||
| negative_context: Optional[List[str]] = None, | ||||||||||||||||||||||
| supported_language: str = "en", | ||||||||||||||||||||||
| supported_entity: str = "US_SSN", | ||||||||||||||||||||||
| name: Optional[str] = None, | ||||||||||||||||||||||
| ): | ||||||||||||||||||||||
| patterns = patterns if patterns else self.PATTERNS | ||||||||||||||||||||||
| context = context if context else self.CONTEXT | ||||||||||||||||||||||
| negative_context = ( | ||||||||||||||||||||||
| negative_context if negative_context else self.NEGATIVE_CONTEXT | ||||||||||||||||||||||
|
||||||||||||||||||||||
| patterns = patterns if patterns else self.PATTERNS | |
| context = context if context else self.CONTEXT | |
| negative_context = ( | |
| negative_context if negative_context else self.NEGATIVE_CONTEXT | |
| patterns = patterns if patterns is not None else self.PATTERNS | |
| context = context if context is not None else self.CONTEXT | |
| negative_context = ( | |
| negative_context | |
| if negative_context is not None | |
| else self.NEGATIVE_CONTEXT |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@TheSabari07 please change this to make sure a user can disable negative context by passing an empty list.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Adding `negative_context` between `context` and `allow_list` changes the positional-argument order for a public API (`AnalyzerEngine.analyze`). To avoid breaking callers who pass `allow_list`/`allow_list_match` positionally, consider adding the new parameter at the end (or making parameters after `entities` keyword-only).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@TheSabari07 please move it to the bottom of the parameter list