Skip to content

Commit ba489b0

Browse files
abdokaseb and asofter authored
Fix: Avoid unnecessary zh model download and load when language is English (#227) (#268)
Co-authored-by: Oleksandr Yaremchuk <[email protected]>
1 parent 70c902a commit ba489b0

File tree

2 files changed

+12
-2
lines changed

2 files changed

+12
-2
lines changed

llm_guard/input_scanners/anonymize.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ def __init__(
122122
recognizer=transformers_recognizer,
123123
regex_groups=get_regex_patterns(regex_patterns),
124124
custom_names=hidden_names,
125-
supported_languages=ALL_SUPPORTED_LANGUAGES,
125+
supported_languages=list(set(["en", language])),
126126
)
127127

128128
def _remove_conflicts_and_get_text_manipulation_data(

llm_guard/output_scanners/sensitive.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from presidio_anonymizer import AnonymizerEngine
44

5+
from llm_guard.exception import LLMGuardValidationError
56
from llm_guard.input_scanners.anonymize import (
67
ALL_SUPPORTED_LANGUAGES,
78
DEFAULT_ENTITY_TYPES,
@@ -42,6 +43,7 @@ def __init__(
4243
recognizer_conf: NERConfig | None = None,
4344
threshold: float = 0.5,
4445
use_onnx: bool = False,
46+
language: str = "en",
4547
) -> None:
4648
"""
4749
Initializes an instance of the Sensitive class.
@@ -55,6 +57,11 @@ def __init__(
5557
threshold (float): Acceptance threshold. Default is 0.5.
5658
use_onnx (bool): Use ONNX model for inference. Default is False.
5759
"""
60+
if language not in ALL_SUPPORTED_LANGUAGES:
61+
raise LLMGuardValidationError(
62+
f"Language must be in the list of allowed: {ALL_SUPPORTED_LANGUAGES}"
63+
)
64+
5865
if not entity_types:
5966
LOGGER.debug(
6067
"No entity types provided, using default", default_entity_types=DEFAULT_ENTITY_TYPES
@@ -74,7 +81,10 @@ def __init__(
7481
use_onnx=use_onnx,
7582
)
7683
self._analyzer = get_analyzer(
77-
transformers_recognizer, get_regex_patterns(regex_patterns), [], ALL_SUPPORTED_LANGUAGES
84+
transformers_recognizer,
85+
get_regex_patterns(regex_patterns),
86+
[],
87+
list(set(["en", language])),
7888
)
7989
self._anonymizer = AnonymizerEngine()
8090

0 commit comments

Comments
 (0)