Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ All notable changes to this project will be documented in this file.

- Added recognizer for Swedish Organisationsnummer, the ID number for all Swedish organisations.

#### Fixed
- `BasicLangExtractRecognizer` now honours values under `langextract.model.provider.language_model_params` (including `timeout` and `num_ctx`). Previously these were silently dropped because `langextract.extract()` ignores its `language_model_params` argument when a pre-built `ModelConfig` is passed via `config=`, causing Ollama-backed recognizers to fall back to langextract's 120s default regardless of the configured timeout. The recognizer now merges `language_model_params` into `ModelConfig.provider_kwargs`, which is the path that reaches the provider constructor. Explicit entries under `provider.kwargs:` still take precedence.

## [2.2.362] - 2026-03-15
### General
#### Added
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,14 +62,25 @@ def __init__(

self.model_id = model_config.get("model_id")
self.provider = provider_config.get("name")
self.provider_kwargs = provider_config.get("kwargs", {})
self.provider_kwargs = dict(provider_config.get("kwargs", {}))
Copy link

Copilot AI Apr 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

dict(provider_config.get("kwargs", {})) will raise a TypeError if the YAML contains kwargs: null (or an empty kwargs: key). Consider using dict(provider_config.get("kwargs") or {}) (or otherwise normalizing/validating the value) so missing/empty kwargs are treated as an empty mapping and the error message remains actionable for users.

Suggested change
self.provider_kwargs = dict(provider_config.get("kwargs", {}))
self.provider_kwargs = dict(provider_config.get("kwargs") or {})

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agree with Copilot.


# Not ideal, but update _extract_params now that self.config is fully loaded.
self._extract_params.update(provider_config.get("extract_params", {}))
self._language_model_params.update(
provider_config.get("language_model_params", {})
)
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`.update(None)` crashes if the YAML key is present but empty,
so we should add the `or {}` here as well, like Copilot suggested above.


# Surface language_model_params on the ModelConfig itself.
# langextract.extract() honours `language_model_params` only when
# `config` is NOT passed (see langextract/extraction.py elif config:
# branch). Because _get_provider_params() returns a pre-built
# ModelConfig, values like `timeout` and `num_ctx` would otherwise be
# silently dropped. Merge them into provider_kwargs so they reach
# the provider constructor (e.g. OllamaLanguageModel(timeout=...)).
# `setdefault` ensures explicit `provider.kwargs:` entries always win.
for key, value in self._language_model_params.items():
self.provider_kwargs.setdefault(key, value)

if not self.provider:
raise ValueError("Configuration must contain "
"'langextract.model.provider.name'")
Expand Down
65 changes: 65 additions & 0 deletions presidio-analyzer/tests/test_basic_langextract_recognizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -574,3 +574,68 @@ def test_when_analyze_called_then_params_passed_to_langextract(self, tmp_path):
assert call_kwargs["config"].provider == "ollama"
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we add a test where `kwargs: null` is set in the YAML and check that it doesn't crash?

assert call_kwargs["config"].provider_kwargs["model_url"] == "http://localhost:11434"

# Regression: language_model_params must also surface on
# ModelConfig.provider_kwargs, because langextract.extract()
# ignores `language_model_params` when `config` is passed and
# only reads values from ModelConfig.provider_kwargs in that
# branch. Without this, `timeout` and `num_ctx` are silently
# dropped and Ollama falls back to its 120s default.
assert call_kwargs["config"].provider_kwargs["timeout"] == 180
assert call_kwargs["config"].provider_kwargs["num_ctx"] == 8192

def test_language_model_params_reach_provider_kwargs(self, tmp_path):
"""Regression test: values under provider.language_model_params in the
yaml must end up on ModelConfig.provider_kwargs so they actually reach
the provider constructor (e.g. OllamaLanguageModel(timeout=...)).

Prior to the fix, BasicLangExtractRecognizer only copied
provider.kwargs onto ModelConfig.provider_kwargs, and
provider.language_model_params was forwarded to lx.extract() as a
separate argument — but langextract.extract() ignores that argument
when config is passed directly, so values like `timeout` and
`num_ctx` were silently dropped.

The test sets `timeout` and `num_ctx` under language_model_params,
writes the config to a temporary YAML file, builds the recognizer from
that file, and asserts both values appear in the `provider_kwargs` of
the object returned under the "config" key by _get_provider_params().
"""
import yaml

# Start from the shared test config and override only the two
# language_model_params values whose propagation is being checked.
config = create_test_config()
config["langextract"]["model"]["provider"]["language_model_params"]["timeout"] = 600
config["langextract"]["model"]["provider"]["language_model_params"]["num_ctx"] = 16384

# Serialize the modified config under pytest's tmp_path; the file path is
# what gets handed to the recognizer below.
config_file = tmp_path / "test_config.yaml"
with open(config_file, 'w') as f:
yaml.dump(config, f)

# Replace the `lx` name in langextract_helper with a mock while the
# recognizer is imported and constructed.
# NOTE(review): indentation is not visible in this view, so it is unclear
# whether the lookups/assertions below run inside this patch context —
# confirm against the repository before re-indenting.
with patch('presidio_analyzer.llm_utils.langextract_helper.lx',
return_value=Mock()):
from presidio_analyzer.predefined_recognizers.third_party.basic_langextract_recognizer import BasicLangExtractRecognizer
recognizer = BasicLangExtractRecognizer(config_path=str(config_file))

# Both overridden values must be present on the config's provider_kwargs.
provider_kwargs = recognizer._get_provider_params()["config"].provider_kwargs
assert provider_kwargs["timeout"] == 600
assert provider_kwargs["num_ctx"] == 16384

def test_provider_kwargs_take_precedence_over_language_model_params(self, tmp_path):
"""Explicit `provider.kwargs:` entries must win over values of the
same name under `provider.language_model_params:`. This preserves
backward compatibility for configs that already place timeout in
`kwargs:` as a workaround.

The test sets `timeout` to 900 under `kwargs` and to 60 under
`language_model_params`, then asserts that the `provider_kwargs` of the
object returned under the "config" key by _get_provider_params()
carries 900.
"""
import yaml

# Put conflicting `timeout` values in the two config sections so the
# assertion can tell which one was kept.
config = create_test_config()
config["langextract"]["model"]["provider"]["kwargs"]["timeout"] = 900
config["langextract"]["model"]["provider"]["language_model_params"]["timeout"] = 60

# Serialize the modified config under pytest's tmp_path; the file path is
# what gets handed to the recognizer below.
config_file = tmp_path / "test_config.yaml"
with open(config_file, 'w') as f:
yaml.dump(config, f)

# Replace the `lx` name in langextract_helper with a mock while the
# recognizer is imported and constructed.
# NOTE(review): indentation is not visible in this view, so it is unclear
# whether the lookup/assertion below runs inside this patch context —
# confirm against the repository before re-indenting.
with patch('presidio_analyzer.llm_utils.langextract_helper.lx',
return_value=Mock()):
from presidio_analyzer.predefined_recognizers.third_party.basic_langextract_recognizer import BasicLangExtractRecognizer
recognizer = BasicLangExtractRecognizer(config_path=str(config_file))

provider_kwargs = recognizer._get_provider_params()["config"].provider_kwargs
# The explicit `kwargs:` value (900) wins over language_model_params (60).
assert provider_kwargs["timeout"] == 900

Loading