diff --git a/CHANGELOG.md b/CHANGELOG.md
index a0e1d51..43449e8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 
+## [1.0.15] 2025-04-24
+
+- Update default thresholds for response helpfulness to 0.23 in `Validator` API.
+
 ## [1.0.14] 2025-04-23
 
 - Update `codex-sdk` dependency to `0.1.0-alpha.17`.
 - Capture data for the number of times the validator API is called on a Codex project.
@@ -71,7 +75,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Initial release of the `cleanlab-codex` client library.
 
-[Unreleased]: https://github.com/cleanlab/cleanlab-codex/compare/v1.0.14...HEAD
+[Unreleased]: https://github.com/cleanlab/cleanlab-codex/compare/v1.0.15...HEAD
+[1.0.15]: https://github.com/cleanlab/cleanlab-codex/compare/v1.0.14...v1.0.15
 [1.0.14]: https://github.com/cleanlab/cleanlab-codex/compare/v1.0.13...v1.0.14
 [1.0.13]: https://github.com/cleanlab/cleanlab-codex/compare/v1.0.12...v1.0.13
 [1.0.12]: https://github.com/cleanlab/cleanlab-codex/compare/v1.0.11...v1.0.12
diff --git a/src/cleanlab_codex/__about__.py b/src/cleanlab_codex/__about__.py
index a8f1005..8fdfd8d 100644
--- a/src/cleanlab_codex/__about__.py
+++ b/src/cleanlab_codex/__about__.py
@@ -1,2 +1,2 @@
 # SPDX-License-Identifier: MIT
-__version__ = "1.0.14"
+__version__ = "1.0.15"
diff --git a/src/cleanlab_codex/validator.py b/src/cleanlab_codex/validator.py
index e641b2c..c18922f 100644
--- a/src/cleanlab_codex/validator.py
+++ b/src/cleanlab_codex/validator.py
@@ -315,7 +315,7 @@ class BadResponseThresholds(BaseModel):
     )
     response_helpfulness: float = Field(
         description="Threshold for response helpfulness.",
-        default=0.7,
+        default=0.23,
         ge=0.0,
         le=1.0,
     )
diff --git a/tests/test_validator.py b/tests/test_validator.py
index 1eaef34..574ed31 100644
--- a/tests/test_validator.py
+++ b/tests/test_validator.py
@@ -19,7 +19,7 @@ def test_get_threshold(self) -> None:
     def test_default_threshold(self) -> None:
         thresholds = BadResponseThresholds()
         assert thresholds.get_threshold("trustworthiness") == 0.7
-        assert thresholds.get_threshold("response_helpfulness") == 0.7
+        assert thresholds.get_threshold("response_helpfulness") == 0.23
 
     def test_unspecified_threshold(self) -> None:
         thresholds = BadResponseThresholds()
@@ -147,7 +147,7 @@ def test_user_provided_thresholds(self, mock_project: Mock, mock_trustworthy_rag
         # Test with user-provided thresholds that match evals
         validator = Validator(codex_access_key="test", bad_response_thresholds={"trustworthiness": 0.6})
         assert_threshold_equal(validator, "trustworthiness", 0.6)
-        assert_threshold_equal(validator, "response_helpfulness", 0.7)
+        assert_threshold_equal(validator, "response_helpfulness", 0.23)
 
         # Test with extra thresholds that should raise ValueError
         with pytest.raises(ValueError, match="Found thresholds for metrics that are not available"):
@@ -157,7 +157,7 @@ def test_default_thresholds(self, mock_project: Mock, mock_trustworthy_rag: Mock
         # Test with default thresholds (bad_response_thresholds is None)
         validator = Validator(codex_access_key="test")
         assert_threshold_equal(validator, "trustworthiness", 0.7)
-        assert_threshold_equal(validator, "response_helpfulness", 0.7)
+        assert_threshold_equal(validator, "response_helpfulness", 0.23)
 
     def test_edge_cases(self, mock_project: Mock, mock_trustworthy_rag: Mock) -> None:  # noqa: ARG002
         # Note, the `"evals"` field should not be a list of strings in practice, but an Eval from cleanlab_tlm
@@ -173,7 +173,7 @@ def test_edge_cases(self, mock_project: Mock, mock_trustworthy_rag: Mock) -> Non
         # No extra Evals
         validator = Validator(codex_access_key="test", trustworthy_rag_config={"evals": []})
         assert_threshold_equal(validator, "trustworthiness", 0.7)  # Default should apply
-        assert_threshold_equal(validator, "response_helpfulness", 0.7)  # Default should apply
+        assert_threshold_equal(validator, "response_helpfulness", 0.23)  # Default should apply
 
         # Test with non-existent evals in trustworthy_rag_config
         with pytest.raises(ValueError, match="Found thresholds for metrics that are not available"):