Skip to content

Commit ffae565

Browse files
authored
Update response helpfulness threshold (#79)
* Update default response helpfulness threshold in `BadResponseThresholds` from 0.7 to 0.23
* Bump version and add to changelog
* Update tests
1 parent 13686e9 commit ffae565

File tree

4 files changed

+12
-7
lines changed

4 files changed

+12
-7
lines changed

CHANGELOG.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## [Unreleased]
99

10+
## [1.0.15] 2025-04-24
11+
12+
- Update default threshold for response helpfulness to 0.23 in `Validator` API.
13+
1014
## [1.0.14] 2025-04-23
1115
- Update `codex-sdk` dependency to `0.1.0-alpha.17`.
1216
- Capture data for the number of times the validator API is called on a Codex project.
@@ -71,7 +75,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7175

7276
- Initial release of the `cleanlab-codex` client library.
7377

74-
[Unreleased]: https://github.com/cleanlab/cleanlab-codex/compare/v1.0.14...HEAD
78+
[Unreleased]: https://github.com/cleanlab/cleanlab-codex/compare/v1.0.15...HEAD
79+
[1.0.15]: https://github.com/cleanlab/cleanlab-codex/compare/v1.0.14...v1.0.15
7580
[1.0.14]: https://github.com/cleanlab/cleanlab-codex/compare/v1.0.13...v1.0.14
7681
[1.0.13]: https://github.com/cleanlab/cleanlab-codex/compare/v1.0.12...v1.0.13
7782
[1.0.12]: https://github.com/cleanlab/cleanlab-codex/compare/v1.0.11...v1.0.12

src/cleanlab_codex/__about__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
# SPDX-License-Identifier: MIT
2-
__version__ = "1.0.14"
2+
__version__ = "1.0.15"

src/cleanlab_codex/validator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -315,7 +315,7 @@ class BadResponseThresholds(BaseModel):
315315
)
316316
response_helpfulness: float = Field(
317317
description="Threshold for response helpfulness.",
318-
default=0.7,
318+
default=0.23,
319319
ge=0.0,
320320
le=1.0,
321321
)

tests/test_validator.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ def test_get_threshold(self) -> None:
1919
def test_default_threshold(self) -> None:
2020
thresholds = BadResponseThresholds()
2121
assert thresholds.get_threshold("trustworthiness") == 0.7
22-
assert thresholds.get_threshold("response_helpfulness") == 0.7
22+
assert thresholds.get_threshold("response_helpfulness") == 0.23
2323

2424
def test_unspecified_threshold(self) -> None:
2525
thresholds = BadResponseThresholds()
@@ -147,7 +147,7 @@ def test_user_provided_thresholds(self, mock_project: Mock, mock_trustworthy_rag
147147
# Test with user-provided thresholds that match evals
148148
validator = Validator(codex_access_key="test", bad_response_thresholds={"trustworthiness": 0.6})
149149
assert_threshold_equal(validator, "trustworthiness", 0.6)
150-
assert_threshold_equal(validator, "response_helpfulness", 0.7)
150+
assert_threshold_equal(validator, "response_helpfulness", 0.23)
151151

152152
# Test with extra thresholds that should raise ValueError
153153
with pytest.raises(ValueError, match="Found thresholds for metrics that are not available"):
@@ -157,7 +157,7 @@ def test_default_thresholds(self, mock_project: Mock, mock_trustworthy_rag: Mock
157157
# Test with default thresholds (bad_response_thresholds is None)
158158
validator = Validator(codex_access_key="test")
159159
assert_threshold_equal(validator, "trustworthiness", 0.7)
160-
assert_threshold_equal(validator, "response_helpfulness", 0.7)
160+
assert_threshold_equal(validator, "response_helpfulness", 0.23)
161161

162162
def test_edge_cases(self, mock_project: Mock, mock_trustworthy_rag: Mock) -> None: # noqa: ARG002
163163
# Note, the `"evals"` field should not be a list of strings in practice, but an Eval from cleanlab_tlm
@@ -173,7 +173,7 @@ def test_edge_cases(self, mock_project: Mock, mock_trustworthy_rag: Mock) -> Non
173173
# No extra Evals
174174
validator = Validator(codex_access_key="test", trustworthy_rag_config={"evals": []})
175175
assert_threshold_equal(validator, "trustworthiness", 0.7) # Default should apply
176-
assert_threshold_equal(validator, "response_helpfulness", 0.7) # Default should apply
176+
assert_threshold_equal(validator, "response_helpfulness", 0.23) # Default should apply
177177

178178
# Test with non-existent evals in trustworthy_rag_config
179179
with pytest.raises(ValueError, match="Found thresholds for metrics that are not available"):

0 commit comments

Comments
 (0)