
Commit 2804b67

Remove OpenAI specific kwargs and try to make metric failure more obvious.

1 parent 52e49bb

File tree: 3 files changed, +14 −5 lines

python_components/document_inference/document_inference/helpers.py

Lines changed: 2 additions & 4 deletions

@@ -157,8 +157,7 @@ def document_inference_summary(
         populated_prompt,
         attachments=attachments,
         schema=DocumentSummarySchema.model_json_schema(),
-        stream=False,
-        reasoning_effort="minimal"
+        stream=False
     )
     response_json = json.loads(response.text())
     logger.info("Inference complete. Validating response.")

@@ -183,8 +182,7 @@ def document_inference_recommendation(
         populated_prompt,
         attachments=attachments,
         schema=DocumentRecommendation.model_json_schema(),
-        stream=False,
-        reasoning_effort="minimal"
+        stream=False
     )
     response_json = json.loads(response.text())
     logger.info("Inference complete. Validating response.")

python_components/evaluation/evaluation/exception/evaluation.py

Lines changed: 4 additions & 0 deletions

@@ -143,6 +143,8 @@ async def _ceq_evaluate(self, document, exception) -> Result:
             input="\n\n".join(details),
         )
         metric.measure(test_case)
+        if type(metric) is None or metric.verdicts is None:
+            raise RuntimeError("Metric measurement failed. This is likely due to rate limiting.")
         details = {
             "verdicts": convert_model_list(metric.verdicts),
             "response": response,

@@ -175,6 +177,8 @@ async def _faithfulness_evaluate(self, document, exception):
             actual_output=[response],
         )
         metric.measure(test_case)
+        if type(metric) is None or metric.truths is None or metric.claims is None or metric.verdicts is None:
+            raise RuntimeError("Metric measurement failed. This is likely due to rate limiting.")
         details = {
             "truths": metric.truths,
             "claims": metric.claims,

python_components/evaluation/evaluation/summary/evaluation.py

Lines changed: 8 additions & 1 deletion

@@ -39,7 +39,6 @@ def evaluate(self, document: Document) -> List[Result]:
             "file_name": document.file_name,
             "inference_model": self.inference_model_name,
         })))
-
         logger.info("Summarization complete. Performing related evaluations.")
         document.ai_summary = result["summary"]
         # Begin the DeepEval summary evaluation.

@@ -48,6 +47,14 @@ def evaluate(self, document: Document) -> List[Result]:
             input=document.images, actual_output=document.ai_summary
         )
         metric.measure(test_case)
+        if (type(metric) is None
+            or metric.truths is None
+            or metric.claims is None
+            or metric.assessment_questions is None
+            or metric.coverage_verdicts is None
+            or metric.alignment_verdicts is None
+        ):
+            raise RuntimeError("Metric measurement failed. This is likely due to rate limiting.")
         details = {
             "truths": metric.truths,
             "claims": metric.claims,
