Description
[ ] I have checked the documentation and related resources and couldn't resolve my bug.
Describe the bug
I have the same problem. When the response and reference are very long (response ~665 characters, reference ~2745 characters), llm_context_precision_with_reference, context_recall, and factual_correctness all come back as NaN, and the error message is: [ERROR] Prompt xxx failed to parse output: The output parser failed to parse the output including retries.
Ragas version:
Python version: 3.11.0
Code to Reproduce
# Imports used by the two methods below (the surrounding class definition,
# and the project's own Data container class, are omitted here):
import math
from dataclasses import asdict

from langchain_ollama import ChatOllama
from ragas import EvaluationDataset, SingleTurnSample, evaluate
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import (FactualCorrectness, Faithfulness, LLMContextPrecisionWithReference,
                           LLMContextRecall, ResponseRelevancy, SemanticSimilarity)
from ragas.run_config import RunConfig

def count_score(self, result) -> dict:
    scores = asdict(result).get("scores", [])
    print(f'result => score calculation ==> {result}')
    if not scores:
        return {}  # if the "scores" list is empty, return an empty dict
    score = {}
    for case in scores:
        for metrics, value in case.items():
            if math.isnan(value):
                value = 0  # replace NaN with 0
            if metrics not in score:
                score[metrics] = value
            else:
                score[metrics] += value
    mean_score = {metrics: round(value / len(scores), 2) for metrics, value in score.items()}
    return mean_score
async def evaluation(self) -> Data:  # Data is this project's own container class
    result = Data(text="")
    if not self.user_input or not self.response or not self.retrieved_contexts:
        self.status = result
        return None
    if not self.referenceContent:
        if self.ContextPrecision: result.data["ContextPrecision"] = "Disabled (reference required)"
        if self.ContextRecall: result.data["ContextRecall"] = "Disabled (reference required)"
        if self.FactualCorrectness: result.data["FactualCorrectness"] = "Disabled (reference required)"
        if self.SemanticSimilarity: result.data["SemanticSimilarity"] = "Disabled (reference required)"
        # self.ContextPrecision = self.ContextRecall = self.FactualCorrectness = self.SemanticSimilarity = False
        # self.reference = ""
    _retrieved_contexts = []
    for context in self.retrieved_contexts:
        _retrieved_contexts.append(context.data["Generate"])
    print(f'{"-"*100}\nretrieved_contexts +++> {_retrieved_contexts}\n{"-"*100}')
    _referenceContent = self.referenceContent
    sample = [SingleTurnSample(
        user_input=self.user_input,
        response=self.response,
        retrieved_contexts=_retrieved_contexts,
        reference=_referenceContent,
        reference_contexts=[_referenceContent],
    )]
    print(f'{"-"*30} self.referenceContent ==> {type(self.referenceContent)} / {self.referenceContent}\nsample ==> {sample}\n{"-"*30}')
    default_url = "http://0.0.0.0:11435"
    # llm = self.model
    llm = ChatOllama(model="qwen2.5:14b-instruct-fp16", temperature=0, base_url=default_url)
    # evaluator_llm = LlamaIndexLLMWrapper(llm)
    evaluator_llm = LangchainLLMWrapper(llm)
    evaluator_embeddings = LangchainEmbeddingsWrapper(self.embedding)
    eval_dataset = EvaluationDataset(samples=sample)
    metrics = []
    if self.ContextPrecision: metrics.append(LLMContextPrecisionWithReference(llm=evaluator_llm))
    if self.ContextRecall: metrics.append(LLMContextRecall(llm=evaluator_llm))
    if self.FactualCorrectness: metrics.append(FactualCorrectness(llm=evaluator_llm))
    if self.Faithfulness: metrics.append(Faithfulness(llm=evaluator_llm))
    if self.ResponseRelevancy: metrics.append(ResponseRelevancy(llm=evaluator_llm, embeddings=evaluator_embeddings))
    if self.SemanticSimilarity: metrics.append(SemanticSimilarity(embeddings=evaluator_embeddings))
    # NOTE: RunConfig's timeout is in seconds, so 3200000 effectively disables it
    evaluation_result = evaluate(dataset=eval_dataset, metrics=metrics, run_config=RunConfig(max_workers=4, timeout=3200000))
    score = self.count_score(evaluation_result)
    # context_precision = LLMContextPrecisionWithReference(llm=evaluator_llm)
    # sample2 = SingleTurnSample(
    #     user_input=self.user_input,
    #     retrieved_contexts=_retrieved_contexts,
    #     reference=self.referenceContent,
    # )
    # print(f'sample2 => {sample2}')
    # resss = await context_precision.single_turn_ascore(sample2)
    # print(f'LLMContextPrecisionWithReference test ===> {resss}')
    # for metrics, value in score.items():
    #     result += f"{metrics}: {value}\n"
    result_docs = self.retrieved_contexts if isinstance(self.retrieved_contexts, list) else [self.retrieved_contexts]
    print(f'score ==> {score}, ===> {result_docs}')
    source = []
    content = []
    for item in result_docs:
        if item.data["source"]:
            source.append(item.data["source"])
        if item.text:
            content.append(item.text)
    resultData = Data(data=dict(
        score,
        question=self.user_input,
        answer=self.response,
        sources='\n'.join(source),
        retrieval_content='\n---------------------------- divider ------------------------\n'.join(content),
    ))
    self.status = resultData
    return resultData
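
For easier triage, here is a trimmed, self-contained sketch of the same call path. The long strings are placeholders standing in for my real data (which I cannot share), and the model/endpoint match the code above:

# Minimal sketch of the failure; placeholder strings approximate the real
# lengths (response ~665 chars, reference ~2745 chars).
from langchain_ollama import ChatOllama  # or: from langchain_community.chat_models import ChatOllama
from ragas import EvaluationDataset, SingleTurnSample, evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import FactualCorrectness, LLMContextPrecisionWithReference, LLMContextRecall
from ragas.run_config import RunConfig

llm = LangchainLLMWrapper(
    ChatOllama(model="qwen2.5:14b-instruct-fp16", temperature=0,
               base_url="http://0.0.0.0:11435")
)
sample = SingleTurnSample(
    user_input="...",                    # the actual question
    response="long answer " * 60,        # ~665-character response
    retrieved_contexts=["..."],
    reference="long reference " * 180,   # ~2745-character reference
)
result = evaluate(
    dataset=EvaluationDataset(samples=[sample]),
    metrics=[
        LLMContextPrecisionWithReference(llm=llm),
        LLMContextRecall(llm=llm),
        FactualCorrectness(llm=llm),
    ],
    run_config=RunConfig(max_workers=4),
)
print(result)  # the three metrics come back as NaN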
Error trace
[ERROR] Prompt xxx failed to parse output: The output parser failed to parse the output including retries.
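
If it helps, the raw model output that fails to parse can be surfaced with verbose logging (standard library logging; this assumes ragas logs through the stdlib logger):

# Sketch: enable debug logging so the unparseable model output is visible.
import logging

logging.basicConfig(level=logging.DEBUG)
logging.getLogger("ragas").setLevel(logging.DEBUG)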
Expected behavior
I expect llm_context_precision_with_reference, context_recall, and factual_correctness to be calculated instead of coming back as NaN.
Additional context
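The only retry/timeout knobs I'm aware of live on RunConfig; a sketch of raising them is below (assuming RunConfig's documented max_retries/timeout fields), although the error above says parsing failed even after retries:

# Sketch only: a saner run config than the one in my code above.
from ragas.run_config import RunConfig

run_config = RunConfig(
    max_workers=4,
    timeout=300,     # per-call timeout in seconds; 3200000 above was overkill
    max_retries=15,  # default is 10
)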