
[ERROR] Prompt xxx failed to parse output:The output parser failed to parse the output including retries. #2022


Description

@674717172

[ ] I have checked the documentation and related resources and couldn't resolve my bug.

Describe the bug
I have the same problem. When the reference and response are very long (response 665 characters, reference 2745 characters), llm_context_precision_with_reference, context_recall, and factual_correctness all come back as nan, and the error message is: [ERROR] Prompt xxx failed to parse output: The output parser failed to parse the output including retries.

Ragas version:
Python version: 3.11.0

Code to Reproduce

# Imports inferred from the snippet below; `Data` is a project-specific class whose import is not shown here.
import math
from dataclasses import asdict

from langchain_ollama import ChatOllama  # assumption: may come from langchain_community in older setups
from ragas import evaluate
from ragas.dataset_schema import EvaluationDataset, SingleTurnSample
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import (FactualCorrectness, Faithfulness, LLMContextPrecisionWithReference,
                           LLMContextRecall, ResponseRelevancy, SemanticSimilarity)
from ragas.run_config import RunConfig

def count_score(self, result) -> dict:
    scores = asdict(result).get("scores", [])
    print(f'result (score calculation) ==> {result}')
    if not scores:
        return {}  # if the "scores" list is empty, return an empty dict
    score = {}
    for case in scores:
        for metrics, value in case.items():
            if math.isnan(value):
                value = 0  # replace nan with 0
            if metrics not in score:
                score[metrics] = value
            else:
                score[metrics] += value
    mean_score = {metrics: round(value / len(scores), 2) for metrics, value in score.items()}
    return mean_score

async def evaluation(self) -> Data:
    result = Data(text="")
    if not self.user_input or not self.response or not self.retrieved_contexts:
        self.status = result
        return None
    if not self.referenceContent:
        if self.ContextPrecision: result.data["ContextPrecision"] = "Disabled (reference required)"
        if self.ContextRecall: result.data["ContextRecall"] = "Disabled (reference required)"
        if self.FactualCorrectness: result.data["FactualCorrectness"] = "Disabled (reference required)"
        if self.SemanticSimilarity: result.data["SemanticSimilarity"] = "Disabled (reference required)"
        # self.ContextPrecision = self.ContextRecall = self.FactualCorrectness = self.SemanticSimilarity = False
        # self.reference = ""
    
    _retrive_contexts = []
    for context in self.retrieved_contexts:
        _retrive_contexts.append(context.data["Generate"])
    print(f'{"-"*100}\nretrieved_contexts+++>{_retrive_contexts}\n{"-"*100}')
    
    _referenceContent = self.referenceContent
    
    sample = [SingleTurnSample(
        user_input = self.user_input,
        response = self.response,
        retrieved_contexts = _retrive_contexts,
        reference=_referenceContent,
        reference_contexts =[ _referenceContent ]
    )]
    print(f'{"-"*30} self.referenceContent ({type(self.referenceContent)}) ===> {self.referenceContent}\nsample ===> {sample}\n{"-"*30}')

    default_url = "http://0.0.0.0:11435"
    # llm = self.model
    llm = ChatOllama(model="qwen2.5:14b-instruct-fp16", temperature=0, base_url=default_url)
    # evaluator_llm = LlamaIndexLLMWrapper(llm)
    evaluator_llm = LangchainLLMWrapper(llm)
    evaluator_embeddings = LangchainEmbeddingsWrapper(self.embedding)
    eval_dataset = EvaluationDataset(sample)
    
    metrics = []
    if self.ContextPrecision: metrics.append(LLMContextPrecisionWithReference(llm=evaluator_llm))
    if self.ContextRecall: metrics.append(LLMContextRecall(llm=evaluator_llm))
    if self.FactualCorrectness: metrics.append(FactualCorrectness(llm=evaluator_llm))
    if self.Faithfulness: metrics.append(Faithfulness(llm=evaluator_llm))
    if self.ResponseRelevancy: metrics.append(ResponseRelevancy(llm=evaluator_llm, embeddings=evaluator_embeddings))
    if self.SemanticSimilarity: metrics.append(SemanticSimilarity(embeddings=evaluator_embeddings))

    evaluation_result = evaluate(dataset=eval_dataset, metrics=metrics, run_config=RunConfig(max_workers=4, timeout=3200000))
    score = self.count_score(evaluation_result)
    
    # context_precision = LLMContextPrecisionWithReference(llm=evaluator_llm)
    # sample2 = SingleTurnSample(
    #     user_input = self.user_input,
    #     retrieved_contexts = _retrive_contexts,
    #     reference = self.referenceContent
    # )
    # print(f'sample2222=>{sample2}')
    # resss = await context_precision.single_turn_ascore(sample2)
    # print(f'test LLMContextPrecisionWithReference ===> {resss}')
    # for metrics, value in score.items():
    #     result += f"{metrics}: {value}\n"
    result_docs = self.retrieved_contexts if isinstance(self.retrieved_contexts, list) else [self.retrieved_contexts]
    print(f'score ==> {score}, result_docs ==> {result_docs}')
    source = []
    content = []
    for item in result_docs:
        if item.data["source"]:
            source.append(item.data["source"])
        if item.text:
            content.append(item.text)
    resultData = Data(data=dict(score, question=self.user_input, answer=self.response, sources='\n'.join(source),
                                retrieval_content='\n---------------------------- divider ----------------------------\n'.join(content)))
    self.status = resultData
    return resultData
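
A possible mitigation to try, not a confirmed fix: give the run a larger retry budget and let evaluate raise the parser exception instead of silently writing nan. This is a minimal sketch that assumes the RunConfig fields and the raise_exceptions argument available in recent ragas releases, and it reuses eval_dataset and metrics from the evaluation() method above.

from ragas import evaluate
from ragas.run_config import RunConfig

run_config = RunConfig(
    max_workers=1,   # serialize requests so the failing sample is easy to spot in the logs
    max_retries=15,  # assumed field: larger retry budget for LLM calls / output parsing
    timeout=600,     # per-call timeout in seconds
)

evaluation_result = evaluate(
    dataset=eval_dataset,   # from the evaluation() method above
    metrics=metrics,        # from the evaluation() method above
    run_config=run_config,
    raise_exceptions=True,  # assumed flag: surface the exception instead of returning nan
)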

Error trace
[ERROR] Prompt xxx failed to parse output:The output parser failed to parse the output including retries.
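
This one-line message hides the underlying traceback. To see which metric's LLM output actually fails to parse, a single metric can be scored directly on one sample, much like the commented-out single_turn_ascore block above. A minimal sketch, assuming the same ragas and langchain_ollama imports as the reproduction code, with placeholder strings standing in for the real long texts:

import asyncio

from langchain_ollama import ChatOllama  # assumed import; adjust to your setup
from ragas.dataset_schema import SingleTurnSample
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import LLMContextPrecisionWithReference

async def debug_single_metric():
    llm = LangchainLLMWrapper(ChatOllama(model="qwen2.5:14b-instruct-fp16",
                                         temperature=0,
                                         base_url="http://0.0.0.0:11435"))
    metric = LLMContextPrecisionWithReference(llm=llm)
    sample = SingleTurnSample(
        user_input="the original question",              # placeholder
        retrieved_contexts=["the long retrieved text"],  # placeholder
        reference="the ~2700-character reference text",  # placeholder
    )
    # Scoring one sample directly should raise the parsing exception with its
    # full traceback, rather than it being reported as nan by evaluate().
    print(await metric.single_turn_ascore(sample))

asyncio.run(debug_single_metric())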

Expected behavior
I expect the three metrics llm_context_precision_with_reference, context_recall, and factual_correctness to be calculated instead of returning nan.


Labels

bug (Something isn't working) · module-metrics (this is part of metrics module) · question (Further information is requested)
