|
29 | 29 | end |
30 | 30 | end |
31 | 31 |
|
# Shared examples for the auto-evaluation rake tasks under test.
#
# The including group must define:
#   * task_name        - name passed to Rake::Task[...]
#   * question_message - value placed in INPUT when the task is invoked
# This block stubs only AnswerComposition::PipelineRunner; it does not stub any
# evaluator. The groups that include it later in this file stub the evaluator
# (e.g. AutoEvaluation::AnswerRelevancy) to return `evaluation_result`.
| 32 | + shared_examples "an auto-evaluation generate task" do |
| 33 | + let(:answer) { build(:answer) } |
| 34 | + let(:evaluation_result) do |
| 35 | + AutoEvaluation::Result.new( |
| 36 | + score: 0.7, |
| 37 | + reason: "Most statements are relevant.", |
| 38 | + success: true, |
| 39 | + llm_responses: {}, |
| 40 | + metrics: {}, |
| 41 | + ) |
| 42 | + end |
| 43 | + |
| 44 | + before do |
# Re-enable the task before each example so a later `invoke` is not a no-op
# (Rake skips tasks that have already been invoked).
| 45 | + Rake::Task[task_name].reenable |
| 46 | + |
# Stub answer generation so it returns `answer` for exactly this pipeline.
# NOTE(review): the inline setup this replaces listed
# AnswerComposition::Pipeline::Claude::QuestionRouter as the first pipeline step;
# confirm the rake tasks no longer pass it, otherwise this `with` constraint
# will not match the real call.
| 47 | + allow(AnswerComposition::PipelineRunner) |
| 48 | + .to receive(:call) |
| 49 | + .with( |
| 50 | + question: an_instance_of(Question), |
| 51 | + pipeline: [ |
| 52 | + AnswerComposition::Pipeline::SearchResultFetcher, |
| 53 | + AnswerComposition::Pipeline::Claude::StructuredAnswerComposer, |
| 54 | + ], |
| 55 | + ) |
| 56 | + .and_return(answer) |
| 57 | + end |
| 58 | + |
# "a task requiring an input" is defined outside this excerpt; presumably it
# covers the missing-INPUT case - TODO confirm.
| 59 | + it_behaves_like "a task requiring an input" |
| 60 | + |
| 61 | + it "outputs the evaluation result as JSON to stdout" do |
| 62 | + ClimateControl.modify(INPUT: question_message) do |
# Expected stdout: the five result fields serialised as JSON, plus a newline.
| 63 | + expected_result_output = { |
| 64 | + score: evaluation_result.score, |
| 65 | + reason: evaluation_result.reason, |
| 66 | + success: evaluation_result.success, |
| 67 | + llm_responses: evaluation_result.llm_responses, |
| 68 | + metrics: evaluation_result.metrics, |
| 69 | + }.to_json |
| 70 | + |
| 71 | + expect { Rake::Task[task_name].invoke } |
| 72 | + .to output("#{expected_result_output}\n").to_stdout |
| 73 | + end |
| 74 | + end |
| 75 | + |
# Error path: `answer` is overridden with an error status; the task is expected
# to exit (SystemExit) after writing the warning and error message to stderr.
| 76 | + context "when an answer has an error status" do |
| 77 | + let(:error_message) { "Answer generation failed" } |
| 78 | + let(:answer) { build(:answer, status: :error_answer_service_error, error_message:) } |
| 79 | + |
| 80 | + it "warns the user and outputs the error message" do |
| 81 | + ClimateControl.modify(INPUT: question_message) do |
| 82 | + expected_stderr = "Warning: answer has an error status: #{answer.status}\n#{error_message}\n" |
| 83 | + expect { Rake::Task[task_name].invoke } |
| 84 | + .to raise_error(SystemExit) |
| 85 | + .and output(expected_stderr).to_stderr |
| 86 | + end |
| 87 | + end |
| 88 | + end |
| 89 | + end |
| 90 | + |
32 | 91 | describe "generate_answer" do |
33 | 92 | let(:task_name) { "evaluation:generate_answer" } |
34 | 93 | let(:input) { "What is the current VAT rate?" } |
|
523 | 582 | end |
524 | 583 |
|
# Reading this region: rows marked `-` are the removed inline setup and examples
# for the answer-relevancy task; rows marked `+` replace them with the shared
# examples "an auto-evaluation generate task" and add a new describe for the
# coherence task. Unmarked rows are unchanged context shared by both versions.
525 | 584 | describe "generate_answer_relevancy_evaluation" do |
526 | | - let(:task_name) { "evaluation:generate_answer_relevancy_evaluation" } |
527 | 585 | let(:question_message) { "What is the current VAT rate?" } |
528 | | - let(:answer) { build(:answer) } |
529 | | - let(:evaluation_result) do |
530 | | - AutoEvaluation::Result.new( |
531 | | - score: 0.7, |
532 | | - reason: "Most statements are relevant.", |
533 | | - success: true, |
534 | | - llm_responses: {}, |
535 | | - metrics: {}, |
536 | | - ) |
537 | | - end |
538 | 586 |
|
539 | | - before do |
540 | | - Rake::Task[task_name].reenable |
541 | | - |
542 | | - allow(AnswerComposition::PipelineRunner) |
543 | | - .to receive(:call) |
544 | | - .with( |
545 | | - question: an_instance_of(Question), |
546 | | - pipeline: [ |
547 | | - AnswerComposition::Pipeline::Claude::QuestionRouter, |
548 | | - AnswerComposition::Pipeline::SearchResultFetcher, |
549 | | - AnswerComposition::Pipeline::Claude::StructuredAnswerComposer, |
550 | | - ], |
551 | | - ) |
552 | | - .and_return(answer) |
553 | | - |
554 | | - allow(AutoEvaluation::AnswerRelevancy) |
555 | | - .to receive(:call) |
556 | | - .with( |
557 | | - question_message:, |
558 | | - answer_message: answer.message, |
559 | | - ) |
560 | | - .and_return(evaluation_result) |
561 | | - end |
# New version: the block passed to it_behaves_like supplies `task_name` and stubs
# AutoEvaluation::AnswerRelevancy.call (given question_message and
# answer.message) to return `evaluation_result`; `answer` and
# `evaluation_result` are defined by the shared examples.
| 587 | + it_behaves_like "an auto-evaluation generate task" do |
| 588 | + let(:task_name) { "evaluation:generate_answer_relevancy_evaluation" } |
562 | 589 |
|
563 | | - it_behaves_like "a task requiring an input" |
564 | | - |
565 | | - it "outputs the evaluation result as JSON to stdout" do |
566 | | - ClimateControl.modify(INPUT: question_message) do |
567 | | - expected_result_output = { |
568 | | - score: evaluation_result.score, |
569 | | - reason: evaluation_result.reason, |
570 | | - success: evaluation_result.success, |
571 | | - llm_responses: evaluation_result.llm_responses, |
572 | | - metrics: evaluation_result.metrics, |
573 | | - }.to_json |
574 | | - |
575 | | - expect { Rake::Task[task_name].invoke } |
576 | | - .to output("#{expected_result_output}\n").to_stdout |
| 590 | + before do |
| 591 | + allow(AutoEvaluation::AnswerRelevancy) |
| 592 | + .to receive(:call) |
| 593 | + .with( |
| 594 | + question_message:, |
| 595 | + answer_message: answer.message, |
| 596 | + ) |
| 597 | + .and_return(evaluation_result) |
577 | 598 | end |
578 | 599 | end |
| 600 | + end |
579 | 601 |
|
580 | | - context "when an answer has an error status" do |
581 | | - let(:error_message) { "Answer generation failed" } |
582 | | - let(:answer) { build(:answer, status: :error_answer_service_error, error_message:) } |
583 | | - |
584 | | - it "warns the user and outputs the error message" do |
585 | | - ClimateControl.modify(INPUT: question_message) do |
586 | | - expected_stderr = "Warning: answer has an error status: #{answer.status}\n#{error_message}\n" |
587 | | - expect { Rake::Task[task_name].invoke } |
588 | | - .to raise_error(SystemExit) |
589 | | - .and output(expected_stderr).to_stderr |
590 | | - end |
# New describe for the coherence task: same shape as the relevancy group, but
# stubbing AutoEvaluation::Coherence.call.
# NOTE(review): here `question_message` is defined inside the it_behaves_like
# block, whereas the relevancy group defines it at describe level; consider
# using one placement for both.
| 602 | + describe "generate_coherence_evaluation" do |
| 603 | + it_behaves_like "an auto-evaluation generate task" do |
| 604 | + let(:question_message) { "What is the current VAT rate?" } |
| 605 | + let(:task_name) { "evaluation:generate_coherence_evaluation" } |
| 606 | + |
| 607 | + before do |
| 608 | + allow(AutoEvaluation::Coherence) |
| 609 | + .to receive(:call) |
| 610 | + .with( |
| 611 | + question_message:, |
| 612 | + answer_message: answer.message, |
| 613 | + ) |
| 614 | + .and_return(evaluation_result) |
591 | 615 | end |
592 | 616 | end |
593 | 617 | end |
|
0 commit comments