From d68864d65fdaa458c6d3440a19abeb7c2008f58e Mon Sep 17 00:00:00 2001 From: Will Kukkamalla Date: Tue, 8 Jul 2025 10:09:40 -0400 Subject: [PATCH 01/12] added prompt as file for deepseek_r1_code_test --- .../deepseek_r1_code_tests.yaml | 108 +++++++++--------- 1 file changed, 56 insertions(+), 52 deletions(-) diff --git a/promptfoo/tests/deepseek_r1_tests/deepseek_r1_code_tests.yaml b/promptfoo/tests/deepseek_r1_tests/deepseek_r1_code_tests.yaml index 5b3e627..fce79a7 100644 --- a/promptfoo/tests/deepseek_r1_tests/deepseek_r1_code_tests.yaml +++ b/promptfoo/tests/deepseek_r1_tests/deepseek_r1_code_tests.yaml @@ -8,57 +8,61 @@ defaultTest: submission_type: python scenarios: - - config: - - vars: { prompt: code_feedback_r1 } - tests: - - vars: - submission_file: test_submissions/csc263_opt_connected/correct_submission/correct_submission.py - solution_file: test_submissions/csc263_opt_connected/solution.py + - name: "Prompt: Code Feedback R1" + prompts: + - file: ../../../ai_feedback/data/prompts/user/code_feedback_r1.md + vars: + prompt_name: code_feedback_r1.md - - vars: - submission_file: test_submissions/csc263_opt_connected/fail_submission/fail_submission.py - solution_file: test_submissions/csc263_opt_connected/solution.py +tests: + - vars: + submission_file: test_submissions/csc263_opt_connected/correct_submission/correct_submission.py + solution_file: test_submissions/csc263_opt_connected/solution.py - - vars: - submission_file: test_submissions/csc263_opt_connected/incorrect_algo_submission/incorrect_algo_submission.py - solution_file: test_submissions/csc263_opt_connected/solution.py - - - vars: - submission_file: test_submissions/csc263_opt_connected/style_issues_submission/style_issues_submission.py - solution_file: test_submissions/csc263_opt_connected/solution.py - - - vars: - submission_file: test_submissions/csc108/correct_submission/correct_submission.py - solution_file: test_submissions/csc108/solution.py - - - vars: - submission_file: test_submissions/csc108/correctness_1_submission/correctness_1_submission.py - solution_file: test_submissions/csc108/solution.py - - - vars: - submission_file: test_submissions/csc108/correctness_2_submission/correctness_2_submission.py - solution_file: test_submissions/csc108/solution.py - - - vars: - submission_file: test_submissions/csc108/efficiency_submission/efficiency_submission.py - solution_file: test_submissions/csc108/solution.py - - - vars: - submission_file: test_submissions/csc108/style_submission/style_submission.py - solution_file: test_submissions/csc108/solution.py - - - vars: - submission_file: test_submissions/gac_example/correct_submission/correct_submission.py - solution_file: test_submissions/gac_example/solution.py - - - vars: - submission_file: test_submissions/gac_example/fail_submission/fail_submission.py - solution_file: test_submissions/gac_example/solution.py - - - vars: - submission_file: test_submissions/gac_example/inefficient_submission/inefficient_submission.py - solution_file: test_submissions/gac_example/solution.py - - - vars: - submission_file: test_submissions/gac_example/partial_correct_submission/partial_correct_submission.py - solution_file: test_submissions/gac_example/solution.py +# - vars: +# submission_file: test_submissions/csc263_opt_connected/fail_submission/fail_submission.py +# solution_file: test_submissions/csc263_opt_connected/solution.py +# +# - vars: +# submission_file: test_submissions/csc263_opt_connected/incorrect_algo_submission/incorrect_algo_submission.py +# solution_file: test_submissions/csc263_opt_connected/solution.py +# +# - vars: +# submission_file: test_submissions/csc263_opt_connected/style_issues_submission/style_issues_submission.py +# solution_file: test_submissions/csc263_opt_connected/solution.py +# +# - vars: +# submission_file: test_submissions/csc108/correct_submission/correct_submission.py +# solution_file: test_submissions/csc108/solution.py +# +# - vars: +# submission_file: test_submissions/csc108/correctness_1_submission/correctness_1_submission.py +# solution_file: test_submissions/csc108/solution.py +# +# - vars: +# submission_file: test_submissions/csc108/correctness_2_submission/correctness_2_submission.py +# solution_file: test_submissions/csc108/solution.py +# +# - vars: +# submission_file: test_submissions/csc108/efficiency_submission/efficiency_submission.py +# solution_file: test_submissions/csc108/solution.py +# +# - vars: +# submission_file: test_submissions/csc108/style_submission/style_submission.py +# solution_file: test_submissions/csc108/solution.py +# +# - vars: +# submission_file: test_submissions/gac_example/correct_submission/correct_submission.py +# solution_file: test_submissions/gac_example/solution.py +# +# - vars: +# submission_file: test_submissions/gac_example/fail_submission/fail_submission.py +# solution_file: test_submissions/gac_example/solution.py +# +# - vars: +# submission_file: test_submissions/gac_example/inefficient_submission/inefficient_submission.py +# solution_file: test_submissions/gac_example/solution.py +# +# - vars: +# submission_file: test_submissions/gac_example/partial_correct_submission/partial_correct_submission.py +# solution_file: test_submissions/gac_example/solution.py From f0bbf1b6455b14dbe8a7a317106981dfc4c0710b Mon Sep 17 00:00:00 2001 From: Will Kukkamalla Date: Tue, 8 Jul 2025 13:35:20 -0400 Subject: [PATCH 02/12] updated tests to use external prompt files --- .../codellama_tests/codellama_code_tests.yaml | 112 +++++++++--------- .../deepseek_r1_code_tests.yaml | 9 +- .../deepseek_r1_text_tests.yaml | 61 +++++----- .../deepseek_v3_code_tests.yaml | 86 +++++++------- .../deepseek_v3_text_tests.yaml | 71 +++++------ .../llama_3.2_vision_image_tests.yaml | 70 +++++------ ...vision_image_tests_with_system_prompt.yaml | 70 +++++------ .../llava_tests/llava_34b_image_tests.yaml | 70 +++++------ ...va_34b_image_tests_with_system_prompt.yaml | 70 +++++------ 9 files changed, 308 insertions(+), 311 deletions(-) diff --git a/promptfoo/tests/codellama_tests/codellama_code_tests.yaml b/promptfoo/tests/codellama_tests/codellama_code_tests.yaml index 5c8f0c1..5dcd655 100644 --- a/promptfoo/tests/codellama_tests/codellama_code_tests.yaml +++ b/promptfoo/tests/codellama_tests/codellama_code_tests.yaml @@ -7,59 +7,59 @@ defaultTest: scope: code submission_type: python -scenarios: - - config: - - vars: { prompt: code_overall } - tests: - - vars: - submission_file: test_submissions/csc263_opt_connected/correct_submission/correct_submission.py - solution_file: test_submissions/csc263_opt_connected/solution.py - - - vars: - submission_file: test_submissions/csc263_opt_connected/fail_submission/fail_submission.py - solution_file: test_submissions/csc263_opt_connected/solution.py - - - - vars: - submission_file: test_submissions/csc263_opt_connected/incorrect_algo_submission/incorrect_algo_submission.py - solution_file: test_submissions/csc263_opt_connected/solution.py - - - vars: - submission_file: test_submissions/csc263_opt_connected/style_issues_submission/style_issues_submission.py - solution_file: test_submissions/csc263_opt_connected/solution.py - - - vars: - submission_file: test_submissions/csc108/correct_submission/correct_submission.py - solution_file: test_submissions/csc108/solution.py - - - vars: - submission_file: test_submissions/csc108/correctness_1_submission/correctness_1_submission.py - solution_file: test_submissions/csc108/solution.py - - - vars: - submission_file: test_submissions/csc108/correctness_2_submission/correctness_2_submission.py - solution_file: test_submissions/csc108/solution.py - - - vars: - submission_file: test_submissions/csc108/efficiency_submission/efficiency_submission.py - solution_file: test_submissions/csc108/solution.py - - - vars: - submission_file: test_submissions/csc108/style_submission/style_submission.py - solution_file: test_submissions/csc108/solution.py - - - vars: - submission_file: test_submissions/gac_example/correct_submission/correct_submission.py - solution_file: test_submissions/gac_example/solution.py - - - vars: - submission_file: test_submissions/gac_example/fail_submission/fail_submission.py - solution_file: test_submissions/gac_example/solution.py - - - vars: - submission_file: test_submissions/gac_example/inefficient_submission/inefficient_submission.py - solution_file: test_submissions/gac_example/solution.py - - - vars: - submission_file: test_submissions/gac_example/partial_correct_submission/partial_correct_submission.py - solution_file: test_submissions/gac_example/solution.py +prompts: + - file://../../../ai_feedback/data/prompts/user/code_overall.md + +tests: + - vars: + submission_file: test_submissions/csc263_opt_connected/correct_submission/correct_submission.py + solution_file: test_submissions/csc263_opt_connected/solution.py + + - vars: + submission_file: test_submissions/csc263_opt_connected/fail_submission/fail_submission.py + solution_file: test_submissions/csc263_opt_connected/solution.py + + + - vars: + submission_file: test_submissions/csc263_opt_connected/incorrect_algo_submission/incorrect_algo_submission.py + solution_file: test_submissions/csc263_opt_connected/solution.py + + - vars: + submission_file: test_submissions/csc263_opt_connected/style_issues_submission/style_issues_submission.py + solution_file: test_submissions/csc263_opt_connected/solution.py + + - vars: + submission_file: test_submissions/csc108/correct_submission/correct_submission.py + solution_file: test_submissions/csc108/solution.py + + - vars: + submission_file: test_submissions/csc108/correctness_1_submission/correctness_1_submission.py + solution_file: test_submissions/csc108/solution.py + + - vars: + submission_file: test_submissions/csc108/correctness_2_submission/correctness_2_submission.py + solution_file: test_submissions/csc108/solution.py + + - vars: + submission_file: test_submissions/csc108/efficiency_submission/efficiency_submission.py + solution_file: test_submissions/csc108/solution.py + + - vars: + submission_file: test_submissions/csc108/style_submission/style_submission.py + solution_file: test_submissions/csc108/solution.py + + - vars: + submission_file: test_submissions/gac_example/correct_submission/correct_submission.py + solution_file: test_submissions/gac_example/solution.py + + - vars: + submission_file: test_submissions/gac_example/fail_submission/fail_submission.py + solution_file: test_submissions/gac_example/solution.py + + - vars: + submission_file: test_submissions/gac_example/inefficient_submission/inefficient_submission.py + solution_file: test_submissions/gac_example/solution.py + + - vars: + submission_file: test_submissions/gac_example/partial_correct_submission/partial_correct_submission.py + solution_file: test_submissions/gac_example/solution.py diff --git a/promptfoo/tests/deepseek_r1_tests/deepseek_r1_code_tests.yaml b/promptfoo/tests/deepseek_r1_tests/deepseek_r1_code_tests.yaml index fce79a7..a143370 100644 --- a/promptfoo/tests/deepseek_r1_tests/deepseek_r1_code_tests.yaml +++ b/promptfoo/tests/deepseek_r1_tests/deepseek_r1_code_tests.yaml @@ -7,18 +7,15 @@ defaultTest: scope: code submission_type: python -scenarios: - - name: "Prompt: Code Feedback R1" - prompts: - - file: ../../../ai_feedback/data/prompts/user/code_feedback_r1.md - vars: - prompt_name: code_feedback_r1.md +prompts: + - file://../../../ai_feedback/data/prompts/user/code_feedback_r1.md tests: - vars: submission_file: test_submissions/csc263_opt_connected/correct_submission/correct_submission.py solution_file: test_submissions/csc263_opt_connected/solution.py + # - vars: # submission_file: test_submissions/csc263_opt_connected/fail_submission/fail_submission.py # solution_file: test_submissions/csc263_opt_connected/solution.py diff --git a/promptfoo/tests/deepseek_r1_tests/deepseek_r1_text_tests.yaml b/promptfoo/tests/deepseek_r1_tests/deepseek_r1_text_tests.yaml index 84350bb..2e9366e 100644 --- a/promptfoo/tests/deepseek_r1_tests/deepseek_r1_text_tests.yaml +++ b/promptfoo/tests/deepseek_r1_tests/deepseek_r1_text_tests.yaml @@ -6,35 +6,34 @@ defaultTest: model: deepSeek-R1:70B scope: text submission_type: pdf +prompts: + - file://../../../ai_feedback/data/prompts/user/text_analyze_r1.md -scenarios: - - config: - - vars: { prompt: text_analyze_r1 } - tests: - - vars: - submission_file: test_submissions/data_collection_ethics_module/average_submission/average_submission.txt - solution_file: test_submissions/data_collection_ethics_module/solution.txt - - - vars: - submission_file: test_submissions/data_collection_ethics_module/excellent_submission/excellent_submission.txt - solution_file: test_submissions/data_collection_ethics_module/solution.txt - - - vars: - submission_file: test_submissions/data_collection_ethics_module/off_topic_submission/off_topic_submission.txt - solution_file: test_submissions/data_collection_ethics_module/solution.txt - - - vars: - submission_file: test_submissions/data_collection_ethics_module/weak_submission/weak_submission.txt - solution_file: test_submissions/data_collection_ethics_module/solution.txt - - - vars: - submission_file: test_submissions/csc373_eft_optimality_proof/fail_submission/fail_submission.pdf - solution_file: test_submissions/csc373_eft_optimality_proof/solution.pdf - - - vars: - submission_file: test_submissions/csc373_eft_optimality_proof/incomplete_submission/incomplete_submission.pdf - solution_file: test_submissions/csc373_eft_optimality_proof/solution.pdf - - - vars: - submission_file: test_submissions/csc373_eft_optimality_proof/induction_submission/induction_submission.pdf - solution_file: test_submissions/csc373_eft_optimality_proof/solution.pdf +tests: + - vars: + submission_file: test_submissions/data_collection_ethics_module/average_submission/average_submission.txt + solution_file: test_submissions/data_collection_ethics_module/solution.txt + + - vars: + submission_file: test_submissions/data_collection_ethics_module/excellent_submission/excellent_submission.txt + solution_file: test_submissions/data_collection_ethics_module/solution.txt + + - vars: + submission_file: test_submissions/data_collection_ethics_module/off_topic_submission/off_topic_submission.txt + solution_file: test_submissions/data_collection_ethics_module/solution.txt + + - vars: + submission_file: test_submissions/data_collection_ethics_module/weak_submission/weak_submission.txt + solution_file: test_submissions/data_collection_ethics_module/solution.txt + + - vars: + submission_file: test_submissions/csc373_eft_optimality_proof/fail_submission/fail_submission.pdf + solution_file: test_submissions/csc373_eft_optimality_proof/solution.pdf + + - vars: + submission_file: test_submissions/csc373_eft_optimality_proof/incomplete_submission/incomplete_submission.pdf + solution_file: test_submissions/csc373_eft_optimality_proof/solution.pdf + + - vars: + submission_file: test_submissions/csc373_eft_optimality_proof/induction_submission/induction_submission.pdf + solution_file: test_submissions/csc373_eft_optimality_proof/solution.pdf diff --git a/promptfoo/tests/deepseek_v3_tests/deepseek_v3_code_tests.yaml b/promptfoo/tests/deepseek_v3_tests/deepseek_v3_code_tests.yaml index af97405..3c9006a 100644 --- a/promptfoo/tests/deepseek_v3_tests/deepseek_v3_code_tests.yaml +++ b/promptfoo/tests/deepseek_v3_tests/deepseek_v3_code_tests.yaml @@ -6,60 +6,60 @@ defaultTest: model: deepSeek-v3 scope: code submission_type: python + system_prompt: code_feedback_v3 -scenarios: - - config: - - vars: { prompt: code_feedback_v3, system_prompt: code_feedback_v3} +prompts: + - file://../../../ai_feedback/data/prompts/user/code_feedback_v3.md - tests: - - vars: - submission_file: test_submissions/csc263_opt_connected/correct_submission/correct_submission.py - solution_file: test_submissions/csc263_opt_connected/solution.py +tests: + - vars: + submission_file: test_submissions/csc263_opt_connected/correct_submission/correct_submission.py + solution_file: test_submissions/csc263_opt_connected/solution.py - - vars: - submission_file: test_submissions/csc263_opt_connected/fail_submission/fail_submission.py - solution_file: test_submissions/csc263_opt_connected/solution.py + - vars: + submission_file: test_submissions/csc263_opt_connected/fail_submission/fail_submission.py + solution_file: test_submissions/csc263_opt_connected/solution.py - - vars: - submission_file: test_submissions/csc263_opt_connected/incorrect_algo_submission/incorrect_algo_submission.py - solution_file: test_submissions/csc263_opt_connected/solution.py + - vars: + submission_file: test_submissions/csc263_opt_connected/incorrect_algo_submission/incorrect_algo_submission.py + solution_file: test_submissions/csc263_opt_connected/solution.py - - vars: - submission_file: test_submissions/csc263_opt_connected/style_issues_submission/style_issues_submission.py - solution_file: test_submissions/csc263_opt_connected/solution.py + - vars: + submission_file: test_submissions/csc263_opt_connected/style_issues_submission/style_issues_submission.py + solution_file: test_submissions/csc263_opt_connected/solution.py - - vars: - submission_file: test_submissions/csc108/correct_submission/correct_submission.py - solution_file: test_submissions/csc108/solution.py + - vars: + submission_file: test_submissions/csc108/correct_submission/correct_submission.py + solution_file: test_submissions/csc108/solution.py - - vars: - submission_file: test_submissions/csc108/correctness_1_submission/correctness_1_submission.py - solution_file: test_submissions/csc108/solution.py + - vars: + submission_file: test_submissions/csc108/correctness_1_submission/correctness_1_submission.py + solution_file: test_submissions/csc108/solution.py - - vars: - submission_file: test_submissions/csc108/correctness_2_submission/correctness_2_submission.py - solution_file: test_submissions/csc108/solution.py + - vars: + submission_file: test_submissions/csc108/correctness_2_submission/correctness_2_submission.py + solution_file: test_submissions/csc108/solution.py - - vars: - submission_file: test_submissions/csc108/efficiency_submission/efficiency_submission.py - solution_file: test_submissions/csc108/solution.py + - vars: + submission_file: test_submissions/csc108/efficiency_submission/efficiency_submission.py + solution_file: test_submissions/csc108/solution.py - - vars: - submission_file: test_submissions/csc108/style_submission/style_submission.py - solution_file: test_submissions/csc108/solution.py + - vars: + submission_file: test_submissions/csc108/style_submission/style_submission.py + solution_file: test_submissions/csc108/solution.py - - vars: - submission_file: test_submissions/gac_example/correct_submission/correct_submission.py - solution_file: test_submissions/gac_example/solution.py + - vars: + submission_file: test_submissions/gac_example/correct_submission/correct_submission.py + solution_file: test_submissions/gac_example/solution.py - - vars: - submission_file: test_submissions/gac_example/fail_submission/fail_submission.py - solution_file: test_submissions/gac_example/solution.py + - vars: + submission_file: test_submissions/gac_example/fail_submission/fail_submission.py + solution_file: test_submissions/gac_example/solution.py - - vars: - submission_file: test_submissions/gac_example/inefficient_submission/inefficient_submission.py - solution_file: test_submissions/gac_example/solution.py + - vars: + submission_file: test_submissions/gac_example/inefficient_submission/inefficient_submission.py + solution_file: test_submissions/gac_example/solution.py - - vars: - submission_file: test_submissions/gac_example/partial_correct_submission/partial_correct_submission.py - solution_file: test_submissions/gac_example/solution.py + - vars: + submission_file: test_submissions/gac_example/partial_correct_submission/partial_correct_submission.py + solution_file: test_submissions/gac_example/solution.py diff --git a/promptfoo/tests/deepseek_v3_tests/deepseek_v3_text_tests.yaml b/promptfoo/tests/deepseek_v3_tests/deepseek_v3_text_tests.yaml index 6c04541..e2c42ca 100644 --- a/promptfoo/tests/deepseek_v3_tests/deepseek_v3_text_tests.yaml +++ b/promptfoo/tests/deepseek_v3_tests/deepseek_v3_text_tests.yaml @@ -6,38 +6,39 @@ defaultTest: model: deepSeek-v3 scope: text submission_type: pdf - -scenarios: - - config: - - vars: { prompt: text_analyze_v3, system_prompt: text_feedback_v3 } - tests: - - vars: - submission_file: test_submissions/data_collection_ethics_module/average_submission/average_submission.txt - solution_file: test_submissions/data_collection_ethics_module/solution.txt - - - vars: - submission_file: test_submissions/data_collection_ethics_module/excellent_submission/excellent_submission.txt - solution_file: test_submissions/data_collection_ethics_module/solution.txt - - - vars: - submission_file: test_submissions/data_collection_ethics_module/off_topic_submission/off_topic_submission.txt - solution_file: test_submissions/data_collection_ethics_module/solution.txt - - - vars: - submission_file: test_submissions/data_collection_ethics_module/weak_submission/weak_submission.txt - solution_file: test_submissions/data_collection_ethics_module/solution.txt - - - vars: - submission_file: test_submissions/csc373_eft_optimality_proof/fail_submission/fail_submission.pdf - solution_file: test_submissions/csc373_eft_optimality_proof/solution.pdf - prompt: text_eft_proof - - - vars: - submission_file: test_submissions/csc373_eft_optimality_proof/incomplete_submission/incomplete_submission.pdf - solution_file: test_submissions/csc373_eft_optimality_proof/solution.pdf - prompt: text_eft_proof - - - vars: - submission_file: test_submissions/csc373_eft_optimality_proof/induction_submission/induction_submission.pdf - solution_file: test_submissions/csc373_eft_optimality_proof/solution.pdf - prompt: text_eft_proof + system_prompt: text_feedback_v3 + +prompts: + - file://../../../ai_feedback/data/prompts/user/text_analyze_v3.md + +tests: + - vars: + submission_file: test_submissions/data_collection_ethics_module/average_submission/average_submission.txt + solution_file: test_submissions/data_collection_ethics_module/solution.txt + + - vars: + submission_file: test_submissions/data_collection_ethics_module/excellent_submission/excellent_submission.txt + solution_file: test_submissions/data_collection_ethics_module/solution.txt + + - vars: + submission_file: test_submissions/data_collection_ethics_module/off_topic_submission/off_topic_submission.txt + solution_file: test_submissions/data_collection_ethics_module/solution.txt + + - vars: + submission_file: test_submissions/data_collection_ethics_module/weak_submission/weak_submission.txt + solution_file: test_submissions/data_collection_ethics_module/solution.txt + + - vars: + submission_file: test_submissions/csc373_eft_optimality_proof/fail_submission/fail_submission.pdf + solution_file: test_submissions/csc373_eft_optimality_proof/solution.pdf + prompt: text_eft_proof + + - vars: + submission_file: test_submissions/csc373_eft_optimality_proof/incomplete_submission/incomplete_submission.pdf + solution_file: test_submissions/csc373_eft_optimality_proof/solution.pdf + prompt: text_eft_proof + + - vars: + submission_file: test_submissions/csc373_eft_optimality_proof/induction_submission/induction_submission.pdf + solution_file: test_submissions/csc373_eft_optimality_proof/solution.pdf + prompt: text_eft_proof diff --git a/promptfoo/tests/llama_3.2_vision_tests/llama_3.2_vision_image_tests.yaml b/promptfoo/tests/llama_3.2_vision_tests/llama_3.2_vision_image_tests.yaml index 1c6a6a0..f9f9d20 100644 --- a/promptfoo/tests/llama_3.2_vision_tests/llama_3.2_vision_image_tests.yaml +++ b/promptfoo/tests/llama_3.2_vision_tests/llama_3.2_vision_image_tests.yaml @@ -7,38 +7,38 @@ defaultTest: scope: image submission_type: jupyter -scenarios: - - config: - - vars: { prompt: image_overall } - tests: - - description: "JSC270 correct submission" - vars: - submission_file: test_submissions/jsc270/correct_submission/correct_submission.ipynb - - - description: "JSC270 correctness issue" - vars: - submission_file: test_submissions/jsc270/correctness_submission/correctness_submission.ipynb - - - description: "JSC270 efficiency issue" - vars: - submission_file: test_submissions/jsc270/efficiency_submission/efficiency_submission.ipynb - - - description: "JSC270 style issue" - vars: - submission_file: test_submissions/jsc270/style_submission/style_submission.ipynb - - - description: "STA130 correct submission" - vars: - submission_file: test_submissions/sta130/correct_submission/correct_submission.ipynb - - - description: "STA130 correctness issue" - vars: - submission_file: test_submissions/sta130/correctness_submission/correctness_submission.ipynb - - - description: "STA130 efficiency issue" - vars: - submission_file: test_submissions/sta130/efficiency_submission/efficiency_submission.ipynb - - - description: "STA130 style issue" - vars: - submission_file: test_submissions/sta130/style_submission/style_submission.ipynb +prompts: + - file://../../../ai_feedback/data/prompts/user/image_overall.md + +tests: + - description: "JSC270 correct submission" + vars: + submission_file: test_submissions/jsc270/correct_submission/correct_submission.ipynb + + - description: "JSC270 correctness issue" + vars: + submission_file: test_submissions/jsc270/correctness_submission/correctness_submission.ipynb + + - description: "JSC270 efficiency issue" + vars: + submission_file: test_submissions/jsc270/efficiency_submission/efficiency_submission.ipynb + + - description: "JSC270 style issue" + vars: + submission_file: test_submissions/jsc270/style_submission/style_submission.ipynb + + - description: "STA130 correct submission" + vars: + submission_file: test_submissions/sta130/correct_submission/correct_submission.ipynb + + - description: "STA130 correctness issue" + vars: + submission_file: test_submissions/sta130/correctness_submission/correctness_submission.ipynb + + - description: "STA130 efficiency issue" + vars: + submission_file: test_submissions/sta130/efficiency_submission/efficiency_submission.ipynb + + - description: "STA130 style issue" + vars: + submission_file: test_submissions/sta130/style_submission/style_submission.ipynb diff --git a/promptfoo/tests/llama_3.2_vision_tests/llama_3.2_vision_image_tests_with_system_prompt.yaml b/promptfoo/tests/llama_3.2_vision_tests/llama_3.2_vision_image_tests_with_system_prompt.yaml index cae0d73..0b00bbe 100644 --- a/promptfoo/tests/llama_3.2_vision_tests/llama_3.2_vision_image_tests_with_system_prompt.yaml +++ b/promptfoo/tests/llama_3.2_vision_tests/llama_3.2_vision_image_tests_with_system_prompt.yaml @@ -8,38 +8,38 @@ defaultTest: submission_type: jupyter system_prompt: image_style_grader -scenarios: - - config: - - vars: { prompt: image_overall } - tests: - - description: "JSC270 correct submission" - vars: - submission_file: test_submissions/jsc270/correct_submission/correct_submission.ipynb - - - description: "JSC270 correctness issue" - vars: - submission_file: test_submissions/jsc270/correctness_submission/correctness_submission.ipynb - - - description: "JSC270 efficiency issue" - vars: - submission_file: test_submissions/jsc270/efficiency_submission/efficiency_submission.ipynb - - - description: "JSC270 style issue" - vars: - submission_file: test_submissions/jsc270/style_submission/style_submission.ipynb - - - description: "STA130 correct submission" - vars: - submission_file: test_submissions/sta130/correct_submission/correct_submission.ipynb - - - description: "STA130 correctness issue" - vars: - submission_file: test_submissions/sta130/correctness_submission/correctness_submission.ipynb - - - description: "STA130 efficiency issue" - vars: - submission_file: test_submissions/sta130/efficiency_submission/efficiency_submission.ipynb - - - description: "STA130 style issue" - vars: - submission_file: test_submissions/sta130/style_submission/style_submission.ipynb +prompts: + - file://../../../ai_feedback/data/prompts/user/image_overall.md + +tests: + - description: "JSC270 correct submission" + vars: + submission_file: test_submissions/jsc270/correct_submission/correct_submission.ipynb + + - description: "JSC270 correctness issue" + vars: + submission_file: test_submissions/jsc270/correctness_submission/correctness_submission.ipynb + + - description: "JSC270 efficiency issue" + vars: + submission_file: test_submissions/jsc270/efficiency_submission/efficiency_submission.ipynb + + - description: "JSC270 style issue" + vars: + submission_file: test_submissions/jsc270/style_submission/style_submission.ipynb + + - description: "STA130 correct submission" + vars: + submission_file: test_submissions/sta130/correct_submission/correct_submission.ipynb + + - description: "STA130 correctness issue" + vars: + submission_file: test_submissions/sta130/correctness_submission/correctness_submission.ipynb + + - description: "STA130 efficiency issue" + vars: + submission_file: test_submissions/sta130/efficiency_submission/efficiency_submission.ipynb + + - description: "STA130 style issue" + vars: + submission_file: test_submissions/sta130/style_submission/style_submission.ipynb diff --git a/promptfoo/tests/llava_tests/llava_34b_image_tests.yaml b/promptfoo/tests/llava_tests/llava_34b_image_tests.yaml index 3e586bf..72c87c6 100644 --- a/promptfoo/tests/llava_tests/llava_34b_image_tests.yaml +++ b/promptfoo/tests/llava_tests/llava_34b_image_tests.yaml @@ -7,38 +7,38 @@ defaultTest: scope: image submission_type: jupyter -scenarios: - - config: - - vars: { prompt: image_overall } - tests: - - description: "JSC270 correct submission" - vars: - submission_file: test_submissions/jsc270/correct_submission/correct_submission.ipynb - - - description: "JSC270 correctness issue" - vars: - submission_file: test_submissions/jsc270/correctness_submission/correctness_submission.ipynb - - - description: "JSC270 efficiency issue" - vars: - submission_file: test_submissions/jsc270/efficiency_submission/efficiency_submission.ipynb - - - description: "JSC270 style issue" - vars: - submission_file: test_submissions/jsc270/style_submission/style_submission.ipynb - - - description: "STA130 correct submission" - vars: - submission_file: test_submissions/sta130/correct_submission/correct_submission.ipynb - - - description: "STA130 correctness issue" - vars: - submission_file: test_submissions/sta130/correctness_submission/correctness_submission.ipynb - - - description: "STA130 efficiency issue" - vars: - submission_file: test_submissions/sta130/efficiency_submission/efficiency_submission.ipynb - - - description: "STA130 style issue" - vars: - submission_file: test_submissions/sta130/style_submission/style_submission.ipynb +prompts: + - file://../../../ai_feedback/data/prompts/user/image_overall.md + +tests: + - description: "JSC270 correct submission" + vars: + submission_file: test_submissions/jsc270/correct_submission/correct_submission.ipynb + + - description: "JSC270 correctness issue" + vars: + submission_file: test_submissions/jsc270/correctness_submission/correctness_submission.ipynb + + - description: "JSC270 efficiency issue" + vars: + submission_file: test_submissions/jsc270/efficiency_submission/efficiency_submission.ipynb + + - description: "JSC270 style issue" + vars: + submission_file: test_submissions/jsc270/style_submission/style_submission.ipynb + + - description: "STA130 correct submission" + vars: + submission_file: test_submissions/sta130/correct_submission/correct_submission.ipynb + + - description: "STA130 correctness issue" + vars: + submission_file: test_submissions/sta130/correctness_submission/correctness_submission.ipynb + + - description: "STA130 efficiency issue" + vars: + submission_file: test_submissions/sta130/efficiency_submission/efficiency_submission.ipynb + + - description: "STA130 style issue" + vars: + submission_file: test_submissions/sta130/style_submission/style_submission.ipynb diff --git a/promptfoo/tests/llava_tests/llava_34b_image_tests_with_system_prompt.yaml b/promptfoo/tests/llava_tests/llava_34b_image_tests_with_system_prompt.yaml index 94580d2..26dbfad 100644 --- a/promptfoo/tests/llava_tests/llava_34b_image_tests_with_system_prompt.yaml +++ b/promptfoo/tests/llava_tests/llava_34b_image_tests_with_system_prompt.yaml @@ -8,38 +8,38 @@ defaultTest: submission_type: jupyter system_prompt: "You are a teaching-assistant for an undergraduate Data-Visualization course. A student submits one figure (PNG/JPG/PDF screenshot) with no accompanying text." -scenarios: - - config: - - vars: { prompt: image_overall } - tests: - - description: "JSC270 correct submission" - vars: - submission_file: test_submissions/jsc270/correct_submission/correct_submission.ipynb - - - description: "JSC270 correctness issue" - vars: - submission_file: test_submissions/jsc270/correctness_submission/correctness_submission.ipynb - - - description: "JSC270 efficiency issue" - vars: - submission_file: test_submissions/jsc270/efficiency_submission/efficiency_submission.ipynb - - - description: "JSC270 style issue" - vars: - submission_file: test_submissions/jsc270/style_submission/style_submission.ipynb - - - description: "STA130 correct submission" - vars: - submission_file: test_submissions/sta130/correct_submission/correct_submission.ipynb - - - description: "STA130 correctness issue" - vars: - submission_file: test_submissions/sta130/correctness_submission/correctness_submission.ipynb - - - description: "STA130 efficiency issue" - vars: - submission_file: test_submissions/sta130/efficiency_submission/efficiency_submission.ipynb - - - description: "STA130 style issue" - vars: - submission_file: test_submissions/sta130/style_submission/style_submission.ipynb +prompts: + - file://../../../ai_feedback/data/prompts/user/image_overall.md + +tests: + - description: "JSC270 correct submission" + vars: + submission_file: test_submissions/jsc270/correct_submission/correct_submission.ipynb + + - description: "JSC270 correctness issue" + vars: + submission_file: test_submissions/jsc270/correctness_submission/correctness_submission.ipynb + + - description: "JSC270 efficiency issue" + vars: + submission_file: test_submissions/jsc270/efficiency_submission/efficiency_submission.ipynb + + - description: "JSC270 style issue" + vars: + submission_file: test_submissions/jsc270/style_submission/style_submission.ipynb + + - description: "STA130 correct submission" + vars: + submission_file: test_submissions/sta130/correct_submission/correct_submission.ipynb + + - description: "STA130 correctness issue" + vars: + submission_file: test_submissions/sta130/correctness_submission/correctness_submission.ipynb + + - description: "STA130 efficiency issue" + vars: + submission_file: test_submissions/sta130/efficiency_submission/efficiency_submission.ipynb + + - description: "STA130 style issue" + vars: + submission_file: test_submissions/sta130/style_submission/style_submission.ipynb From 2e02f7babd2c60ea2114372505a2caa1ec4b6183 Mon Sep 17 00:00:00 2001 From: Will Kukkamalla Date: Tue, 8 Jul 2025 13:35:45 -0400 Subject: [PATCH 03/12] updated runner to use prompt_text --- promptfoo/promptfoo_test_runner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/promptfoo/promptfoo_test_runner.py b/promptfoo/promptfoo_test_runner.py index d2c4946..7c80810 100644 --- a/promptfoo/promptfoo_test_runner.py +++ b/promptfoo/promptfoo_test_runner.py @@ -34,8 +34,8 @@ def call_api(prompt: str, context: dict, metadata: dict) -> dict: options["scope"], "--model", options["model"], - "--prompt", - options['prompt'], + "--prompt_text", + prompt, "--llama_mode", "server", '--submission_type', From fb150b95ca3f48104b733e5b6ec8f3f8c02bcc21 Mon Sep 17 00:00:00 2001 From: Will Kukkamalla Date: Tue, 8 Jul 2025 13:36:31 -0400 Subject: [PATCH 04/12] changed library to use prompt_text as a possble standalone prompt --- ai_feedback/__main__.py | 4 ++-- ai_feedback/helpers/constants.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ai_feedback/__main__.py b/ai_feedback/__main__.py index 57813fb..bbe3e2f 100644 --- a/ai_feedback/__main__.py +++ b/ai_feedback/__main__.py @@ -193,8 +193,8 @@ def main() -> int: prompt = load_markdown_prompt(args.prompt) prompt_content += prompt["prompt_content"] - if args.prompt_text: - prompt_content += args.prompt_text + if args.prompt_text: + prompt_content += args.prompt_text if args.scope == "image": prompt["prompt_content"] = prompt_content diff --git a/ai_feedback/helpers/constants.py b/ai_feedback/helpers/constants.py index ecb716b..139ea33 100644 --- a/ai_feedback/helpers/constants.py +++ b/ai_feedback/helpers/constants.py @@ -2,7 +2,7 @@ HELP_MESSAGES = { "submission_type": "The format of the submission file (e.g., Jupyter notebook, Python script).", "prompt": "The specific prompt to use for evaluating the assignment.", - "prompt_text": "Additional messages to concatenate to the prompt.", + "prompt_text": "Optional standalone prompt or additional text to append to the base prompt.", "prompt_custom": "The path to a prompt to use.", "scope": "The section of the assignment the model should analyze (e.g., code or image).", "submission": "The file path for the submission file.", From f9b440ed446bbc28abaca5ff6016741b1c4e25fb Mon Sep 17 00:00:00 2001 From: Will Kukkamalla Date: Tue, 8 Jul 2025 13:38:49 -0400 Subject: [PATCH 05/12] uncommented --- .../deepseek_r1_code_tests.yaml | 93 +++++++++---------- 1 file changed, 46 insertions(+), 47 deletions(-) diff --git a/promptfoo/tests/deepseek_r1_tests/deepseek_r1_code_tests.yaml b/promptfoo/tests/deepseek_r1_tests/deepseek_r1_code_tests.yaml index a143370..139f032 100644 --- a/promptfoo/tests/deepseek_r1_tests/deepseek_r1_code_tests.yaml +++ b/promptfoo/tests/deepseek_r1_tests/deepseek_r1_code_tests.yaml @@ -15,51 +15,50 @@ tests: submission_file: test_submissions/csc263_opt_connected/correct_submission/correct_submission.py solution_file: test_submissions/csc263_opt_connected/solution.py + - vars: + submission_file: test_submissions/csc263_opt_connected/fail_submission/fail_submission.py + solution_file: test_submissions/csc263_opt_connected/solution.py + + - vars: + submission_file: test_submissions/csc263_opt_connected/incorrect_algo_submission/incorrect_algo_submission.py + solution_file: test_submissions/csc263_opt_connected/solution.py + + - vars: + submission_file: test_submissions/csc263_opt_connected/style_issues_submission/style_issues_submission.py + solution_file: test_submissions/csc263_opt_connected/solution.py + + - vars: + submission_file: test_submissions/csc108/correct_submission/correct_submission.py + solution_file: test_submissions/csc108/solution.py + + - vars: + submission_file: test_submissions/csc108/correctness_1_submission/correctness_1_submission.py + solution_file: test_submissions/csc108/solution.py + + - vars: + submission_file: test_submissions/csc108/correctness_2_submission/correctness_2_submission.py + solution_file: test_submissions/csc108/solution.py -# - vars: -# submission_file: test_submissions/csc263_opt_connected/fail_submission/fail_submission.py -# solution_file: test_submissions/csc263_opt_connected/solution.py -# -# - vars: -# submission_file: test_submissions/csc263_opt_connected/incorrect_algo_submission/incorrect_algo_submission.py -# solution_file: test_submissions/csc263_opt_connected/solution.py -# -# - vars: -# submission_file: test_submissions/csc263_opt_connected/style_issues_submission/style_issues_submission.py -# solution_file: test_submissions/csc263_opt_connected/solution.py -# -# - vars: -# submission_file: test_submissions/csc108/correct_submission/correct_submission.py -# solution_file: test_submissions/csc108/solution.py -# -# - vars: -# submission_file: test_submissions/csc108/correctness_1_submission/correctness_1_submission.py -# solution_file: test_submissions/csc108/solution.py -# -# - vars: -# submission_file: test_submissions/csc108/correctness_2_submission/correctness_2_submission.py -# solution_file: test_submissions/csc108/solution.py -# -# - vars: -# submission_file: test_submissions/csc108/efficiency_submission/efficiency_submission.py -# solution_file: test_submissions/csc108/solution.py -# -# - vars: -# submission_file: test_submissions/csc108/style_submission/style_submission.py -# solution_file: test_submissions/csc108/solution.py -# -# - vars: -# submission_file: test_submissions/gac_example/correct_submission/correct_submission.py -# solution_file: test_submissions/gac_example/solution.py -# -# - vars: -# submission_file: test_submissions/gac_example/fail_submission/fail_submission.py -# solution_file: test_submissions/gac_example/solution.py -# -# - vars: -# submission_file: test_submissions/gac_example/inefficient_submission/inefficient_submission.py -# solution_file: test_submissions/gac_example/solution.py -# -# - vars: -# submission_file: test_submissions/gac_example/partial_correct_submission/partial_correct_submission.py -# solution_file: test_submissions/gac_example/solution.py + - vars: + submission_file: test_submissions/csc108/efficiency_submission/efficiency_submission.py + solution_file: test_submissions/csc108/solution.py + + - vars: + submission_file: test_submissions/csc108/style_submission/style_submission.py + solution_file: test_submissions/csc108/solution.py + + - vars: + submission_file: test_submissions/gac_example/correct_submission/correct_submission.py + solution_file: test_submissions/gac_example/solution.py + + - vars: + submission_file: test_submissions/gac_example/fail_submission/fail_submission.py + solution_file: test_submissions/gac_example/solution.py + + - vars: + submission_file: test_submissions/gac_example/inefficient_submission/inefficient_submission.py + solution_file: test_submissions/gac_example/solution.py + + - vars: + submission_file: test_submissions/gac_example/partial_correct_submission/partial_correct_submission.py + solution_file: test_submissions/gac_example/solution.py From 1db731fa96fbe81d2d6e2e1ba99f7733efc388b2 Mon Sep 17 00:00:00 2001 From: Will Kukkamalla Date: Tue, 8 Jul 2025 13:40:11 -0400 Subject: [PATCH 06/12] updated readme --- README.md | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 36b93d0..7d8b26d 100644 --- a/README.md +++ b/README.md @@ -26,24 +26,24 @@ For the image scope, the program takes up to two files, depending on the prompt - Saves response output in Markdown format with a predefined template or prints to stdout. ## Argument Details -| Argument | Description | Required | -|----------------------|-------------------------------------------------------------------|----------| -| `--submission_type` | Type of submission (from `arg_options.FileType`) | ❌ | -| `--prompt` | The name of a preddefined prompt file (from `arg_options.Prompt`) | ❌ **| -| `--prompt_text` | Additional string text prompt that can be fed to model. | ❌ ** | -| `--prompt_custom` | The name of prompt file uploaded to be used by model. | ❌ ** | -| `--scope` | Processing scope (`image` or `code` or `text`) | ✅ | -| `--submission` | Submission file path | ✅ | -| `--question` | Specific question to evaluate | ❌ | -| `--model` | Model type (from `arg_options.Models`) | ✅ | -| `--output` | File path for where to record the output | ❌ | -| `--solution` | File path for the solution file | ❌ | -| `--test_output` | File path for the file containing the results from tests | ❌ | -| `--submission_image` | File path for the submission image file | ❌ | -| `--solution_image` | File path for the solution image file | ❌ | -| `--system_prompt` | File path for the system instructions prompt | ❌ | -| `--llama_mode` | How to invoke deepSeek-v3 (choices in `arg_options.LlamaMode`) | ❌ | -| `--output_template` | Output template file (from `arg_options.OutputTemplate) | ❌ | +| Argument | Description | Required | +|----------------------|------------------------------------------------------------------------------|----------| +| `--submission_type` | Type of submission (from `arg_options.FileType`) | ❌ | +| `--prompt` | The name of a preddefined prompt file (from `arg_options.Prompt`) | ❌ **| +| `--prompt_text` | Additional string text prompt that can be fed to model or standalone prompt. | ❌ ** | +| `--prompt_custom` | The name of prompt file uploaded to be used by model. | ❌ ** | +| `--scope` | Processing scope (`image` or `code` or `text`) | ✅ | +| `--submission` | Submission file path | ✅ | +| `--question` | Specific question to evaluate | ❌ | +| `--model` | Model type (from `arg_options.Models`) | ✅ | +| `--output` | File path for where to record the output | ❌ | +| `--solution` | File path for the solution file | ❌ | +| `--test_output` | File path for the file containing the results from tests | ❌ | +| `--submission_image` | File path for the submission image file | ❌ | +| `--solution_image` | File path for the solution image file | ❌ | +| `--system_prompt` | File path for the system instructions prompt | ❌ | +| `--llama_mode` | How to invoke deepSeek-v3 (choices in `arg_options.LlamaMode`) | ❌ | +| `--output_template` | Output template file (from `arg_options.OutputTemplate) | ❌ | ** One of either prompt, prompt_custom, or prompt_text must be selected. ## Scope From 4a41ef38597ddf165c35a74fb7db3b7385ca0700 Mon Sep 17 00:00:00 2001 From: Will Kukkamalla Date: Tue, 8 Jul 2025 13:51:35 -0400 Subject: [PATCH 07/12] fixed image prompt not being instatiated --- ai_feedback/__main__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ai_feedback/__main__.py b/ai_feedback/__main__.py index bbe3e2f..f2c40e5 100644 --- a/ai_feedback/__main__.py +++ b/ai_feedback/__main__.py @@ -174,6 +174,7 @@ def main() -> int: with open(system_prompt_path, encoding='utf-8') as file: system_instructions = file.read() + prompt = {} if args.prompt_custom: prompt_filename = os.path.join("./", args.prompt_custom) with open(prompt_filename, encoding='utf-8') as prompt_file: @@ -195,6 +196,7 @@ def main() -> int: if args.prompt_text: prompt_content += args.prompt_text + prompt["prompt_content"] = prompt_content if args.scope == "image": prompt["prompt_content"] = prompt_content From 1e4d773b424f14114eaa47905dc62998980215b3 Mon Sep 17 00:00:00 2001 From: Will Kukkamalla Date: Tue, 8 Jul 2025 15:18:41 -0400 Subject: [PATCH 08/12] spacing --- promptfoo/tests/deepseek_r1_tests/deepseek_r1_text_tests.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/promptfoo/tests/deepseek_r1_tests/deepseek_r1_text_tests.yaml b/promptfoo/tests/deepseek_r1_tests/deepseek_r1_text_tests.yaml index 2e9366e..62cdc17 100644 --- a/promptfoo/tests/deepseek_r1_tests/deepseek_r1_text_tests.yaml +++ b/promptfoo/tests/deepseek_r1_tests/deepseek_r1_text_tests.yaml @@ -6,6 +6,7 @@ defaultTest: model: deepSeek-R1:70B scope: text submission_type: pdf + prompts: - file://../../../ai_feedback/data/prompts/user/text_analyze_r1.md From 36a4fce7114e82f7b572fded28efc145b90ecd11 Mon Sep 17 00:00:00 2001 From: Will Kukkamalla Date: Wed, 9 Jul 2025 13:28:38 -0400 Subject: [PATCH 09/12] remove prompt init --- ai_feedback/__main__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/ai_feedback/__main__.py b/ai_feedback/__main__.py index 1df8581..ae0ca8f 100644 --- a/ai_feedback/__main__.py +++ b/ai_feedback/__main__.py @@ -215,7 +215,6 @@ def main() -> int: args.submission_type = detect_submission_type(args.submission) prompt_content = "" - prompt = {} system_instructions = load_system_prompt_content(args.system_prompt) if args.prompt: @@ -236,7 +235,6 @@ def main() -> int: if args.prompt_text: prompt_content += args.prompt_text - prompt["prompt_content"] = prompt_content if args.scope == "image": prompt = {"prompt_content": prompt_content} From dadf56c2dd326452c53a30317159e485fa6f23c9 Mon Sep 17 00:00:00 2001 From: Will Kukkamalla Date: Wed, 9 Jul 2025 13:38:39 -0400 Subject: [PATCH 10/12] updated docs and help message for --prompt_text --- README.md | 34 ++++++++++++++++---------------- ai_feedback/helpers/constants.py | 2 +- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index dcabf4c..a84cab3 100644 --- a/README.md +++ b/README.md @@ -26,24 +26,24 @@ For the image scope, the program takes up to two files, depending on the prompt - Saves response output in Markdown format with a predefined template or prints to stdout. ## Argument Details -| Argument | Description | Required | -|----------------------|-------------------------------------------------------------------|----------| -| `--submission_type` | Type of submission (from `arg_options.FileType`) | ❌ | -| `--prompt` | Pre-defined prompt name or file path to custom prompt file | ❌ **| -| `--prompt_text` | Additional string text prompt that can be fed to model or standalone prompt. | ❌ ** | -| `--scope` | Processing scope (`image` or `code` or `text`) | ✅ | -| `--submission` | Submission file path | ✅ | -| `--question` | Specific question to evaluate | ❌ | -| `--model` | Model type (from `arg_options.Models`) | ✅ | -| `--output` | File path for where to record the output | ❌ | -| `--solution` | File path for the solution file | ❌ | -| `--test_output` | File path for the file containing the results from tests | ❌ | -| `--submission_image` | File path for the submission image file | ❌ | -| `--solution_image` | File path for the solution image file | ❌ | +| Argument | Description | Required | +|----------------------|---------------------------------------------------------------------|----------| +| `--submission_type` | Type of submission (from `arg_options.FileType`) | ❌ | +| `--prompt` | Pre-defined prompt name or file path to custom prompt file | ❌ **| +| `--prompt_text` | String prompt | ❌ ** | +| `--scope` | Processing scope (`image` or `code` or `text`) | ✅ | +| `--submission` | Submission file path | ✅ | +| `--question` | Specific question to evaluate | ❌ | +| `--model` | Model type (from `arg_options.Models`) | ✅ | +| `--output` | File path for where to record the output | ❌ | +| `--solution` | File path for the solution file | ❌ | +| `--test_output` | File path for the file containing the results from tests | ❌ | +| `--submission_image` | File path for the submission image file | ❌ | +| `--solution_image` | File path for the solution image file | ❌ | | `--system_prompt` | Pre-defined system prompt name or file path to custom system prompt | ❌ | -| `--llama_mode` | How to invoke deepSeek-v3 (choices in `arg_options.LlamaMode`) | ❌ | -| `--output_template` | Output template file (from `arg_options.OutputTemplate) | ❌ | -** One of either `--prompt` or `--prompt_text` must be selected. +| `--llama_mode` | How to invoke deepSeek-v3 (choices in `arg_options.LlamaMode`) | ❌ | +| `--output_template` | Output template file (from `arg_options.OutputTemplate) | ❌ | +** One of either `--prompt` or `--prompt_text` must be selected. If both are provided, `--prompt_text` will be appended to the contents of the file specified by `--prompt`. ## Scope The program supports three scopes: code or text or image. Depending on which is selected, the program supports different models and prompts tailored for each option. diff --git a/ai_feedback/helpers/constants.py b/ai_feedback/helpers/constants.py index 5d50631..592e16b 100644 --- a/ai_feedback/helpers/constants.py +++ b/ai_feedback/helpers/constants.py @@ -2,7 +2,7 @@ HELP_MESSAGES = { "submission_type": "The format of the submission file (e.g., Jupyter notebook, Python script).", "prompt": "Pre-defined prompt name (from ai_feedback/data/prompts/user/) or file path to custom prompt file.", - "prompt_text": "Optional standalone prompt or additional text to append to the base prompt.", + "prompt_text": "The string prompt that is sent to the model", "scope": "The section of the assignment the model should analyze (e.g., code or image).", "submission": "The file path for the submission file.", "solution": "The file path for the solution file.", From aff0be21fd21e797d81f513581682dfd1543ee1c Mon Sep 17 00:00:00 2001 From: Will Kukkamalla Date: Wed, 9 Jul 2025 15:57:08 -0400 Subject: [PATCH 11/12] added prompts to remote tests --- .../tests/remote_tests/remote_code_tests.yaml | 94 +++++++++---------- .../tests/remote_tests/remote_text_tests.yaml | 65 +++++++------ 2 files changed, 79 insertions(+), 80 deletions(-) diff --git a/promptfoo/tests/remote_tests/remote_code_tests.yaml b/promptfoo/tests/remote_tests/remote_code_tests.yaml index 61030c7..60a24d0 100644 --- a/promptfoo/tests/remote_tests/remote_code_tests.yaml +++ b/promptfoo/tests/remote_tests/remote_code_tests.yaml @@ -7,62 +7,62 @@ defaultTest: scope: code submission_type: python -scenarios: - - config: - - vars: { prompt: code_table } - - vars: { prompt: code_explanation } - - vars: { prompt: code_hint } - - vars: { prompt: code_lines } - - vars: { prompt: code_template } - tests: - - vars: - submission_file: test_submissions/csc263_opt_connected/correct_submission/correct_submission.py - solution_file: test_submissions/csc263_opt_connected/solution.py +prompts: + - file://../../../ai_feedback/data/prompts/user/code_table.md + - file://../../../ai_feedback/data/prompts/user/code_explanation.md + - file://../../../ai_feedback/data/prompts/user/code_hint.md + - file://../../../ai_feedback/data/prompts/user/code_lines.md + - file://../../../ai_feedback/data/prompts/user/code_template.md - - vars: - submission_file: test_submissions/csc263_opt_connected/fail_submission/fail_submission.py - solution_file: test_submissions/csc263_opt_connected/solution.py +tests: + - vars: + submission_file: test_submissions/csc263_opt_connected/correct_submission/correct_submission.py + solution_file: test_submissions/csc263_opt_connected/solution.py - - vars: - submission_file: test_submissions/csc263_opt_connected/incorrect_algo_submission/incorrect_algo_submission.py - solution_file: test_submissions/csc263_opt_connected/solution.py + - vars: + submission_file: test_submissions/csc263_opt_connected/fail_submission/fail_submission.py + solution_file: test_submissions/csc263_opt_connected/solution.py - - vars: - submission_file: test_submissions/csc263_opt_connected/style_issues_submission/style_issues_submission.py - solution_file: test_submissions/csc263_opt_connected/solution.py + - vars: + submission_file: test_submissions/csc263_opt_connected/incorrect_algo_submission/incorrect_algo_submission.py + solution_file: test_submissions/csc263_opt_connected/solution.py - - vars: - submission_file: test_submissions/csc108/correct_submission/correct_submission.py - solution_file: test_submissions/csc108/solution.py + - vars: + submission_file: test_submissions/csc263_opt_connected/style_issues_submission/style_issues_submission.py + solution_file: test_submissions/csc263_opt_connected/solution.py - - vars: - submission_file: test_submissions/csc108/correctness_1_submission/correctness_1_submission.py - solution_file: test_submissions/csc108/solution.py + - vars: + submission_file: test_submissions/csc108/correct_submission/correct_submission.py + solution_file: test_submissions/csc108/solution.py - - vars: - submission_file: test_submissions/csc108/correctness_2_submission/correctness_2_submission.py - solution_file: test_submissions/csc108/solution.py + - vars: + submission_file: test_submissions/csc108/correctness_1_submission/correctness_1_submission.py + solution_file: test_submissions/csc108/solution.py - - vars: - submission_file: test_submissions/csc108/efficiency_submission/efficiency_submission.py - solution_file: test_submissions/csc108/solution.py + - vars: + submission_file: test_submissions/csc108/correctness_2_submission/correctness_2_submission.py + solution_file: test_submissions/csc108/solution.py - - vars: - submission_file: test_submissions/csc108/style_submission/style_submission.py - solution_file: test_submissions/csc108/solution.py + - vars: + submission_file: test_submissions/csc108/efficiency_submission/efficiency_submission.py + solution_file: test_submissions/csc108/solution.py - - vars: - submission_file: test_submissions/gac_example/correct_submission/correct_submission.py - solution_file: test_submissions/gac_example/solution.py + - vars: + submission_file: test_submissions/csc108/style_submission/style_submission.py + solution_file: test_submissions/csc108/solution.py - - vars: - submission_file: test_submissions/gac_example/fail_submission/fail_submission.py - solution_file: test_submissions/gac_example/solution.py + - vars: + submission_file: test_submissions/gac_example/correct_submission/correct_submission.py + solution_file: test_submissions/gac_example/solution.py - - vars: - submission_file: test_submissions/gac_example/inefficient_submission/inefficient_submission.py - solution_file: test_submissions/gac_example/solution.py + - vars: + submission_file: test_submissions/gac_example/fail_submission/fail_submission.py + solution_file: test_submissions/gac_example/solution.py - - vars: - submission_file: test_submissions/gac_example/partial_correct_submission/partial_correct_submission.py - solution_file: test_submissions/gac_example/solution.py + - vars: + submission_file: test_submissions/gac_example/inefficient_submission/inefficient_submission.py + solution_file: test_submissions/gac_example/solution.py + + - vars: + submission_file: test_submissions/gac_example/partial_correct_submission/partial_correct_submission.py + solution_file: test_submissions/gac_example/solution.py diff --git a/promptfoo/tests/remote_tests/remote_text_tests.yaml b/promptfoo/tests/remote_tests/remote_text_tests.yaml index d7e8a0c..4660e7c 100644 --- a/promptfoo/tests/remote_tests/remote_text_tests.yaml +++ b/promptfoo/tests/remote_tests/remote_text_tests.yaml @@ -5,36 +5,35 @@ defaultTest: vars: model: remote scope: text - submission_type: pdf - -scenarios: - - config: - - vars: { prompt: text_pdf_analyze } - tests: - - vars: - submission_file: test_submissions/data_collection_ethics_module/average_submission/average_submission.txt - solution_file: test_submissions/data_collection_ethics_module/solution.txt - - - vars: - submission_file: test_submissions/data_collection_ethics_module/excellent_submission/excellent_submission.txt - solution_file: test_submissions/data_collection_ethics_module/solution.txt - - - vars: - submission_file: test_submissions/data_collection_ethics_module/off_topic_submission/off_topic_submission.txt - solution_file: test_submissions/data_collection_ethics_module/solution.txt - - - vars: - submission_file: test_submissions/data_collection_ethics_module/weak_submission/weak_submission.txt - solution_file: test_submissions/data_collection_ethics_module/solution.txt - - - vars: - submission_file: test_submissions/csc373_eft_optimality_proof/fail_submission/fail_submission.pdf - solution_file: test_submissions/csc373_eft_optimality_proof/solution.pdf - - - vars: - submission_file: test_submissions/csc373_eft_optimality_proof/incomplete_submission/incomplete_submission.pdf - solution_file: test_submissions/csc373_eft_optimality_proof/solution.pdf - - - vars: - submission_file: test_submissions/csc373_eft_optimality_proof/induction_submission/induction_submission.pdf - solution_file: test_submissions/csc373_eft_optimality_proof/solution.pdf + +prompts: + - file://../../../ai_feedback/data/prompts/user/text_pdf_analyze.md + +tests: + - vars: + submission_file: test_submissions/data_collection_ethics_module/average_submission/average_submission.txt + solution_file: test_submissions/data_collection_ethics_module/solution.txt + + - vars: + submission_file: test_submissions/data_collection_ethics_module/excellent_submission/excellent_submission.txt + solution_file: test_submissions/data_collection_ethics_module/solution.txt + + - vars: + submission_file: test_submissions/data_collection_ethics_module/off_topic_submission/off_topic_submission.txt + solution_file: test_submissions/data_collection_ethics_module/solution.txt + + - vars: + submission_file: test_submissions/data_collection_ethics_module/weak_submission/weak_submission.txt + solution_file: test_submissions/data_collection_ethics_module/solution.txt + + - vars: + submission_file: test_submissions/csc373_eft_optimality_proof/fail_submission/fail_submission.pdf + solution_file: test_submissions/csc373_eft_optimality_proof/solution.pdf + + - vars: + submission_file: test_submissions/csc373_eft_optimality_proof/incomplete_submission/incomplete_submission.pdf + solution_file: test_submissions/csc373_eft_optimality_proof/solution.pdf + + - vars: + submission_file: test_submissions/csc373_eft_optimality_proof/induction_submission/induction_submission.pdf + solution_file: test_submissions/csc373_eft_optimality_proof/solution.pdf From 0a27eeca3d95af87ff1893ea7764e62fc336da01 Mon Sep 17 00:00:00 2001 From: Will Kukkamalla Date: Wed, 9 Jul 2025 15:58:20 -0400 Subject: [PATCH 12/12] removed submission type --- promptfoo/tests/remote_tests/remote_code_tests.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/promptfoo/tests/remote_tests/remote_code_tests.yaml b/promptfoo/tests/remote_tests/remote_code_tests.yaml index 60a24d0..192832e 100644 --- a/promptfoo/tests/remote_tests/remote_code_tests.yaml +++ b/promptfoo/tests/remote_tests/remote_code_tests.yaml @@ -5,7 +5,6 @@ defaultTest: vars: model: remote scope: code - submission_type: python prompts: - file://../../../ai_feedback/data/prompts/user/code_table.md