
Commit 90a7f36

Updated MarkUs test scripts (#31)
1 parent 26d8c27 commit 90a7f36

5 files changed: 60 additions & 8 deletions

markus_test_scripts/python_tester/llm_helpers.py

Lines changed: 53 additions & 5 deletions
@@ -1,3 +1,4 @@
+import ast
 import json
 import os
 import re
@@ -14,11 +15,41 @@
 Each annotation should include: filename: The name of the student's file. content:
 A short description of the mistake. line_start and line_end: The line number(s) where the
 mistake occurs. Ensure the JSON is valid and properly formatted. Here is a sample format
-of the json array to return: {{ "annotations": [{{"filename": "student_code.py",
-"content": "Variable 'x' is unused.", "line_start": 5, "line_end": 5}}] }}.
+of the json array to return: {{
+    "annotations": [
+        {{
+            "filename": "submission.py",
+            "content": "The variable 'result' is assigned but never used.",
+            "line_start": 3,
+            "line_end": 3,
+            "column_start": 4,
+            "column_end": 16
+        }},
+        {{
+            "filename": "submission.py",
+            "content": "Missing parentheses in the print statement. Use print('Hello') instead.",
+            "line_start": 5,
+            "line_end": 5,
+            "column_start": 0,
+            "column_end": 20
+        }},
+        {{
+            "filename": "submission.py",
+            "content": "The function 'calculate_sum' is called but not defined.",
+            "line_start": 10,
+            "line_end": 10,
+            "column_start": 0,
+            "column_end": 15
+        }}
+    ]
+}}
+
 ONLY return the json object and nothing else. Make sure the line #s don't exceed
 the number of lines in the file. You can use markdown syntax in the annotation's content,
-especially when denoting code."""
+especially when denoting code.
+
+Use only double quotes in your response
+"""


 def add_annotation_columns(annotations: List[Dict[str, Any]], submission: Any) -> List[Dict[str, Any]]:
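
Note that every literal brace in the sample is doubled ({{ and }}). That is the str.format escape convention, suggesting ANNOTATION_PROMPT is later substituted into a format-style template; the brace escaping added to python_tester_llm_code.py below fits the same picture. A minimal sketch of the mechanics, with an invented template and placeholder (not from this repo):

    template = 'Return JSON like {{"annotations": []}} for {filename}.'
    print(template.format(filename="submission.py"))
    # -> Return JSON like {"annotations": []} for submission.py.
    # Without the doubled braces, format() would parse {"annotations": ...}
    # as a replacement field and raise KeyError.
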
@@ -92,13 +123,14 @@ def run_llm(
     submission_path: str,
     model: str,
     scope: str,
-    submission_type: Optional[str],
+    submission_type: Optional[str] = None,
     submission_image: Optional[str] = None,
     question: Optional[str] = None,
     prompt_text: Optional[str] = None,
     prompt: Optional[str] = None,
     json_schema: Optional[str] = None,
     output: Optional[str] = None,
+    model_options: Optional[str] = None,
 ) -> str:
     """
     Executes the LLM feedback generator script and captures its output.
@@ -114,6 +146,9 @@ def run_llm(
         prompt: Name of predefined prompt file to use.
         json_schema: Optional JSON schema to format the response.
         output: filepath of output file.
+        model_options: model options to pass to the llm
+        submission_image: An optional file path to the image to give feedback to.
+        json_schema: Optional JSON schema to format the response.

     Returns:
         The output from the LLM feedback generator as a string, or an error message.
@@ -142,6 +177,8 @@ def run_llm(
         llm_command += ["--output", output]
     if submission_image:
         llm_command += ["--submission_image", submission_image]
+    if model_options:
+        llm_command += ["--model_options", model_options]
     if submission_type:
         llm_command += ["--submission_type", submission_type]
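
The flag is forwarded verbatim: run_llm appends --model_options and the raw option string to the subprocess argument list and leaves interpretation to the feedback generator script. A quick sketch of the fragment this produces (the base command below is a stand-in; the real one is assembled earlier in run_llm and is not part of this hunk):

    llm_command = ["llm_feedback_generator"]  # stand-in, not the real base command
    model_options = "max_tokens=1000"
    if model_options:
        llm_command += ["--model_options", model_options]
    print(llm_command)
    # -> ['llm_feedback_generator', '--model_options', 'max_tokens=1000']
    # How several options would be packed into one string (commas?) is the
    # generator's convention; this diff does not specify it.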

@@ -155,6 +192,17 @@
     return llm_result.stdout.strip()


+def safe_eval_dict(match: str):
+    try:
+        return json.loads(match)  # Try strict JSON first
+    except json.JSONDecodeError:
+        try:
+            return ast.literal_eval(match)  # Fall back to Python-style dict
+        except (SyntaxError, ValueError) as e:
+            print(f"[SKIPPED] Malformed match: {match[:80]}... Reason: {e}")
+            return None
+
+
 def extract_json(response: str) -> List[Dict[str, Any]]:
     """
     Extracts JSON objects embedded in a string.
@@ -166,7 +214,7 @@ def extract_json(response: str) -> List[Dict[str, Any]]:
         A list of parsed JSON dictionaries extracted from the input string.
     """
     matches = re.findall(r"(\{(?:[^{}]|(?:\{(?:[^{}]|(?:\{[^{}]*\}))*\}))*\})", response)
-    return [json.loads(match) for match in matches]
+    return [parsed for match in matches if (parsed := safe_eval_dict(match))]


 MINIMUM_ANNOTATION_WIDTH = 8
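
Together these two changes make extraction tolerant of near-JSON output: each brace-balanced regex match is parsed as strict JSON first, then as a Python literal, and anything unparseable is logged and dropped (the walrus condition filters out the None) instead of aborting the whole comprehension. A small sketch of the new behavior, assuming llm_helpers is importable from the current directory:

    from llm_helpers import extract_json

    response = (
        'Issues found: {"filename": "a.py", "line_start": 1} and '
        "{'filename': 'a.py', 'line_start': 2} and {not valid at all}"
    )
    print(extract_json(response))
    # -> [{'filename': 'a.py', 'line_start': 1}, {'filename': 'a.py', 'line_start': 2}]
    # The first match is strict JSON, the second parses only via
    # ast.literal_eval, and the third prints a "[SKIPPED] Malformed match"
    # notice and is dropped; previously it would have raised JSONDecodeError.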

markus_test_scripts/python_tester/python_tester_custom_prompt.py

Lines changed: 1 addition & 0 deletions
@@ -15,6 +15,7 @@ def test_with_feedback(request):
         submission_path='student_submission.py',
         scope="code",
         model="claude-3.7-sonnet",
+        model_options='max_tokens=1000',
     )
     request.node.add_marker(pytest.mark.markus_message(llm_feedback))
     request.node.add_marker(pytest.mark.markus_overall_comments(llm_feedback))

markus_test_scripts/python_tester/python_tester_llm_code.py

Lines changed: 4 additions & 3 deletions
@@ -1,7 +1,5 @@
 import os.path

-import pytest
-
 # Modify depending on name of student's submission file
 import student_submission as submission
 from llm_helpers import *
@@ -20,6 +18,7 @@ def test_with_feedback(request):
         submission_path='student_submission.py',
         scope="code",
         model="claude-3.7-sonnet",
+        model_options='max_tokens=1000',
     )
     request.node.add_marker(pytest.mark.markus_message(llm_feedback))
     request.node.add_marker(pytest.mark.markus_overall_comments(llm_feedback))
@@ -28,7 +27,8 @@ def test_with_feedback(request):
 def test_with_annotations(request):
     """Generates LLM Annotations"""
     # feed in previous LLM message to create annotations
-    prompt = f"Previous message: {llm_feedback}."
+    prompt = f"<previous_message> {llm_feedback} </previous_message>"
+    prompt = prompt.replace("{", "{{").replace("}", "}}")
     prompt += ANNOTATION_PROMPT

     # Run LLM feedback
@@ -38,6 +38,7 @@ def test_with_annotations(request):
         scope="code",
         model="claude-3.7-sonnet",
         json_schema="code_annotation_schema",
+        model_options='max_tokens=1000',
     )  # generate annotations

     annotations_json_list = extract_json(raw_annotation)
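
The added replace() call is the runtime counterpart of the doubled braces in ANNOTATION_PROMPT: llm_feedback is raw model output and can contain braces (code snippets, JSON), which a later str.format pass would misread as replacement fields. A sketch of the failure being avoided, with invented feedback text:

    llm_feedback = "Nice solution, but initialise the cache with cache = {} first."

    unsafe = f"<previous_message> {llm_feedback} </previous_message>"
    # unsafe.format() would raise IndexError: the stray {} looks like a field

    safe = unsafe.replace("{", "{{").replace("}", "}}")
    print(safe.format())
    # -> <previous_message> Nice solution, but initialise the cache with cache = {} first. </previous_message>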

markus_test_scripts/python_tester/python_tester_llm_image_analyze_annotations_claude.py

Lines changed: 1 addition & 0 deletions
@@ -12,6 +12,7 @@ def test_image_analyze(request):
         scope="image",
         model="claude-3.7-sonnet",
         prompt="image_analyze_annotations",
+        model_options='max_tokens=1000',
     )

     # Display LLM output in the test output

markus_test_scripts/python_tester/python_tester_llm_image_style_annotations_claude.py

Lines changed: 1 addition & 0 deletions
@@ -12,6 +12,7 @@ def test_with_markers(request):
         scope="image",
         model="claude-3.7-sonnet",
         prompt="image_style_annotations",
+        model_options='max_tokens=1000',
     )

     # Display LLM output in the test output
