
Commit 90a7f36

Updated MarkUs test scripts (#31)
1 parent 26d8c27 commit 90a7f36

5 files changed: 60 additions & 8 deletions

markus_test_scripts/python_tester/llm_helpers.py

Lines changed: 53 additions & 5 deletions
@@ -1,3 +1,4 @@
+import ast
 import json
 import os
 import re
@@ -14,11 +15,41 @@
 Each annotation should include: filename: The name of the student's file. content:
 A short description of the mistake. line_start and line_end: The line number(s) where the
 mistake occurs. Ensure the JSON is valid and properly formatted. Here is a sample format
-of the json array to return: {{ "annotations": [{{"filename": "student_code.py",
-"content": "Variable 'x' is unused.", "line_start": 5, "line_end": 5}}] }}.
+of the json array to return: {{
+    "annotations": [
+        {{
+            "filename": "submission.py",
+            "content": "The variable 'result' is assigned but never used.",
+            "line_start": 3,
+            "line_end": 3,
+            "column_start": 4,
+            "column_end": 16
+        }},
+        {{
+            "filename": "submission.py",
+            "content": "Missing parentheses in the print statement. Use print('Hello') instead.",
+            "line_start": 5,
+            "line_end": 5,
+            "column_start": 0,
+            "column_end": 20
+        }},
+        {{
+            "filename": "submission.py",
+            "content": "The function 'calculate_sum' is called but not defined.",
+            "line_start": 10,
+            "line_end": 10,
+            "column_start": 0,
+            "column_end": 15
+        }}
+    ]
+}}
+
 ONLY return the json object and nothing else. Make sure the line #s don't exceed
 the number of lines in the file. You can use markdown syntax in the annotation's content,
-especially when denoting code."""
+especially when denoting code.
+
+Use only double quotes in your response
+"""


 def add_annotation_columns(annotations: List[Dict[str, Any]], submission: Any) -> List[Dict[str, Any]]:
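
Note that every literal brace in the sample is doubled ({{ and }}). That is the str.format escape convention, suggesting ANNOTATION_PROMPT is later substituted into a format-style template; the brace escaping added to python_tester_llm_code.py below fits the same picture. A minimal sketch of the mechanics, with an invented template and placeholder (not from this repo):

    template = 'Return JSON like {{"annotations": []}} for {filename}.'
    print(template.format(filename="submission.py"))
    # -> Return JSON like {"annotations": []} for submission.py.
    # Without the doubled braces, format() would parse {"annotations": ...}
    # as a replacement field and raise KeyError.
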
@@ -92,13 +123,14 @@ def run_llm(
     submission_path: str,
     model: str,
     scope: str,
-    submission_type: Optional[str],
+    submission_type: Optional[str] = None,
     submission_image: Optional[str] = None,
     question: Optional[str] = None,
     prompt_text: Optional[str] = None,
     prompt: Optional[str] = None,
     json_schema: Optional[str] = None,
     output: Optional[str] = None,
+    model_options: Optional[str] = None,
 ) -> str:
     """
     Executes the LLM feedback generator script and captures its output.
@@ -114,6 +146,9 @@ def run_llm(
         prompt: Name of predefined prompt file to use.
         json_schema: Optional JSON schema to format the response.
         output: filepath of output file.
+        model_options: model options to pass to the llm
+        submission_image: An optional file path to the image to give feedback to.
+        json_schema: Optional JSON schema to format the response.

     Returns:
         The output from the LLM feedback generator as a string, or an error message.
@@ -142,6 +177,8 @@ def run_llm(
         llm_command += ["--output", output]
     if submission_image:
         llm_command += ["--submission_image", submission_image]
+    if model_options:
+        llm_command += ["--model_options", model_options]
     if submission_type:
         llm_command += ["--submission_type", submission_type]
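
The flag is forwarded verbatim: run_llm appends --model_options and the raw option string to the subprocess argument list and leaves interpretation to the feedback generator script. A quick sketch of the fragment this produces (the base command below is a stand-in; the real one is assembled earlier in run_llm and is not part of this hunk):

    llm_command = ["llm_feedback_generator"]  # stand-in, not the real base command
    model_options = "max_tokens=1000"
    if model_options:
        llm_command += ["--model_options", model_options]
    print(llm_command)
    # -> ['llm_feedback_generator', '--model_options', 'max_tokens=1000']
    # How several options would be packed into one string (commas?) is the
    # generator's convention; this diff does not specify it.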

@@ -155,6 +192,17 @@
     return llm_result.stdout.strip()


+def safe_eval_dict(match: str):
+    try:
+        return json.loads(match)  # Try strict JSON first
+    except json.JSONDecodeError:
+        try:
+            return ast.literal_eval(match)  # Fall back to Python-style dict
+        except (SyntaxError, ValueError) as e:
+            print(f"[SKIPPED] Malformed match: {match[:80]}... Reason: {e}")
+            return None
+
+
 def extract_json(response: str) -> List[Dict[str, Any]]:
     """
     Extracts JSON objects embedded in a string.
@@ -166,7 +214,7 @@ def extract_json(response: str) -> List[Dict[str, Any]]:
         A list of parsed JSON dictionaries extracted from the input string.
     """
     matches = re.findall(r"(\{(?:[^{}]|(?:\{(?:[^{}]|(?:\{[^{}]*\}))*\}))*\})", response)
-    return [json.loads(match) for match in matches]
+    return [parsed for match in matches if (parsed := safe_eval_dict(match))]


 MINIMUM_ANNOTATION_WIDTH = 8
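
Together these two changes make extraction tolerant of near-JSON output: each brace-balanced regex match is parsed as strict JSON first, then as a Python literal, and anything unparseable is logged and dropped (the walrus condition filters out the None) instead of aborting the whole comprehension. A small sketch of the new behavior, assuming llm_helpers is importable from the current directory:

    from llm_helpers import extract_json

    response = (
        'Issues found: {"filename": "a.py", "line_start": 1} and '
        "{'filename': 'a.py', 'line_start': 2} and {not valid at all}"
    )
    print(extract_json(response))
    # -> [{'filename': 'a.py', 'line_start': 1}, {'filename': 'a.py', 'line_start': 2}]
    # The first match is strict JSON, the second parses only via
    # ast.literal_eval, and the third prints a "[SKIPPED] Malformed match"
    # notice and is dropped; previously it would have raised JSONDecodeError.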

markus_test_scripts/python_tester/python_tester_custom_prompt.py

Lines changed: 1 addition & 0 deletions
@@ -15,6 +15,7 @@ def test_with_feedback(request):
         submission_path='student_submission.py',
         scope="code",
         model="claude-3.7-sonnet",
+        model_options='max_tokens=1000',
     )
     request.node.add_marker(pytest.mark.markus_message(llm_feedback))
     request.node.add_marker(pytest.mark.markus_overall_comments(llm_feedback))

markus_test_scripts/python_tester/python_tester_llm_code.py

Lines changed: 4 additions & 3 deletions
@@ -1,7 +1,5 @@
 import os.path

-import pytest
-
 # Modify depending on name of student's submission file
 import student_submission as submission
 from llm_helpers import *
@@ -20,6 +18,7 @@ def test_with_feedback(request):
         submission_path='student_submission.py',
         scope="code",
         model="claude-3.7-sonnet",
+        model_options='max_tokens=1000',
     )
     request.node.add_marker(pytest.mark.markus_message(llm_feedback))
     request.node.add_marker(pytest.mark.markus_overall_comments(llm_feedback))
@@ -28,7 +27,8 @@ def test_with_feedback(request):
 def test_with_annotations(request):
     """Generates LLM Annotations"""
     # feed in previous LLM message to create annotations
-    prompt = f"Previous message: {llm_feedback}."
+    prompt = f"<previous_message> {llm_feedback} </previous_message>"
+    prompt = prompt.replace("{", "{{").replace("}", "}}")
     prompt += ANNOTATION_PROMPT

     # Run LLM feedback
@@ -38,6 +38,7 @@ def test_with_annotations(request):
         scope="code",
         model="claude-3.7-sonnet",
         json_schema="code_annotation_schema",
+        model_options='max_tokens=1000',
     )  # generate annotations

     annotations_json_list = extract_json(raw_annotation)
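
The added replace() call is the runtime counterpart of the doubled braces in ANNOTATION_PROMPT: llm_feedback is raw model output and can contain braces (code snippets, JSON), which a later str.format pass would misread as replacement fields. A sketch of the failure being avoided, with invented feedback text:

    llm_feedback = "Nice solution, but initialise the cache with cache = {} first."

    unsafe = f"<previous_message> {llm_feedback} </previous_message>"
    # unsafe.format() would raise IndexError: the stray {} looks like a field

    safe = unsafe.replace("{", "{{").replace("}", "}}")
    print(safe.format())
    # -> <previous_message> Nice solution, but initialise the cache with cache = {} first. </previous_message>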

markus_test_scripts/python_tester/python_tester_llm_image_analyze_annotations_claude.py

Lines changed: 1 addition & 0 deletions
@@ -12,6 +12,7 @@ def test_image_analyze(request):
         scope="image",
         model="claude-3.7-sonnet",
         prompt="image_analyze_annotations",
+        model_options='max_tokens=1000',
     )

     # Display LLM output in the test output

markus_test_scripts/python_tester/python_tester_llm_image_style_annotations_claude.py

Lines changed: 1 addition & 0 deletions
@@ -12,6 +12,7 @@ def test_with_markers(request):
         scope="image",
         model="claude-3.7-sonnet",
         prompt="image_style_annotations",
+        model_options='max_tokens=1000',
     )

     # Display LLM output in the test output
