Skip to content

Commit 3cf7e6c

Browse files
Rolland-He, Will Kukkamalla
authored and committed
Wrap file contents in XML tags for prompts (MarkUsProject#21)
1 parent 3ad0306 commit 3cf7e6c

File tree

2 files changed

+155
-47
lines changed

2 files changed

+155
-47
lines changed

ai_feedback/helpers/template_utils.py

Lines changed: 85 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def render_prompt_template(
4242
if question_num is not None:
4343
template_data['file_contents'] = _get_question_contents([submission, solution], question_num)
4444
else:
45-
template_data['file_contents'] = gather_file_contents([submission, solution, test_output])
45+
template_data['file_contents'] = gather_xml_file_contents(submission, solution, test_output)
4646

4747
# Handle image placeholders with context-aware replacement
4848
if '{submission_image}' in prompt_content and 'submission_image' not in template_data:
@@ -64,69 +64,111 @@ def render_prompt_template(
6464
return prompt_content.format(**template_data)
6565

6666

67-
def gather_file_references(submission: Path, solution: Optional[Path], test_output: Optional[Path]) -> str:
67+
def gather_file_references(
68+
submission: Optional[Path] = None, solution: Optional[Path] = None, test_output: Optional[Path] = None
69+
) -> str:
6870
"""Generate file reference descriptions for prompt templates.
6971
7072
Args:
71-
submission (Path): Student's submission file path
73+
submission (Path, optional): Student's submission file path
7274
solution (Path, optional): Instructor's solution file path
7375
test_output (Path, optional): Student's test output file path
7476
7577
Returns:
7678
str: Descriptions like "The instructor's solution file..."
7779
"""
7880
references: List[str] = []
79-
references.append(f"The student's submission file is {submission.name}.")
81+
if submission:
82+
references.append(f"The student's submission file is {submission.name}.")
8083
if solution:
8184
references.append(f"The instructor's solution file is {solution.name}.")
8285
if test_output:
8386
references.append(f"The student's test output file is {test_output.name}.")
8487
return "\n".join(references)
8588

8689

87-
def gather_file_contents(assignment_files: List[Optional[Path]]) -> str:
88-
"""Generate file contents with line numbers for prompt templates.
90+
def gather_xml_file_contents(
91+
submission: Optional[Path] = None, solution: Optional[Path] = None, test_output: Optional[Path] = None
92+
) -> str:
93+
"""Generate file contents with XML tags for prompt templates.
8994
9095
Args:
91-
assignment_files (list[str]): List of file paths to process
96+
submission (Path, optional): Student's submission file path
97+
solution (Path, optional): Instructor's solution file path
98+
test_output (Path, optional): Student's test output file path
9299
93100
Returns:
94-
str: File contents formatted with line numbers
101+
str: File contents formatted with XML tags and line numbers
95102
"""
96103
file_contents = ""
97104

98-
for file_path in assignment_files:
99-
if not file_path:
100-
continue
101-
filename = os.path.basename(file_path)
102-
103-
try:
104-
# Handle PDF files separately
105-
if filename.lower().endswith('.pdf'):
106-
text_content = extract_pdf_text(file_path)
107-
lines = text_content.split('\n')
108-
else:
109-
# Handle regular text files
110-
with open(file_path, "r", encoding="utf-8") as file:
111-
lines = file.readlines()
112-
113-
# Common processing for both file types
114-
file_contents += f"=== {filename} ===\n"
115-
for i, line in enumerate(lines, start=1):
116-
stripped_line = line.rstrip('\n').rstrip()
117-
if stripped_line.strip():
118-
file_contents += f"(Line {i}) {stripped_line}\n"
119-
else:
120-
file_contents += f"(Line {i}) \n"
121-
file_contents += "\n"
122-
123-
except Exception as e:
124-
print(f"Error reading file {filename}: {e}")
125-
continue
105+
if submission:
106+
file_contents += _format_file_with_xml_tag(submission, "submission")
107+
108+
if solution:
109+
file_contents += _format_file_with_xml_tag(solution, "solution")
110+
111+
if test_output:
112+
file_contents += _format_file_with_xml_tag(test_output, "test_output")
126113

127114
return file_contents
128115

129116

117+
def _format_file_with_xml_tag(file_path: Path, tag_name: str) -> str:
118+
"""Format a single file with XML tags and line numbers.
119+
120+
Args:
121+
file_path (Path): Path to the file to format
122+
tag_name (str): The XML tag name (submission, solution, test_output)
123+
124+
Returns:
125+
str: Formatted file content with XML tags
126+
"""
127+
if not file_path:
128+
return ""
129+
130+
filename = os.path.basename(file_path)
131+
132+
try:
133+
# Handle PDF files separately
134+
if filename.lower().endswith('.pdf'):
135+
text_content = extract_pdf_text(file_path)
136+
return f"<{tag_name} filename=\"{filename}\">\n{text_content}\n</{tag_name}>\n\n"
137+
else:
138+
# Handle regular text files
139+
with open(file_path, "r", encoding="utf-8") as file:
140+
lines = file.readlines()
141+
return _wrap_lines_with_xml(lines, tag_name, filename)
142+
143+
except Exception as e:
144+
print(f"Error reading file {filename}: {e}")
145+
return ""
146+
147+
148+
def _wrap_lines_with_xml(lines: List[str], tag_name: str, filename: str) -> str:
149+
"""Wrap lines with XML tags and add line numbers.
150+
151+
Args:
152+
lines (List[str]): List of lines to format
153+
tag_name (str): The XML tag name (submission, solution, test_output)
154+
filename (str): The filename to include in the XML tag
155+
156+
Returns:
157+
str: Formatted content with XML tags and line numbers
158+
"""
159+
content = f"<{tag_name} filename=\"{filename}\">\n"
160+
161+
for i, line in enumerate(lines, start=1):
162+
stripped_line = line.rstrip("\n")
163+
if stripped_line.strip():
164+
content += f"(Line {i}) {stripped_line}\n"
165+
else:
166+
content += f"(Line {i}) {line}"
167+
168+
content += f"</{tag_name}>\n\n"
169+
return content
170+
171+
130172
def extract_pdf_text(pdf_path: str) -> str:
131173
"""Extract text content from a PDF file.
132174
@@ -233,6 +275,7 @@ def _get_question_contents(assignment_files: List[Optional[Path]], question_num:
233275
234276
Args:
235277
assignment_files (List[Optional[Path]]): List of Path or None objects to parse.
278+
Expected order: [submission, solution]
236279
question_num (int): The target task number to extract.
237280
238281
Returns:
@@ -244,7 +287,9 @@ def _get_question_contents(assignment_files: List[Optional[Path]], question_num:
244287
file_contents = ""
245288
task_found = False
246289

247-
for file_path in assignment_files:
290+
semantic_tags = ["submission", "solution"]
291+
292+
for index, file_path in enumerate(assignment_files):
248293
if (
249294
not file_path
250295
or file_path.suffix != '.txt'
@@ -266,9 +311,11 @@ def _get_question_contents(assignment_files: List[Optional[Path]], question_num:
266311
task_content = task_match.group(1).strip()
267312
task_found = True
268313

269-
file_contents += f"\n\n---\n### {file_path}\n\n"
314+
tag_name = semantic_tags[index] if index < len(semantic_tags) else "file"
315+
file_contents += f"<{tag_name} filename=\"{file_path.name}\">\n"
270316
file_contents += intro_content + "\n\n" if intro_content else ""
271317
file_contents += task_content + "\n\n"
318+
file_contents += f"</{tag_name}>\n\n"
272319

273320
if not task_found:
274321
print(f"Task {question_num} not found in any assignment file.")

tests/open_ai_model_tests/integration_test.py

Lines changed: 70 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,7 @@
44

55

66
def test_cnn_example_openai_stdout(capsys, mock_and_capture):
7-
"""
8-
Example 1:
7+
"""Example 1:
98
Evaluate cnn_example test using openAI model and print to stdout.
109
python -m ai_feedback --prompt code_lines --scope code \
1110
--submission test_submissions/cnn_example/cnn_submission \
@@ -30,13 +29,12 @@ def test_cnn_example_openai_stdout(capsys, mock_and_capture):
3029

3130
assert "Compare the student's code and solution code. For each mistake" in output
3231
assert "(Line 1) import numpy as np" in output
33-
assert "=== cnn_submission.py ===" in output
34-
assert "=== cnn_solution.py ===" in output
32+
assert '<submission filename="cnn_submission.py">' in output
33+
assert '<solution filename="cnn_solution.py">' in output
3534

3635

3736
def test_cnn_example_custom_prompt_stdout(capsys, mock_and_capture):
38-
"""
39-
Example 2:
37+
"""Example 2:
4038
Evaluate cnn_example test using openAI model and a custom prompt text, printing to stdout.
4139
python -m ai_feedback --prompt_text "Evaluate the student's code readability." \
4240
--scope code \
@@ -58,13 +56,12 @@ def test_cnn_example_custom_prompt_stdout(capsys, mock_and_capture):
5856
]
5957
output = run_cli_and_capture(args, capsys)
6058
assert "Evaluate the student's code readability." in output
61-
assert "=== cnn_submission.py ===" in output
59+
assert '<submission filename="cnn_submission.py">' in output
6260
assert "(Line 1) import numpy as np" in output
6361

6462

6563
def test_pdf_example_openai_direct(capsys, mock_and_capture):
66-
"""
67-
Example 3:
64+
"""Example 3:
6865
Evaluate pdf_example test using openAI model and direct output mode.
6966
python -m ai_feedback --prompt text_pdf_analyze --scope text \
7067
--submission test_submissions/pdf_example/student_pdf_submission.pdf \
@@ -86,3 +83,67 @@ def test_pdf_example_openai_direct(capsys, mock_and_capture):
8683
assert "Does the student correctly respond to the question, and meet all the" in output
8784
assert "student_pdf_submission.pdf" in output
8885
assert "Normalization allows each feature to have an equal influence on the mode" in output
86+
87+
88+
def test_xml_formatting_code_scope(capsys, mock_and_capture):
    """Check that code-scope prompts wrap file contents in XML tags.

    The file references must still be rendered as plain sentences, while the
    submission and solution contents must appear inside ``<submission>`` and
    ``<solution>`` elements with numbered lines.
    """
    repo_root = Path(__file__).parent.parent.parent
    submission_path = repo_root / "test_submissions/csc108/correct_submission/correct_submission.py"
    solution_path = repo_root / "test_submissions/csc108/solution.py"

    cli_args = [
        "--prompt_text",
        "File references: {file_references}\n\nFile contents:\n{file_contents}",
        "--scope",
        "code",
        "--submission",
        str(submission_path),
        "--solution",
        str(solution_path),
        "--model",
        "openai",
    ]
    output = run_cli_and_capture(cli_args, capsys)

    expected_fragments = (
        # Plain-text file references
        "The student's submission file is correct_submission.py.",
        "The instructor's solution file is solution.py.",
        # XML wrappers around the file contents
        '<submission filename="correct_submission.py">',
        '</submission>',
        '<solution filename="solution.py">',
        '</solution>',
        # Line numbering inside the wrapped contents
        "(Line 1) def fizzbuzz(n: int) -> list:",
    )
    for fragment in expected_fragments:
        assert fragment in output
118+
119+
120+
def test_xml_formatting_text_scope_with_test_output(capsys, mock_and_capture):
    """Check XML formatting of file contents in text scope.

    Verifies that the submission and solution are referenced in plain text
    and that their contents are wrapped in ``<submission>`` / ``<solution>``
    tags.

    NOTE(review): despite the test name, no ``--test_output`` argument is
    passed and no ``<test_output>`` tag is asserted here. Confirm whether a
    test-output fixture should be added.
    """
    repo_root = Path(__file__).parent.parent.parent
    submission_path = repo_root / "test_submissions/ggr274_homework5/test1/student_submission.txt"
    solution_path = repo_root / "test_submissions/ggr274_homework5/test1/Homework_5_solution.txt"

    cli_args = [
        "--prompt_text",
        "File references: {file_references}\n\nFile contents:\n{file_contents}",
        "--submission_type",
        "python",
        "--scope",
        "text",
        "--submission",
        str(submission_path),
        "--solution",
        str(solution_path),
        "--model",
        "openai",
    ]
    output = run_cli_and_capture(cli_args, capsys)

    expected_fragments = (
        # Plain-text file references
        "The student's submission file is student_submission.txt.",
        "The instructor's solution file is Homework_5_solution.txt.",
        # XML wrappers around the file contents
        '<submission filename="student_submission.txt">',
        '</submission>',
        '<solution filename="Homework_5_solution.txt">',
        '</solution>',
    )
    for fragment in expected_fragments:
        assert fragment in output

0 commit comments

Comments
 (0)