Merged (changes from 11 commits)
130 changes: 93 additions & 37 deletions ai_feedback/helpers/template_utils.py
@@ -42,7 +42,7 @@ def render_prompt_template(
if question_num is not None:
template_data['file_contents'] = _get_question_contents([submission, solution], question_num)
else:
template_data['file_contents'] = gather_file_contents([submission, solution, test_output])
template_data['file_contents'] = gather_xml_file_contents(submission, solution, test_output)

# Handle image placeholders with context-aware replacement
if '{submission_image}' in prompt_content and 'submission_image' not in template_data:
@@ -64,69 +64,120 @@
return prompt_content.format(**template_data)


def gather_file_references(submission: Path, solution: Optional[Path], test_output: Optional[Path]) -> str:
def gather_file_references(
submission: Optional[Path] = None, solution: Optional[Path] = None, test_output: Optional[Path] = None
) -> str:
"""Generate file reference descriptions for prompt templates.

Args:
submission (Path): Student's submission file path
submission (Path, optional): Student's submission file path
solution (Path, optional): Instructor's solution file path
test_output (Path, optional): Student's test output file path

Returns:
str: Descriptions like "The instructor's solution file..."
"""
references: List[str] = []
references.append(f"The student's submission file is {submission.name}.")
if submission:
references.append(f"The student's submission file is {submission.name}.")
if solution:
references.append(f"The instructor's solution file is {solution.name}.")
if test_output:
references.append(f"The student's test output file is {test_output.name}.")
return "\n".join(references)


def gather_file_contents(assignment_files: List[Optional[Path]]) -> str:
"""Generate file contents with line numbers for prompt templates.
def gather_xml_file_contents(
submission: Optional[Path] = None, solution: Optional[Path] = None, test_output: Optional[Path] = None
) -> str:
"""Generate file contents with XML tags for prompt templates.

Args:
assignment_files (list[str]): List of file paths to process
submission (Path, optional): Student's submission file path
solution (Path, optional): Instructor's solution file path
test_output (Path, optional): Student's test output file path

Returns:
str: File contents formatted with line numbers
str: File contents formatted with XML tags and line numbers
"""
file_contents = ""

for file_path in assignment_files:
if not file_path:
continue
filename = os.path.basename(file_path)
if submission:
file_contents += _format_file_with_xml_tag(submission, "submission")

try:
# Handle PDF files separately
if filename.lower().endswith('.pdf'):
text_content = extract_pdf_text(file_path)
lines = text_content.split('\n')
else:
# Handle regular text files
with open(file_path, "r", encoding="utf-8") as file:
lines = file.readlines()

# Common processing for both file types
file_contents += f"=== {filename} ===\n"
for i, line in enumerate(lines, start=1):
stripped_line = line.rstrip('\n').rstrip()
if stripped_line.strip():
file_contents += f"(Line {i}) {stripped_line}\n"
else:
file_contents += f"(Line {i}) \n"
file_contents += "\n"

except Exception as e:
print(f"Error reading file {filename}: {e}")
continue
if solution:
file_contents += _format_file_with_xml_tag(solution, "solution")

if test_output:
file_contents += _format_file_with_xml_tag(test_output, "test_output")

return file_contents


def _format_file_with_xml_tag(file_path: Path, tag_name: str) -> str:
"""Format a single file with XML tags and line numbers.

Args:
file_path (Path): Path to the file to format
tag_name (str): The XML tag name (submission, solution, test_output)

Returns:
str: Formatted file content with XML tags
"""
if not file_path:
return ""

filename = os.path.basename(file_path)

try:
# Handle PDF files separately
if filename.lower().endswith('.pdf'):
text_content = extract_pdf_text(file_path)
lines = text_content.split('\n')
return _wrap_lines_with_xml(lines, tag_name, filename, is_pdf=True)
else:
# Handle regular text files
with open(file_path, "r", encoding="utf-8") as file:
lines = file.readlines()
return _wrap_lines_with_xml(lines, tag_name, filename, is_pdf=False)

except Exception as e:
print(f"Error reading file {filename}: {e}")
return ""


def _wrap_lines_with_xml(lines: List[str], tag_name: str, filename: str, is_pdf: bool = False) -> str:
"""Wrap lines with XML tags and add line numbers.

Args:
lines (List[str]): List of lines to format
tag_name (str): The XML tag name (submission, solution, test_output)
filename (str): The filename to include in the XML tag
is_pdf (bool): Whether this is PDF content (affects empty line handling)

Returns:
str: Formatted content with XML tags and line numbers
"""
content = f"<{tag_name} file=\"{filename}\">\n"
[Review thread]
Contributor: Use the attribute name filename instead of file, here and throughout all XML tags that have to do with files.

Collaborator (author): Fixed.


for i, line in enumerate(lines, start=1):
if is_pdf:
[Review thread]
Contributor: Okay, looking at this more carefully, I'm realizing that for text extracted from PDFs we don't actually need the line numbering (which was really there to support precise annotations for code). I'm not sure we need to call this function at all when the given text is from a PDF; instead we can just pass the raw text directly as the file contents to the LLM.

Collaborator (author): Fixed!

stripped_line = line.rstrip()
if stripped_line.strip():
content += f"(Line {i}) {stripped_line}\n"
else:
content += f"(Line {i}) \n"
else:
stripped_line = line.rstrip("\n")
if stripped_line.strip():
content += f"(Line {i}) {stripped_line}\n"
else:
content += f"(Line {i}) {line}"

content += f"</{tag_name}>\n\n"
return content
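
Following the review thread above, a minimal sketch of the suggested raw pass-through for PDF text (an assumed shape for illustration only; the post-review implementation is not shown in this eleven-commit view):

def format_pdf_with_xml_tag_sketch(tag_name: str, filename: str, raw_text: str) -> str:
    # Hypothetical helper: PDF text is prose rather than code, so per-line
    # numbering is skipped and the extracted text passes through unchanged.
    return f'<{tag_name} file="{filename}">\n{raw_text}\n</{tag_name}>\n\n'

# e.g. format_pdf_with_xml_tag_sketch("submission", "report.pdf", extract_pdf_text("report.pdf"))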


def extract_pdf_text(pdf_path: str) -> str:
"""Extract text content from a PDF file.

@@ -233,6 +284,7 @@ def _get_question_contents(assignment_files: List[Optional[Path]], question_num:

Args:
assignment_files (List[Optional[Path]]): List of Path or None objects to parse.
Expected order: [submission, solution]
question_num (int): The target task number to extract.

Returns:
@@ -244,7 +296,9 @@ def _get_question_contents(assignment_files: List[Optional[Path]], question_num:
file_contents = ""
task_found = False

for file_path in assignment_files:
semantic_tags = ["submission", "solution"]

for index, file_path in enumerate(assignment_files):
if (
not file_path
or file_path.suffix != '.txt'
@@ -266,9 +320,11 @@ def _get_question_contents(assignment_files: List[Optional[Path]], question_num:
task_content = task_match.group(1).strip()
task_found = True

file_contents += f"\n\n---\n### {file_path}\n\n"
tag_name = semantic_tags[index] if index < len(semantic_tags) else "file"
file_contents += f"<{tag_name} file=\"{file_path.name}\">\n"
file_contents += intro_content + "\n\n" if intro_content else ""
file_contents += task_content + "\n\n"
file_contents += f"</{tag_name}>\n\n"

if not task_found:
print(f"Task {question_num} not found in any assignment file.")
79 changes: 70 additions & 9 deletions tests/open_ai_model_tests/integration_test.py
@@ -4,8 +4,7 @@


def test_cnn_example_openai_stdout(capsys, mock_and_capture):
"""
Example 1:
"""Example 1:
Evaluate cnn_example test using openAI model and print to stdout.
python -m ai_feedback --prompt code_lines --scope code \
--submission test_submissions/cnn_example/cnn_submission \
@@ -30,13 +29,12 @@ def test_cnn_example_openai_stdout(capsys, mock_and_capture):

assert "Compare the student's code and solution code. For each mistake" in output
assert "(Line 1) import numpy as np" in output
assert "=== cnn_submission.py ===" in output
assert "=== cnn_solution.py ===" in output
assert '<submission file="cnn_submission.py">' in output
assert '<solution file="cnn_solution.py">' in output


def test_cnn_example_custom_prompt_stdout(capsys, mock_and_capture):
"""
Example 2:
"""Example 2:
Evaluate cnn_example test using openAI model and a custom prompt text, printing to stdout.
python -m ai_feedback --prompt_text "Evaluate the student's code readability." \
--scope code \
@@ -58,13 +56,12 @@ def test_cnn_example_custom_prompt_stdout(capsys, mock_and_capture):
]
output = run_cli_and_capture(args, capsys)
assert "Evaluate the student's code readability." in output
assert "=== cnn_submission.py ===" in output
assert '<submission file="cnn_submission.py">' in output
assert "(Line 1) import numpy as np" in output


def test_pdf_example_openai_direct(capsys, mock_and_capture):
"""
Example 3:
"""Example 3:
Evaluate pdf_example test using openAI model and direct output mode.
python -m ai_feedback --prompt text_pdf_analyze --scope text \
--submission test_submissions/pdf_example/student_pdf_submission.pdf \
@@ -86,3 +83,67 @@ def test_pdf_example_openai_direct(capsys, mock_and_capture):
assert "Does the student correctly respond to the question, and meet all the" in output
assert "student_pdf_submission.pdf" in output
assert "Normalization allows each feature to have an equal influence on the mode" in output


def test_xml_formatting_code_scope(capsys, mock_and_capture):
"""
Test XML formatting for file contents in code scope.
Verifies that file contents use XML tags while file references remain plain text.
"""
parent = Path(__file__).parent.parent.parent

args = [
"--prompt_text",
"File references: {file_references}\n\nFile contents:\n{file_contents}",
"--scope",
"code",
"--submission",
str(parent / "test_submissions/csc108/correct_submission/correct_submission.py"),
"--solution",
str(parent / "test_submissions/csc108/solution.py"),
"--model",
"openai",
]
output = run_cli_and_capture(args, capsys)

assert "The student's submission file is correct_submission.py." in output
assert "The instructor's solution file is solution.py." in output

assert '<submission file="correct_submission.py">' in output
assert '</submission>' in output
assert '<solution file="solution.py">' in output
assert '</solution>' in output

assert "(Line 1) def fizzbuzz(n: int) -> list:" in output


def test_xml_formatting_text_scope(capsys, mock_and_capture):
    """
    Test XML formatting for file contents in text scope.
    Verifies that submission and solution files both use XML formatting.
    """
parent = Path(__file__).parent.parent.parent

args = [
"--prompt_text",
"File references: {file_references}\n\nFile contents:\n{file_contents}",
"--submission_type",
"python",
"--scope",
"text",
"--submission",
str(parent / "test_submissions/ggr274_homework5/test1/student_submission.txt"),
"--solution",
str(parent / "test_submissions/ggr274_homework5/test1/Homework_5_solution.txt"),
"--model",
"openai",
]
output = run_cli_and_capture(args, capsys)

assert "The student's submission file is student_submission.txt." in output
assert "The instructor's solution file is Homework_5_solution.txt." in output

assert '<submission file="student_submission.txt">' in output
assert '</submission>' in output
assert '<solution file="Homework_5_solution.txt">' in output
assert '</solution>' in output