Wrap file contents in XML tags for prompts (MarkUsProject#21)

Rolland-He · Will Kukkamalla · commit 3cf7e6c589ee · 2025-08-29T10:33:20.000-04:00
diff --git a/ai_feedback/helpers/template_utils.py b/ai_feedback/helpers/template_utils.py
@@ -42,7 +42,7 @@ def render_prompt_template(
     if question_num is not None:
         template_data['file_contents'] = _get_question_contents([submission, solution], question_num)
     else:
-        template_data['file_contents'] = gather_file_contents([submission, solution, test_output])
+        template_data['file_contents'] = gather_xml_file_contents(submission, solution, test_output)
 
     # Handle image placeholders with context-aware replacement
     if '{submission_image}' in prompt_content and 'submission_image' not in template_data:
@@ -64,69 +64,111 @@ def render_prompt_template(
     return prompt_content.format(**template_data)
 
 
-def gather_file_references(submission: Path, solution: Optional[Path], test_output: Optional[Path]) -> str:
+def gather_file_references(
+    submission: Optional[Path] = None, solution: Optional[Path] = None, test_output: Optional[Path] = None
+) -> str:
     """Generate file reference descriptions for prompt templates.
 
     Args:
-        submission (Path): Student's submission file path
+        submission (Path, optional): Student's submission file path
         solution (Path, optional): Instructor's solution file path
         test_output (Path, optional): Student's test output file path
 
     Returns:
         str: Descriptions like "The instructor's solution file..."
     """
     references: List[str] = []
-    references.append(f"The student's submission file is {submission.name}.")
+    if submission:
+        references.append(f"The student's submission file is {submission.name}.")
     if solution:
         references.append(f"The instructor's solution file is {solution.name}.")
     if test_output:
         references.append(f"The student's test output file is {test_output.name}.")
     return "\n".join(references)
 
 
-def gather_file_contents(assignment_files: List[Optional[Path]]) -> str:
-    """Generate file contents with line numbers for prompt templates.
+def gather_xml_file_contents(
+    submission: Optional[Path] = None, solution: Optional[Path] = None, test_output: Optional[Path] = None
+) -> str:
+    """Generate file contents with XML tags for prompt templates.
 
     Args:
-        assignment_files (list[str]): List of file paths to process
+        submission (Path, optional): Student's submission file path
+        solution (Path, optional): Instructor's solution file path
+        test_output (Path, optional): Student's test output file path
 
     Returns:
-        str: File contents formatted with line numbers
+        str: File contents formatted with XML tags and line numbers
     """
     file_contents = ""
 
-    for file_path in assignment_files:
-        if not file_path:
-            continue
-        filename = os.path.basename(file_path)
-
-        try:
-            # Handle PDF files separately
-            if filename.lower().endswith('.pdf'):
-                text_content = extract_pdf_text(file_path)
-                lines = text_content.split('\n')
-            else:
-                # Handle regular text files
-                with open(file_path, "r", encoding="utf-8") as file:
-                    lines = file.readlines()
-
-            # Common processing for both file types
-            file_contents += f"=== {filename} ===\n"
-            for i, line in enumerate(lines, start=1):
-                stripped_line = line.rstrip('\n').rstrip()
-                if stripped_line.strip():
-                    file_contents += f"(Line {i}) {stripped_line}\n"
-                else:
-                    file_contents += f"(Line {i}) \n"
-            file_contents += "\n"
-
-        except Exception as e:
-            print(f"Error reading file {filename}: {e}")
-            continue
+    if submission:
+        file_contents += _format_file_with_xml_tag(submission, "submission")
+
+    if solution:
+        file_contents += _format_file_with_xml_tag(solution, "solution")
+
+    if test_output:
+        file_contents += _format_file_with_xml_tag(test_output, "test_output")
 
     return file_contents
 
 
+def _format_file_with_xml_tag(file_path: Path, tag_name: str) -> str:
+    """Format a single file with XML tags and line numbers.
+
+    Args:
+        file_path (Path): Path to the file to format
+        tag_name (str): The XML tag name (submission, solution, test_output)
+
+    Returns:
+        str: Formatted file content with XML tags
+    """
+    if not file_path:
+        return ""
+
+    filename = os.path.basename(file_path)
+
+    try:
+        # Handle PDF files separately
+        if filename.lower().endswith('.pdf'):
+            text_content = extract_pdf_text(file_path)
+            return f"<{tag_name} filename=\"{filename}\">\n{text_content}\n</{tag_name}>\n\n"
+        else:
+            # Handle regular text files
+            with open(file_path, "r", encoding="utf-8") as file:
+                lines = file.readlines()
+            return _wrap_lines_with_xml(lines, tag_name, filename)
+
+    except Exception as e:
+        print(f"Error reading file {filename}: {e}")
+        return ""
+
+
+def _wrap_lines_with_xml(lines: List[str], tag_name: str, filename: str) -> str:
+    """Wrap lines with XML tags and add line numbers.
+
+    Args:
+        lines (List[str]): List of lines to format
+        tag_name (str): The XML tag name (submission, solution, test_output)
+        filename (str): The filename to include in the XML tag
+
+    Returns:
+        str: Formatted content with XML tags and line numbers
+    """
+    content = f"<{tag_name} filename=\"{filename}\">\n"
+
+    for i, line in enumerate(lines, start=1):
+        stripped_line = line.rstrip("\n")
+        if stripped_line.strip():
+            content += f"(Line {i}) {stripped_line}\n"
+        else:
+            content += f"(Line {i}) {line}"
+
+    content += f"</{tag_name}>\n\n"
+    return content
+
+
 def extract_pdf_text(pdf_path: str) -> str:
     """Extract text content from a PDF file.
 
@@ -233,6 +275,7 @@ def _get_question_contents(assignment_files: List[Optional[Path]], question_num:
 
     Args:
         assignment_files (List[Optional[Path]]): List of Path or None objects to parse.
+            Expected order: [submission, solution]
         question_num (int): The target task number to extract.
 
     Returns:
@@ -244,7 +287,9 @@ def _get_question_contents(assignment_files: List[Optional[Path]], question_num:
     file_contents = ""
     task_found = False
 
-    for file_path in assignment_files:
+    semantic_tags = ["submission", "solution"]
+
+    for index, file_path in enumerate(assignment_files):
         if (
             not file_path
             or file_path.suffix != '.txt'
@@ -266,9 +311,11 @@ def _get_question_contents(assignment_files: List[Optional[Path]], question_num:
             task_content = task_match.group(1).strip()
             task_found = True
 
-        file_contents += f"\n\n---\n### {file_path}\n\n"
+        tag_name = semantic_tags[index] if index < len(semantic_tags) else "file"
+        file_contents += f"<{tag_name} filename=\"{file_path.name}\">\n"
         file_contents += intro_content + "\n\n" if intro_content else ""
         file_contents += task_content + "\n\n"
+        file_contents += f"</{tag_name}>\n\n"
 
     if not task_found:
         print(f"Task {question_num} not found in any assignment file.")
diff --git a/tests/open_ai_model_tests/integration_test.py b/tests/open_ai_model_tests/integration_test.py
@@ -4,8 +4,7 @@
 
 
 def test_cnn_example_openai_stdout(capsys, mock_and_capture):
-    """
-    Example 1:
+    """Example 1:
     Evaluate cnn_example test using openAI model and print to stdout.
     python -m ai_feedback --prompt code_lines --scope code \
         --submission test_submissions/cnn_example/cnn_submission \
@@ -30,13 +29,12 @@ def test_cnn_example_openai_stdout(capsys, mock_and_capture):
 
     assert "Compare the student's code and solution code. For each mistake" in output
     assert "(Line 1) import numpy as np" in output
-    assert "=== cnn_submission.py ===" in output
-    assert "=== cnn_solution.py ===" in output
+    assert '<submission filename="cnn_submission.py">' in output
+    assert '<solution filename="cnn_solution.py">' in output
 
 
 def test_cnn_example_custom_prompt_stdout(capsys, mock_and_capture):
-    """
-    Example 2:
+    """Example 2:
     Evaluate cnn_example test using openAI model and a custom prompt text, printing to stdout.
     python -m ai_feedback --prompt_text "Evaluate the student's code readability." \
         --scope code \
@@ -58,13 +56,12 @@ def test_cnn_example_custom_prompt_stdout(capsys, mock_and_capture):
     ]
     output = run_cli_and_capture(args, capsys)
     assert "Evaluate the student's code readability." in output
-    assert "=== cnn_submission.py ===" in output
+    assert '<submission filename="cnn_submission.py">' in output
     assert "(Line 1) import numpy as np" in output
 
 
 def test_pdf_example_openai_direct(capsys, mock_and_capture):
-    """
-    Example 3:
+    """Example 3:
     Evaluate pdf_example test using openAI model and direct output mode.
     python -m ai_feedback --prompt text_pdf_analyze --scope text \
         --submission test_submissions/pdf_example/student_pdf_submission.pdf \
@@ -86,3 +83,67 @@ def test_pdf_example_openai_direct(capsys, mock_and_capture):
     assert "Does the student correctly respond to the question, and meet all the" in output
     assert "student_pdf_submission.pdf" in output
     assert "Normalization allows each feature to have an equal influence on the mode" in output
+
+
+def test_xml_formatting_code_scope(capsys, mock_and_capture):
+    """
+    Test XML formatting for file contents in code scope.
+    Verifies that file contents use XML tags while file references remain plain text.
+    """
+    parent = Path(__file__).parent.parent.parent
+
+    args = [
+        "--prompt_text",
+        "File references: {file_references}\n\nFile contents:\n{file_contents}",
+        "--scope",
+        "code",
+        "--submission",
+        str(parent / "test_submissions/csc108/correct_submission/correct_submission.py"),
+        "--solution",
+        str(parent / "test_submissions/csc108/solution.py"),
+        "--model",
+        "openai",
+    ]
+    output = run_cli_and_capture(args, capsys)
+
+    assert "The student's submission file is correct_submission.py." in output
+    assert "The instructor's solution file is solution.py." in output
+
+    assert '<submission filename="correct_submission.py">' in output
+    assert '</submission>' in output
+    assert '<solution filename="solution.py">' in output
+    assert '</solution>' in output
+
+    assert "(Line 1) def fizzbuzz(n: int) -> list:" in output
+
+
+def test_xml_formatting_text_scope_with_test_output(capsys, mock_and_capture):
+    """
+    Test XML formatting for file contents in text scope with all file types.
+    Verifies submission, solution, and test_output files all use XML formatting.
+    """
+    parent = Path(__file__).parent.parent.parent
+
+    args = [
+        "--prompt_text",
+        "File references: {file_references}\n\nFile contents:\n{file_contents}",
+        "--submission_type",
+        "python",
+        "--scope",
+        "text",
+        "--submission",
+        str(parent / "test_submissions/ggr274_homework5/test1/student_submission.txt"),
+        "--solution",
+        str(parent / "test_submissions/ggr274_homework5/test1/Homework_5_solution.txt"),
+        "--model",
+        "openai",
+    ]
+    output = run_cli_and_capture(args, capsys)
+
+    assert "The student's submission file is student_submission.txt." in output
+    assert "The instructor's solution file is Homework_5_solution.txt." in output
+
+    assert '<submission filename="student_submission.txt">' in output
+    assert '</submission>' in output
+    assert '<solution filename="Homework_5_solution.txt">' in output
+    assert '</solution>' in output