diff --git a/README.md b/README.md
index a84cab3..4630c1c 100644
--- a/README.md
+++ b/README.md
@@ -43,6 +43,7 @@ For the image scope, the program takes up to two files, depending on the prompt
 | `--system_prompt` | Pre-defined system prompt name or file path to custom system prompt | ❌ |
 | `--llama_mode` | How to invoke deepSeek-v3 (choices in `arg_options.LlamaMode`) | ❌ |
 | `--output_template` | Output template file (from `arg_options.OutputTemplate) | ❌ |
+| `--json_schema` | File path to a JSON file containing the schema for structured output | ❌ |
 ** One of either `--prompt` or `--prompt_text` must be selected. If both are provided, `--prompt_text` will be appended to the contents of the file specified by `--prompt`.
 
 ## Scope
@@ -317,6 +318,12 @@ python3 -m ai_feedback --prompt code_table --scope code \
   --model deepSeek-v3 --llama_mode cli
 ```
 
+
+#### Get annotations for cnn_example test using OpenAI model
+```bash
+python -m ai_feedback --prompt code_annotations --scope code --submission test_submissions/cnn_example/cnn_submission --solution test_submissions/cnn_example/cnn_solution.py --model openai --json_schema ai_feedback/data/schema/code_annotation_schema.json
+```
+
 #### Evaluate using custom prompt file path
 ```bash
 python -m ai_feedback --prompt ai_feedback/data/prompts/user/code_overall.md --scope code --submission test_submissions/csc108/correct_submission/correct_submission.py --solution test_submissions/csc108/solution.py --model codellama:latest
diff --git a/ai_feedback/__main__.py b/ai_feedback/__main__.py
index ae0ca8f..b730826 100644
--- a/ai_feedback/__main__.py
+++ b/ai_feedback/__main__.py
@@ -207,6 +207,13 @@ def main() -> int:
         default="cli",
         help=HELP_MESSAGES["llama_mode"],
     )
+    parser.add_argument(
+        "--json_schema",
+        type=str,
+        required=False,
+        default="",
+        help=HELP_MESSAGES["json_schema"],
+    )
 
     args = parser.parse_args()
 
diff --git a/ai_feedback/code_processing.py b/ai_feedback/code_processing.py
index e034882..a10cc2c 100644
--- a/ai_feedback/code_processing.py
+++ b/ai_feedback/code_processing.py
@@ -87,6 +87,7 @@ def process_code(args, prompt: str, system_instructions: str) -> Tuple[str, str]
             question_num=args.question,
             system_instructions=system_instructions,
             llama_mode=args.llama_mode,
+            json_schema=args.json_schema,
         )
     else:
         request, response = model.generate_response(
@@ -96,6 +97,7 @@ def process_code(args, prompt: str, system_instructions: str) -> Tuple[str, str]
             test_output=test_output_file,
             system_instructions=system_instructions,
             llama_mode=args.llama_mode,
+            json_schema=args.json_schema,
         )
 
     return request, response
diff --git a/ai_feedback/data/schema/code_annotation_schema.json b/ai_feedback/data/schema/code_annotation_schema.json
new file mode 100644
index 0000000..7193602
--- /dev/null
+++ b/ai_feedback/data/schema/code_annotation_schema.json
@@ -0,0 +1,54 @@
+{
+  "name": "student_code_annotation",
+  "description": "List of code annotations describing specific mistakes in the student's code.",
+  "schema": {
+    "type": "object",
+    "properties": {
+      "annotations": {
+        "type": "array",
+        "items": {
+          "type": "object",
+          "properties": {
+            "filename": {
+              "type": "string",
+              "description": "The name of the student's file where the issue was found."
+            },
+            "content": {
+              "type": "string",
+              "description": "A short description of the mistake or issue."
+            },
+            "line_start": {
+              "type": "integer",
+              "description": "The starting line number where the issue begins.",
+              "minimum": 1
+            },
+            "line_end": {
+              "type": "integer",
+              "description": "The ending line number where the issue ends.",
+              "minimum": 1
+            },
+            "column_start": {
+              "type": "integer",
+              "description": "The starting column position of the mistake.",
+              "minimum": 0
+            },
+            "column_end": {
+              "type": "integer",
+              "description": "The ending column position of the mistake.",
+              "minimum": 0
+            }
+          },
+          "required": [
+            "filename",
+            "content",
+            "line_start",
+            "line_end",
+            "column_start",
+            "column_end"
+          ]
+        }
+      }
+    },
+    "required": ["annotations"]
+  }
+}
diff --git a/ai_feedback/helpers/constants.py b/ai_feedback/helpers/constants.py
index 592e16b..0896d65 100644
--- a/ai_feedback/helpers/constants.py
+++ b/ai_feedback/helpers/constants.py
@@ -14,5 +14,6 @@
     "test_output": "The output of tests from evaluating the assignment.",
     "submission_image": "The file path for the image file.",
     "solution_image": "The file path to the solution image.",
+    "json_schema": "The file path to a JSON file containing the schema for structured model output.",
     "system_prompt": "Pre-defined system prompt name (from ai_feedback/data/prompts/system/) or file path to custom system prompt file.",
 }
diff --git a/ai_feedback/image_processing.py b/ai_feedback/image_processing.py
index 206bc40..a7374ac 100644
--- a/ai_feedback/image_processing.py
+++ b/ai_feedback/image_processing.py
@@ -165,6 +165,7 @@ def process_image(args, prompt: dict, system_instructions: str) -> tuple[str, st
                 system_instructions=system_instructions,
                 question_num=question,
                 submission_image=args.submission_image,
+                json_schema=args.json_schema,
             )
             responses.append(str(response))
         else:
diff --git a/ai_feedback/models/ClaudeModel.py b/ai_feedback/models/ClaudeModel.py
index 584baa9..e3fae4d 100644
--- a/ai_feedback/models/ClaudeModel.py
+++ b/ai_feedback/models/ClaudeModel.py
@@ -29,6 +29,7 @@ def generate_response(
         question_num: Optional[int] = None,
         test_output: Optional[Path] = None,
         llama_mode: Optional[str] = None,
+        json_schema: Optional[str] = None,
     ) -> Optional[Tuple[str, str]]:
         """
         Generates a response from Claude using the provided prompt and assignment file context.
@@ -42,6 +43,7 @@ def generate_response(
             question_num (Optional[int]): Specific task number to extract from text files.
             system_instructions (str): instructions for the model
             llama_mode (Optional[str]): Optional mode to invoke llama.cpp in.
+            json_schema (Optional[str]): Optional file path to a JSON schema for structured output.
 
         Returns:
             Optional[Tuple[str, str]]: The original prompt and the model's response, or None if the response is invalid.
diff --git a/ai_feedback/models/CodeLlamaModel.py b/ai_feedback/models/CodeLlamaModel.py
index 8299348..01d082f 100644
--- a/ai_feedback/models/CodeLlamaModel.py
+++ b/ai_feedback/models/CodeLlamaModel.py
@@ -1,3 +1,4 @@
+import json
 from pathlib import Path
 from typing import Optional, Tuple
 
@@ -26,6 +27,7 @@ def generate_response(
         test_output: Optional[Path] = None,
         scope: Optional[str] = None,
         llama_mode: Optional[str] = None,
+        json_schema: Optional[str] = None,
     ) -> Optional[Tuple[str, str]]:
         """
         Generates a response from the CodeLlama model using the provided prompt
@@ -40,11 +42,20 @@ def generate_response(
             question_num (Optional[int]): An optional specific question number to extract content for.
             system_instructions (str): instructions for the model
             llama_mode (Optional[str]): Optional mode to invoke llama.cpp in.
+            json_schema (Optional[str]): Optional file path to a JSON schema for structured output.
 
         Returns:
             Optional[Tuple[str, str]]: A tuple of the request and the model's response,
             or None if no valid response is returned.
         """
+        if json_schema:
+            schema_path = Path(json_schema)
+            if not schema_path.exists():
+                raise FileNotFoundError(f"JSON schema file not found: {schema_path}")
+            with open(schema_path, "r", encoding="utf-8") as f:
+                schema = json.load(f)
+        else:
+            schema = None
 
         response = ollama.chat(
             model=self.model["model"],
@@ -52,6 +63,7 @@
                 {"role": "system", "content": system_instructions},
                 {"role": "user", "content": prompt},
             ],
+            format=schema['schema'] if schema else None,
         )
 
         if not response or "message" not in response or "content" not in response["message"]:
diff --git a/ai_feedback/models/DeepSeekModel.py b/ai_feedback/models/DeepSeekModel.py
index ebace4c..c4562a5 100644
--- a/ai_feedback/models/DeepSeekModel.py
+++ b/ai_feedback/models/DeepSeekModel.py
@@ -1,3 +1,4 @@
+import json
 from pathlib import Path
 from typing import Optional, Tuple
 
@@ -24,6 +25,7 @@ def generate_response(
         test_output: Optional[Path] = None,
         scope: Optional[str] = None,
         llama_mode: Optional[str] = None,
+        json_schema: Optional[str] = None,
     ) -> Optional[Tuple[str, str]]:
         """
         Generate a model response using the prompt and assignment files.
@@ -37,11 +39,20 @@ def generate_response(
             question_num (Optional[int]): An optional question number to target specific content.
             system_instructions (str): instructions for the model
             llama_mode (Optional[str]): Optional mode to invoke llama.cpp in.
+            json_schema (Optional[str]): Optional file path to a JSON schema for structured output.
 
         Returns:
             Optional[Tuple[str, str]]: A tuple containing the prompt and the model's response,
             or None if the response was invalid.
         """
+        if json_schema:
+            schema_path = Path(json_schema)
+            if not schema_path.exists():
+                raise FileNotFoundError(f"JSON schema file not found: {schema_path}")
+            with open(schema_path, "r", encoding="utf-8") as f:
+                schema = json.load(f)
+        else:
+            schema = None
 
         response = ollama.chat(
             model=self.model["model"],
@@ -49,6 +60,7 @@
                 {"role": "system", "content": system_instructions},
                 {"role": "user", "content": prompt},
             ],
+            format=schema['schema'] if schema else None,
        )
 
         if not response or "message" not in response or "content" not in response["message"]:
diff --git a/ai_feedback/models/DeepSeekV3Model.py b/ai_feedback/models/DeepSeekV3Model.py
index 2b0975f..8c91269 100644
--- a/ai_feedback/models/DeepSeekV3Model.py
+++ b/ai_feedback/models/DeepSeekV3Model.py
@@ -1,3 +1,4 @@
+import json
 import os
 import subprocess
 import sys
@@ -31,6 +32,7 @@ def generate_response(
         question_num: Optional[int] = None,
         test_output: Optional[Path] = None,
         llama_mode: Optional[str] = None,
+        json_schema: Optional[str] = None,
     ) -> Optional[Tuple[str, str]]:
         """
         Generate a model response using the prompt and assignment files.
@@ -44,18 +46,28 @@ def generate_response(
             test_output (Optional[Path]): Path Object pointing to the test output file.
             llama_mode (Optional[str]): Optional mode to invoke llama.cpp in.
             question_num (Optional[int]): An optional question number to target specific content.
+            json_schema (Optional[str]): Optional file path to a JSON schema for structured output.
 
         Returns:
             Optional[Tuple[str, str]]: A tuple containing the prompt and the model's response,
             or None if the response was invalid.
         """
+        if json_schema:
+            schema_path = Path(json_schema)
+            if not schema_path.exists():
+                raise FileNotFoundError(f"JSON schema file not found: {schema_path}")
+            with open(schema_path, "r", encoding="utf-8") as f:
+                schema = json.load(f)
+        else:
+            schema = None
+
         prompt = f"{system_instructions}\n{prompt}"
 
         if llama_mode == 'server':
             self._ensure_env_vars('LLAMA_SERVER_URL')
-            response = self._get_response_server(prompt)
+            response = self._get_response_server(prompt, schema)
         else:
             self._ensure_env_vars('LLAMA_MODEL_PATH', 'LLAMA_CLI_PATH')
-            response = self._get_response_cli(prompt)
+            response = self._get_response_cli(prompt, schema)
 
         response = response.strip()
@@ -81,24 +93,24 @@ def _ensure_env_vars(self, *names):
         if missing:
             raise RuntimeError(f"Error: Environment variable(s) {', '.join(missing)} not set")
 
-    def _get_response_server(
-        self,
-        prompt: str,
-    ) -> str:
+    def _get_response_server(self, prompt: str, schema: Optional[dict] = None) -> str:
         """
         Generate a model response using the prompt
 
         Args:
             prompt (str): The input prompt provided by the user.
+            schema (Optional[dict]): Optional schema provided by the user.
 
         Returns:
             str: A tuple containing the model response or None if the response was invalid.
         """
         url = f"{LLAMA_SERVER_URL}/v1/completions"
 
-        payload = {
-            "prompt": prompt,
-        }
+        payload = {"prompt": prompt, "temperature": 0.7, "max_tokens": 1000}
+
+        if schema:
+            raw_schema = schema.get("schema", schema)
+            payload["json_schema"] = raw_schema
 
         try:
             response = requests.post(url, json=payload, timeout=3000)
@@ -116,15 +128,13 @@ def _get_response_server(
 
         return model_output
 
-    def _get_response_cli(
-        self,
-        prompt: str,
-    ) -> str:
+    def _get_response_cli(self, prompt: str, schema: Optional[dict] = None) -> str:
         """
         Generate a model response using the prompt
 
         Args:
             prompt (str): The input prompt provided by the user.
+            schema (Optional[dict]): Optional schema provided by the user.
 
         Returns:
             str: The model response or None if the response was invalid.
@@ -141,6 +151,10 @@ def _get_response_cli(
             "--no-display-prompt",
         ]
 
+        if schema:
+            raw_schema = schema["schema"] if "schema" in schema else schema
+            cmd += ["--json-schema", json.dumps(raw_schema)]
+
         try:
             completed = subprocess.run(
                 cmd, input=prompt.encode(), check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=300
diff --git a/ai_feedback/models/OpenAIModel.py b/ai_feedback/models/OpenAIModel.py
index a765ec6..8ecaa5f 100644
--- a/ai_feedback/models/OpenAIModel.py
+++ b/ai_feedback/models/OpenAIModel.py
@@ -1,4 +1,6 @@
+import json
 import os
+import re
 from pathlib import Path
 from typing import Optional, Tuple
 
@@ -30,6 +32,7 @@ def generate_response(
         test_output: Optional[Path] = None,
         scope: Optional[str] = None,
         llama_mode: Optional[str] = None,
+        json_schema: Optional[str] = None,
     ) -> Tuple[str, str]:
         """
         Generate a response based on the given prompt and assignment context.
@@ -43,30 +46,47 @@ def generate_response(
             question_num (Optional[int]): Specific question number to focus on.
             system_instructions (str): instructions for the model
             llama_mode (Optional[str]): Optional mode to invoke llama.cpp in.
+            json_schema (Optional[str]): Optional file path to a JSON schema for structured output.
 
         Returns:
             Tuple[str, str]: The full prompt and the generated response from OpenAI.
         """
-        response = self._call_openai(prompt, system_instructions)
+        if json_schema:
+            schema_path = Path(json_schema)
+            if not schema_path.exists():
+                raise FileNotFoundError(f"JSON schema file not found: {schema_path}")
+            with open(schema_path, "r", encoding="utf-8") as f:
+                schema = json.load(f)
+        else:
+            schema = None
+
+        response = self._call_openai(prompt, system_instructions, schema)
         return prompt, response
 
-    def _call_openai(self, prompt: str, system_instructions: str) -> str:
+    def _call_openai(self, prompt: str, system_instructions: str, schema: Optional[dict] = None) -> str:
         """
         Send a prompt to OpenAI's chat completion API and retrieve the generated response.
 
         Args:
             prompt (str): The fully constructed input prompt including file content.
+            schema (Optional[dict]): Optional json schema to use.
 
         Returns:
             str: The model's response text.
         """
+        response_format = None
+        if schema:
+            response_format = {"type": "json_schema", "json_schema": schema}
+
         response = self.client.chat.completions.create(
-            model="gpt-4-turbo",
+            model="gpt-4o-mini",
             messages=[
                 {"role": "system", "content": system_instructions},
                 {"role": "user", "content": prompt},
             ],
-            max_tokens=1000,
+            response_format=response_format,
             temperature=0.5,
+            max_tokens=1000,
         )
+
         return response.choices[0].message.content
diff --git a/ai_feedback/models/OpenAIModelVector.py b/ai_feedback/models/OpenAIModelVector.py
index 84d8986..b5dd311 100644
--- a/ai_feedback/models/OpenAIModelVector.py
+++ b/ai_feedback/models/OpenAIModelVector.py
@@ -1,3 +1,4 @@
+import json
 import os
 from pathlib import Path
 from typing import List, Optional
@@ -28,7 +29,7 @@ def __init__(self) -> None:
         self.vector_store = self.client.vector_stores.create(name="Markus LLM Vector Store")
         self.model = self.client.beta.assistants.create(
             name="Markus LLM model",
-            model="gpt-4-turbo",
+            model="gpt-4o-mini",
             tools=[{"type": "file_search"}],
             tool_resources={"file_search": {"vector_store_ids": [self.vector_store.id]}},
         )
@@ -43,6 +44,7 @@ def generate_response(
         test_output: Optional[Path] = None,
         scope: Optional[str] = None,
         llama_mode: Optional[str] = None,
+        json_schema: Optional[str] = None,
     ) -> tuple[str, str]:
         """
         Generate a response from the OpenAI model using the provided prompt and assignment files.
@@ -56,6 +58,7 @@ def generate_response(
             question_num (Optional[int]): An optional question number.
             system_instructions (str): instructions for the model
             llama_mode (Optional[str]): Optional mode to invoke llama.cpp in.
+            json_schema (Optional[str]): Optional file path to a JSON schema for structured output.
 
         Returns:
             tuple[str, str]: A tuple containing the full system request and the model's text response.
@@ -64,6 +67,15 @@
         if not self.model:
             raise RuntimeError("Model was not created successfully.")
 
+        if json_schema:
+            schema_path = Path(json_schema)
+            if not schema_path.exists():
+                raise FileNotFoundError(f"JSON schema file not found: {schema_path}")
+            with open(schema_path, "r", encoding="utf-8") as f:
+                schema = json.load(f)
+        else:
+            schema = None
+
         request = "Uploaded Files: "
         file_ids: List[str] = []
         assignment_files = [f for f in (submission_file, solution_file, test_output) if f]
@@ -77,7 +89,7 @@
         if question_num:
             prompt += f" Identify and generate a response for the mistakes **only** in task ${question_num}. "
 
-        response = self._call_openai(prompt)
+        response = self._call_openai(prompt, schema)
         self._cleanup_resources(file_ids)
 
         request = f"\n{system_instructions}\n{prompt}"
@@ -98,12 +110,13 @@ def _upload_file(self, file_path: Path) -> str:
         self.client.vector_stores.files.create(vector_store_id=self.vector_store.id, file_id=response.id)
         return response.id
 
-    def _call_openai(self, prompt: str) -> str:
+    def _call_openai(self, prompt: str, schema: Optional[dict] = None) -> str:
         """
         Send the user prompt to OpenAI's assistant model and retrieve the generated response.
 
         Args:
             prompt (str): The input prompt for the assistant.
+            schema (Optional[dict]): Optional json schema to use.
 
         Returns:
             str: The assistant's generated response text.
@@ -112,7 +125,18 @@
 
         self.client.beta.threads.messages.create(thread_id=thread.id, role="user", content=prompt)
 
-        run = self.client.beta.threads.runs.create(thread_id=thread.id, assistant_id=self.model.id)
+        response_format = None
+        if schema:
+            response_format = {
+                "type": "json_schema",
+                "json_schema": schema,
+            }
+
+        run = self.client.beta.threads.runs.create(
+            thread_id=thread.id,
+            assistant_id=self.model.id,
+            **({"response_format": response_format} if response_format else {}),
+        )
 
         while run.status not in ["completed", "failed"]:
             run = self.client.beta.threads.runs.retrieve(thread_id=thread.id, run_id=run.id)
diff --git a/ai_feedback/models/RemoteModel.py b/ai_feedback/models/RemoteModel.py
index 43645fe..06ee52d 100644
--- a/ai_feedback/models/RemoteModel.py
+++ b/ai_feedback/models/RemoteModel.py
@@ -39,6 +39,7 @@ def generate_response(
         scope: Optional[str] = None,
         llama_mode: Optional[str] = None,
         submission_image: Optional[str] = None,
+        json_schema: Optional[str] = None,
     ) -> Optional[Tuple[str, str]]:
         """
         Generate a model response using the prompt and assignment files.
@@ -53,6 +54,7 @@ def generate_response(
             system_instructions (str): instructions for the model
             llama_mode (Optional[str]): Optional mode to invoke llama.cpp in.
             submission_image (Optional[str]): An optional path to a submission image file.
+            json_schema (Optional[str]): An optional file path to a JSON schema for structured output.
 
         Returns:
             Optional[Tuple[str, str]]: A tuple containing the prompt and the model's response,
diff --git a/ai_feedback/text_processing.py b/ai_feedback/text_processing.py
index 9250090..81b0dd6 100644
--- a/ai_feedback/text_processing.py
+++ b/ai_feedback/text_processing.py
@@ -60,6 +60,7 @@ def process_text(args, prompt: str, system_instructions: str) -> Tuple[str, str]
             question_num=args.question,
             system_instructions=system_instructions,
             llama_mode=args.llama_mode,
+            json_schema=args.json_schema,
         )
     else:
         request, response = model.generate_response(
@@ -69,6 +70,7 @@ def process_text(args, prompt: str, system_instructions: str) -> Tuple[str, str]
             scope=args.scope,
             system_instructions=system_instructions,
             llama_mode=args.llama_mode,
+            json_schema=args.json_schema,
         )
 
     return request, response
diff --git a/tests/local_tests/schema_structure_validation.py b/tests/local_tests/schema_structure_validation.py
new file mode 100644
index 0000000..98d0ede
--- /dev/null
+++ b/tests/local_tests/schema_structure_validation.py
@@ -0,0 +1,66 @@
+import json
+import subprocess
+from pathlib import Path
+
+import pytest
+
+BASE_DIR = Path(__file__).resolve().parent.parent.parent
+SUBMISSION = BASE_DIR / "test_submissions/cnn_example/cnn_submission.py"
+SOLUTION = BASE_DIR / "test_submissions/cnn_example/cnn_solution.py"
+SCHEMA_PATH = BASE_DIR / "ai_feedback/data/schema/code_annotation_schema.json"
+
+
+def run_cli(model_name: str) -> dict:
+    command = [
+        "python3",
+        "-m",
+        "ai_feedback",
+        "--prompt",
+        "code_annotation",
+        "--scope",
+        "code",
+        "--submission",
+        str(SUBMISSION),
+        "--solution",
+        str(SOLUTION),
+        "--model",
+        model_name,
+        "--json_schema",
+        str(SCHEMA_PATH),
+    ]
+    result = subprocess.run(command, capture_output=True, text=True)
+    assert result.returncode == 0, f"{model_name} failed: {result.stderr}"
+
+    output = result.stdout.strip()
+    json_start = output.find("{")
+    assert json_start != -1, f"{model_name} output has no JSON object"
+
+    return json.loads(output[json_start:])
+
+
+def validate_json_schema(result: dict):
+    assert "annotations" in result, "Missing 'annotations' key"
+    assert isinstance(result["annotations"], list), "'annotations' must be a list"
+    for item in result["annotations"]:
+        assert isinstance(item, dict), "Each annotation must be an object"
+        for key in ["filename", "content", "line_start", "line_end", "column_start", "column_end"]:
+            assert key in item, f"Missing key: {key}"
+            if key in ["filename", "content"]:
+                assert isinstance(item[key], str), f"{key} must be a string"
+            else:
+                assert isinstance(item[key], int), f"{key} must be an integer"
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        "openai",
+        "openai-vector",
+        "codellama:latest",
+        "deepSeek-R1:70B",
+        "deepSeek-v3",
+    ],
+)
+def test_model_outputs_valid_json_schema(model):
+    result = run_cli(model)
+    validate_json_schema(result)
diff --git a/tests/test_helper.py b/tests/test_helper.py
index a9ee45e..f00284b 100644
--- a/tests/test_helper.py
+++ b/tests/test_helper.py
@@ -73,6 +73,7 @@ def fake_generate_response(
         test_output: Optional[Path] = None,
         scope: Optional[str] = None,
         llama_mode: Optional[str] = None,
+        json_schema: Optional[str] = None,
     ):
         all_prompts.append((test_name, "OpenAIModel.generate_response", prompt))
         return prompt, f"[MOCKED RESPONSE] \n {prompt}"