tensorzero · virajmehta · Dec 4, 2025 · Dec 2, 2025 · Dec 4, 2025 · Dec 4, 2025
diff --git a/tensorzero/swe_agent_config/aaron_templates/action_observation.minijinja b/tensorzero/swe_agent_config/aaron_templates/action_observation.minijinja
@@ -0,0 +1,24 @@
+<returncode>{{output.returncode}}</returncode>
+{# Outputs of up to 10000 characters are shown in full; longer ones are reduced to a 5000-character head and tail. -#}
+{% if output.output | length <= 10000 -%}
+<output>
+{{ output.output -}}
+</output>
+{%- else -%}
+<warning>
+The output of your last command was too long.
+Please try a different command that produces less output.
+If you're looking at a file, use head, tail, or sed to view a smaller number of lines selectively.
+If you're using grep or find and it produced too much output, use a more selective search pattern.
+If you really need to see something from the full output, redirect it to a file and then search in that file.
+</warning>
+{%- set elided_chars = output.output | length - 10000 -%}
+<output_head>
+{{ output.output[:5000] }}
+</output_head>
+<elided_chars>
+{{ elided_chars }} characters elided
+</elided_chars>
+<output_tail>
+{{ output.output[-5000:] }}
+</output_tail>
+{%- endif -%}
diff --git a/tensorzero/swe_agent_config/aaron_templates/format_error.minijinja b/tensorzero/swe_agent_config/aaron_templates/format_error.minijinja
@@ -0,0 +1,23 @@
+Please always provide EXACTLY ONE action in triple backticks, found {{actions|length}} actions.
+
+If you want to end the task, use the completion command:
+
+```bash
+echo "COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT
+REASONING: [Your reasoning here]"
+```
+
+Do not combine the completion command with any other command.
+
+Otherwise, format your response exactly as follows:
+
+<response_example>
+THOUGHT: Your reasoning about why you want to perform this action.
+
+```bash
+<your_command_here>
+```
+</response_example>
+
+Note: In rare cases, if you need to reference triple backticks in your command, proceed in two steps:
+first write TRIPLEBACKTICKSBASH, then replace it with ```bash in a subsequent command.
diff --git a/tensorzero/swe_agent_config/aaron_templates/instance.minijinja b/tensorzero/swe_agent_config/aaron_templates/instance.minijinja
@@ -0,0 +1,89 @@
+{{task}}
+
+## Your Mission
+
+Your goal is to:
+1. Read and understand the CI failure information provided in `ci_failure_context.md`
+2. Make targeted fixes to resolve the failing tests/checks
+3. Validate your fixes by running the appropriate tests locally
+
+## Validation Requirements
+
+After making changes, you MUST validate them by running:
+- The specific failing tests (to ensure they now pass)
+- Linters and formatters (eslint, prettier, black, ruff, cargo fmt, etc.)
+- The build process (npm run build, cargo build, etc.)
+- Language-specific checks (cargo check, cargo clippy, tsc --noEmit, etc.)
+
+Your response must contain exactly ONE bash code block with ONE command (or commands connected with && or ||).
+Include a THOUGHT section before your command where you explain your reasoning process.
+Format your response as shown in <format_example>.
+
+<format_example>
+THOUGHT: Your reasoning and analysis here. Explain why you want to perform the action.
+
+```bash
+your_command_here
+```
+</format_example>
+
+Failure to follow these rules will cause your response to be rejected.
+
+## Completion Signal
+
+When you are done and have validated your fix, signal completion:
+
+```bash
+echo "COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT
+REASONING: Brief explanation of the changes you made and what you fixed"
+```
+
+Do not combine the completion command with any other command.
+
+## CI Failure Information
+
+The CI failure details are available in the file `ci_failure_context.md` in the current directory.
+Read this file first to understand what failed and why.
+
+## Recommended Workflow
+
+Work step-by-step to ensure you can iterate on your changes and catch any problems:
+
+1. **Read the CI failure context** - `cat ci_failure_context.md`
+2. **Analyze the codebase** - Find and read relevant files mentioned in the failure
+3. **Understand the root cause** - Identify why the tests/checks are failing
+4. **Create a reproduction script** (if applicable) - Verify you can reproduce the failure locally
+5. **Make targeted fixes** - Edit the source code to resolve the issue
+6. **Run validation** - Execute the failing tests, linters, and build to verify your fix
+7. **Iterate if needed** - If validation fails, debug and fix until all checks pass
+8. **Submit your work** - Signal completion using the completion signal
+
+## Important Rules
+
+1. Every response must contain exactly one action in triple backticks
+2. Directory or environment variable changes are not persistent - every action runs in a new subshell
+3. You can prefix commands with environment variables or directory changes: `cd /path && command`
+4. If a command needs more time, add '# timeout: <seconds>' on the first line (max {{max_timeout}} seconds).
+
+<system_information>
+{{system}} {{release}} {{version}} {{machine}}
+</system_information>
+
+## Example Session
+
+<example_response>
+THOUGHT: I need to first read the CI failure context to understand what went wrong in the pull request.
+
+```bash
+cat ci_failure_context.md
+```
+</example_response>
+
+## Example: Requesting a Longer Timeout
+```bash
+# timeout: 300
+uv run expensive_script.py
+```
+
+Now begin your work!
+Do not commit to git, just signal completion when you are happy with the state of the project.
diff --git a/tensorzero/swe_agent_config/aaron_templates/system.minijinja b/tensorzero/swe_agent_config/aaron_templates/system.minijinja
@@ -0,0 +1 @@
+You are an expert software engineer helping to fix CI failures in a GitHub pull request.
diff --git a/tensorzero/swe_agent_config/tensorzero.toml b/tensorzero/swe_agent_config/tensorzero.toml
@@ -1,6 +1,7 @@
 gateway.debug = true
 
 [[rate_limiting.rules]]
+always = true
 tokens_per_day = 10_000_000
 
 [models.gpt-5]
@@ -33,6 +34,13 @@ routing = ["anthropic"]
 type = "anthropic"
 model_name = "claude-sonnet-4-5-20250929"
 
+[models.claude-opus-4-5]
+routing = ["anthropic"]
+
+[models.claude-opus-4-5.providers.anthropic]
+type = "anthropic"
+model_name = "claude-opus-4-5-20251101"
+
 [models.claude-4-5-opus-thinking]
 routing = ["anthropic"]
 
@@ -47,8 +55,12 @@ extra_body = [
 [functions.swe_agent]
 type = "chat"
 
+# Experiment over the variants listed in candidate_variants, scored by the
+# ci_fix_pr_merged_agent metric. The gpt-5 and claude-4-5-sonnet variants are
+# defined in this file but are not listed as candidates here.
+# NOTE(review): exact track_and_stop semantics are defined by TensorZero — confirm against its docs.
+[functions.swe_agent.experimentation]
+type = "track_and_stop"
+candidate_variants = [
+    "shuyang-gpt-5-1-codex",
+    "aaron-claude-opus",
+    "viraj-claude-4-5-opus",
+    "alan-gemini-3-0-pro",
+    "aj-claude-4-5-opus-thinking",
+    "gb",
+]
+metric = "ci_fix_pr_merged_agent"
+
 [functions.swe_agent.variants.gpt-5]
-weight = 0
 type = "chat_completion"
 model = "gpt-5"
 templates.system.path = "templates/system.minijinja"
@@ -57,51 +69,76 @@ templates.action_observation.path = "templates/action_observation.minijinja"
 templates.format_error.path = "templates/format_error.minijinja"
 
 [functions.swe_agent.variants.shuyang-gpt-5-1-codex]
-weight = 1
 type = "chat_completion"
 model = "gpt-5.1-codex"
+retries = { num_retries = 2, max_delay_s = 15 }
+timeouts = { non_streaming.total_ms = 120_000, streaming.ttft_ms = 30_000 }
 templates.system.path = "templates/shuyang-gpt-5-1/system.minijinja"
 templates.instance.path = "templates/shuyang-gpt-5-1/instance.minijinja"
 templates.action_observation.path = "templates/action_observation.minijinja"
 templates.format_error.path = "templates/format_error.minijinja"
 
 [functions.swe_agent.variants.claude-4-5-sonnet]
-weight = 1
 type = "chat_completion"
 model = "claude-4-5-sonnet"
 templates.system.path = "templates/system.minijinja"
 templates.instance.path = "templates/instance.minijinja"
 templates.action_observation.path = "templates/action_observation.minijinja"
 templates.format_error.path = "templates/format_error.minijinja"
 
-[functions.swe_agent.variants.viraj]
-weight = 1
+[functions.swe_agent.variants.aaron-claude-opus]
+type = "chat_completion"
+model = "claude-opus-4-5"
+retries = { num_retries = 2, max_delay_s = 15 }
+timeouts = { non_streaming.total_ms = 120_000, streaming.ttft_ms = 30_000 }
+templates.system.path = "aaron_templates/system.minijinja"
+templates.instance.path = "aaron_templates/instance.minijinja"
+templates.action_observation.path = "aaron_templates/action_observation.minijinja"
+templates.format_error.path = "aaron_templates/format_error.minijinja"
+
+[functions.swe_agent.variants.viraj-claude-4-5-opus]
 type = "chat_completion"
 model = "claude-4-5-opus-thinking"
+retries = { num_retries = 2, max_delay_s = 15 }
+timeouts = { non_streaming.total_ms = 120_000, streaming.ttft_ms = 30_000 }
 templates.system.path = "viraj/templates/system.minijinja"
 templates.instance.path = "viraj/templates/instance.minijinja"
 templates.action_observation.path = "viraj/templates/action_observation.minijinja"
 templates.format_error.path = "viraj/templates/format_error.minijinja"
 
 [functions.swe_agent.variants.aj-claude-4-5-opus-thinking]
-weight = 1
 type = "chat_completion"
 model = "claude-4-5-opus-thinking"
+retries = { num_retries = 2, max_delay_s = 15 }
+timeouts = { non_streaming.total_ms = 120_000, streaming.ttft_ms = 30_000 }
 templates.system.path = "aj/system.minijinja"
 templates.instance.path = "aj/instance.minijinja"
 templates.action_observation.path = "aj/action_observation.minijinja"
 templates.format_error.path = "aj/format_error.minijinja"
 max_tokens = 64_000
 
-[functions.swe_agent.variants.gemini-3-0-pro]
-weight = 1
+[functions.swe_agent.variants.alan-gemini-3-0-pro]
 type = "chat_completion"
-model = "google::gemini-3.0-pro-exp"
+model = "google_ai_studio_gemini::gemini-3.0-pro-exp"
+retries = { num_retries = 2, max_delay_s = 15 }
+timeouts = { non_streaming.total_ms = 120_000, streaming.ttft_ms = 30_000 }
 templates.system.path = "templates_gemini/system_gemini.minijinja"
 templates.instance.path = "templates_gemini/instance_gemini.minijinja"
 templates.action_observation.path = "templates_gemini/action_observation_gemini.minijinja"
 templates.format_error.path = "templates_gemini/format_error_gemini.minijinja"
 
+[functions.swe_agent.variants.gb]
+type = "chat_completion"
+model = "anthropic::claude-opus-4-5"
+max_tokens = 64_000
+thinking_budget_tokens = 32_000
+retries = { num_retries = 2, max_delay_s = 15 }
+timeouts = { non_streaming.total_ms = 120_000, streaming.ttft_ms = 30_000 }
+templates.system.path = "templates/gb/system.minijinja"
+templates.instance.path = "templates/gb/instance.minijinja"
+templates.action_observation.path = "templates/gb/action_observation.minijinja"
+templates.format_error.path = "templates/gb/format_error.minijinja"
+
 # Metrics for tracking agent performance
 # Many of them are not yet used except for ci_fix_pr_merged_agent
 [metrics.ci_fix_validation_passed]
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		You are an expert software engineer helping to fix CI failures in a GitHub pull request.
virajmehta marked this conversation as resolved. Show resolved Hide resolved