Skip to content

Commit 8b700fb

Browse files
author
Daniel Schlör
committed
Added unmatched-prediction feedback and a mock mode for the white agent to ensure deterministic results
1 parent cfb3fb2 commit 8b700fb

14 files changed

+3248
-34
lines changed

src/evaluator/README.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,8 @@ Results are saved in JSON format. For single challenges, the default output path
259259

260260
**Note:** When using `task_mode=goal` with `include_goal=first`, the first step will have `"_example_step": true` and `"completed": false`. This step is excluded from score calculation.
261261

262+
Each alternative now includes the agent's prediction with a field name indicating whether it matched (`matched_prediction`) or didn't match (`unmatched_prediction`). This allows you to compare what the agent predicted against all alternatives, not just the matched one.
263+
262264
```json
263265
{
264266
"challenge": "Funbox",
@@ -270,12 +272,13 @@ Results are saved in JSON format. For single challenges, the default output path
270272
{
271273
"completed": false,
272274
"original_command": "netdiscover -i eth1 -r 192.168.0.0/24",
275+
"unmatched_prediction": "nmap -sV 192.168.194.128",
273276
"gold": true
274277
},
275278
{
276279
"completed": true,
277-
"matched_command": "nmap -sV 192.168.194.128",
278280
"original_command": "nmap -sn 192.168.0.0/24",
281+
"matched_prediction": "nmap -sV 192.168.194.128",
279282
"gold": false
280283
}
281284
]

src/evaluator/workflow.py

Lines changed: 35 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -749,13 +749,15 @@ def _evaluate_response_node(self, state: EvaluationState) -> Dict[str, Any]:
749749

750750
# Store evaluation result for use in record_result_node
751751
# Use accumulated commands if available, otherwise last prediction
752-
matched_command = accumulated if accumulated else state["agent_predictions"][-1]
752+
agent_prediction = accumulated if accumulated else state["agent_predictions"][-1]
753+
# Always store the agent's prediction (for both matched and unmatched cases)
754+
eval_result["agent_prediction"] = agent_prediction
753755
if eval_result["completed"]:
754-
eval_result["matched_command"] = matched_command
756+
eval_result["matched_command"] = agent_prediction
755757

756758
return {
757759
"current_step_goal_reached": goal_check["goal_reached"],
758-
"_step_eval_result": eval_result if eval_result["completed"] else None,
760+
"_step_eval_result": eval_result, # Always pass eval_result (not just on match)
759761
"_is_fine_grained": False,
760762
"_accumulated_commands": None
761763
}
@@ -778,16 +780,23 @@ def _build_step_result(self, step_data: Any, eval_result: Optional[Dict[str, Any
778780
# Determine field names based on task mode
779781
if task_mode == "goal":
780782
original_field = "original_goal"
781-
matched_field = "matched_goal"
783+
matched_field = "matched_prediction"
784+
unmatched_field = "unmatched_prediction"
782785
source_field = "goal"
783786
elif task_mode == "anticipated_result":
784787
original_field = "original_anticipated_result"
785-
matched_field = "matched_anticipated_result"
788+
matched_field = "matched_prediction"
789+
unmatched_field = "unmatched_prediction"
786790
source_field = "results" # Use "results" field from steps_enriched.json
787791
else: # command
788792
original_field = "original_command"
789-
matched_field = "matched_command"
793+
matched_field = "matched_prediction"
794+
unmatched_field = "unmatched_prediction"
790795
source_field = "command"
796+
797+
# Extract agent's prediction if available
798+
agent_prediction = eval_result.get("agent_prediction") if eval_result else None
799+
791800
if "or" in step_data:
792801
# Step with alternatives
793802
or_results = []
@@ -804,7 +813,12 @@ def _build_step_result(self, step_data: Any, eval_result: Optional[Dict[str, Any
804813
# Mark as completed if this alternative matched
805814
if eval_result and eval_result["matched_alternative_index"] == i:
806815
sub_result["completed"] = True
807-
sub_result[matched_field] = eval_result["matched_command"]
816+
if agent_prediction:
817+
sub_result[matched_field] = agent_prediction
818+
else:
819+
# Add unmatched prediction for non-matching alternatives
820+
if agent_prediction:
821+
sub_result[unmatched_field] = agent_prediction
808822
sub_results.append(sub_result)
809823
or_results.append(sub_results)
810824
else:
@@ -817,18 +831,28 @@ def _build_step_result(self, step_data: Any, eval_result: Optional[Dict[str, Any
817831
# Mark as completed if this alternative matched
818832
if eval_result and eval_result["matched_alternative_index"] == i:
819833
alt_result["completed"] = True
820-
alt_result[matched_field] = eval_result["matched_command"]
834+
if agent_prediction:
835+
alt_result[matched_field] = agent_prediction
836+
else:
837+
# Add unmatched prediction for non-matching alternatives
838+
if agent_prediction:
839+
alt_result[unmatched_field] = agent_prediction
821840
or_results.append(alt_result)
822841
return {"or": or_results}
823842
else:
824843
# Single step without alternatives
844+
is_completed = eval_result and eval_result.get("completed", False)
825845
result = {
826-
"completed": eval_result is not None,
846+
"completed": is_completed,
827847
original_field: step_data.get(source_field, ""),
828848
"gold": step_data.get("gold", False)
829849
}
830-
if eval_result:
831-
result[matched_field] = eval_result["matched_command"]
850+
# Add prediction with appropriate field name
851+
if agent_prediction:
852+
if is_completed:
853+
result[matched_field] = agent_prediction
854+
else:
855+
result[unmatched_field] = agent_prediction
832856
return result
833857

834858
# Conditional edge functions

src/white_agent.py

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import uvicorn
88
import tomllib
99
import os
10+
import logging
1011
from typing import Dict, Any
1112
from pathlib import Path
1213
from dotenv import load_dotenv
@@ -20,6 +21,13 @@
2021

2122
load_dotenv()
2223

24+
# Configure logging
25+
logging.basicConfig(
26+
level=logging.INFO,
27+
format='%(message)s'
28+
)
29+
logger = logging.getLogger(__name__)
30+
2331

2432
class WhiteAgentExecutor:
2533
"""Executor for white agent (CTF solver) requests."""
@@ -130,9 +138,23 @@ async def handle_message(request: Request) -> JSONResponse:
130138
# Get executor from app state
131139
executor = request.app.state.executor
132140

141+
# Log incoming task
142+
logger.info("=" * 70)
143+
logger.info("WHITE AGENT: Prompted with task:")
144+
logger.info("-" * 70)
145+
logger.info(text_content)
146+
logger.info("-" * 70)
147+
133148
# Get prediction from agent
134149
prediction = await executor.execute(text_content, context_id)
135150

151+
# Log response
152+
logger.info("WHITE AGENT: Returning answer:")
153+
logger.info("-" * 70)
154+
logger.info(prediction)
155+
logger.info("=" * 70)
156+
logger.info("")
157+
136158
# Format A2A response
137159
response = {
138160
"result": {
@@ -237,7 +259,12 @@ def start_white_agent(
237259
api_key=api_key,
238260
base_url=base_url
239261
)
240-
uvicorn.run(app, host=host, port=port)
262+
263+
# Reduce uvicorn access log verbosity
264+
uvicorn_log_config = uvicorn.config.LOGGING_CONFIG
265+
uvicorn_log_config["loggers"]["uvicorn.access"]["level"] = "WARNING"
266+
267+
uvicorn.run(app, host=host, port=port, log_config=uvicorn_log_config)
241268

242269

243270
if __name__ == "__main__":

white_agent/agent.py

Lines changed: 52 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,23 @@
11
import os
2+
import logging
23
from dotenv import load_dotenv
34
from litellm import acompletion
45

56
from a2a.server.tasks import TaskUpdater
67
from a2a.types import Message, Part, TaskState, TextPart
78
from a2a.utils import get_message_text, new_agent_text_message
89

10+
from mock_agent import MockAgent
11+
912
load_dotenv()
1013

14+
# Configure logging
15+
logging.basicConfig(
16+
level=logging.INFO,
17+
format='%(message)s'
18+
)
19+
logger = logging.getLogger(__name__)
20+
1121

1222
SYSTEM_PROMPTS = {
1323
"command": """You are a skilled penetration tester working on a capture-the-flag (CTF) challenge.
@@ -65,15 +75,25 @@ def __init__(
6575
model: str = "gpt-4o",
6676
temperature: float = 0.7,
6777
max_tokens: int = 500,
68-
task_mode: str = "command"
78+
task_mode: str = "command",
79+
mock_mode: bool = False
6980
):
7081
self.model = model
7182
self.temperature = temperature
7283
self.max_tokens = max_tokens
7384
self.task_mode = task_mode
85+
self.mock_mode = mock_mode
7486
self.api_key = os.getenv("OPENAI_API_KEY")
7587
self.base_url = os.getenv("OPENAI_BASE_URL")
7688
self.system_prompt = SYSTEM_PROMPTS.get(task_mode, SYSTEM_PROMPTS["command"])
89+
90+
# Initialize mock agent if in mock mode
91+
if self.mock_mode:
92+
self.mock_agent = MockAgent(task_mode=task_mode)
93+
logger.info(f"Initialized white agent in MOCK mode (task_mode={task_mode})")
94+
else:
95+
self.mock_agent = None
96+
logger.info(f"Initialized white agent in LLM mode (model={model}, task_mode={task_mode})")
7797

7898
async def run(self, message: Message, updater: TaskUpdater) -> None:
7999
"""Process incoming message and generate CTF command prediction.
@@ -83,24 +103,42 @@ async def run(self, message: Message, updater: TaskUpdater) -> None:
83103
updater: Task updater for sending progress and results
84104
"""
85105
context = get_message_text(message)
106+
107+
# Log incoming task
108+
logger.info("=" * 70)
109+
logger.info("WHITE AGENT: Prompted with task:")
110+
logger.info("-" * 70)
111+
logger.info(context)
112+
logger.info("-" * 70)
86113

87114
await updater.update_status(TaskState.working, new_agent_text_message("Analyzing scenario..."))
88115

89-
# Use async LiteLLM to generate prediction
90116
try:
91-
response = await acompletion(
92-
model=self.model,
93-
messages=[
94-
{"role": "system", "content": self.system_prompt},
95-
{"role": "user", "content": context}
96-
],
97-
temperature=self.temperature,
98-
max_tokens=self.max_tokens,
99-
api_key=self.api_key,
100-
base_url=self.base_url
101-
)
117+
if self.mock_mode:
118+
# Use mock agent for deterministic replay
119+
prediction = await self.mock_agent.predict(context)
120+
else:
121+
# Use async LiteLLM to generate prediction
122+
response = await acompletion(
123+
model=self.model,
124+
messages=[
125+
{"role": "system", "content": self.system_prompt},
126+
{"role": "user", "content": context}
127+
],
128+
temperature=self.temperature,
129+
max_tokens=self.max_tokens,
130+
api_key=self.api_key,
131+
base_url=self.base_url
132+
)
133+
134+
prediction = response.choices[0].message.content.strip()
102135

103-
prediction = response.choices[0].message.content.strip()
136+
# Log response
137+
logger.info("WHITE AGENT: Returning answer:")
138+
logger.info("-" * 70)
139+
logger.info(prediction)
140+
logger.info("=" * 70)
141+
logger.info("")
104142

105143
await updater.add_artifact(
106144
parts=[Part(root=TextPart(text=prediction))],

white_agent/docker-entrypoint.sh

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ TEMPERATURE="${WHITE_AGENT_TEMPERATURE:-0.7}"
1818
MAX_TOKENS="${WHITE_AGENT_MAX_TOKENS:-500}"
1919
TASK_MODE="${WHITE_AGENT_TASK_MODE:-command}"
2020
CARD_URL="${WHITE_AGENT_CARD_URL:-}"
21+
MOCK_MODE="${WHITE_AGENT_MOCK_MODE:-false}"
2122

2223
# Parse command-line arguments
2324
while [[ $# -gt 0 ]]; do
@@ -50,6 +51,10 @@ while [[ $# -gt 0 ]]; do
5051
CARD_URL="$2"
5152
shift 2
5253
;;
54+
--mock-mode)
55+
MOCK_MODE="true"
56+
shift
57+
;;
5358
*)
5459
# Unknown argument, pass through
5560
CMD="$CMD $1"
@@ -71,6 +76,11 @@ if [ -n "$CARD_URL" ]; then
7176
CMD="$CMD --card-url $CARD_URL"
7277
fi
7378

79+
# Add mock-mode flag if enabled
80+
if [ "$MOCK_MODE" = "true" ]; then
81+
CMD="$CMD --mock-mode"
82+
fi
83+
7484
echo "Starting white agent with: $CMD"
7585
exec $CMD
7686

white_agent/executor.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,11 +25,19 @@
2525

2626

2727
class Executor(AgentExecutor):
28-
def __init__(self, model: str = "gpt-4o", temperature: float = 0.7, max_tokens: int = 500, task_mode: str = "command"):
28+
def __init__(
29+
self,
30+
model: str = "gpt-4o",
31+
temperature: float = 0.7,
32+
max_tokens: int = 500,
33+
task_mode: str = "command",
34+
mock_mode: bool = False
35+
):
2936
self.model = model
3037
self.temperature = temperature
3138
self.max_tokens = max_tokens
3239
self.task_mode = task_mode
40+
self.mock_mode = mock_mode
3341
self.agents: dict[str, Agent] = {}
3442

3543
async def execute(self, context: RequestContext, event_queue: EventQueue) -> None:
@@ -60,7 +68,8 @@ async def execute(self, context: RequestContext, event_queue: EventQueue) -> Non
6068
model=self.model,
6169
temperature=self.temperature,
6270
max_tokens=self.max_tokens,
63-
task_mode=self.task_mode
71+
task_mode=self.task_mode,
72+
mock_mode=self.mock_mode
6473
)
6574
self.agents[context_id] = agent
6675

0 commit comments

Comments
 (0)