Skip to content

Commit 299e1d3

Browse files
authored
merge to main from xueyulinn/review
pr to main
2 parents 80890b9 + b636723 commit 299e1d3

34 files changed

Lines changed: 2024 additions & 320 deletions

codehawk/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,3 @@
11
"""Codehawk application package."""
2+
3+
__version__ = "0.1.0"

codehawk/agents/code_review_planning_agent.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,14 +28,14 @@ class CodeReviewPlanning(BaseModel):
2828
questions: list[CodeReviewQuestion]
2929

3030

31-
agent = Agent(
31+
_agent = Agent(
3232
name="Code Review Planning Agent",
3333
instructions=PR_CODE_REVIEW_PLANNING_SYSTEM_PROMPT,
3434
output_type=CodeReviewPlanning,
3535
)
3636

3737

38-
def construct_user_prompt(
38+
def _construct_user_prompt(
3939
sat_summary: str,
4040
pr_files_context: PullRequestFilesContext,
4141
pr_info_context: PullRequestInfoContext,
@@ -119,12 +119,12 @@ async def generate_code_review_planning(
119119
pr_info_context: PullRequestInfoContext,
120120
pr_commits_context: PullRequestCommitsContext,
121121
) -> CodeReviewPlanning:
122-
prompt = construct_user_prompt(
122+
prompt = _construct_user_prompt(
123123
sat_summary,
124124
pr_files_context,
125125
pr_info_context,
126126
pr_commits_context,
127127
)
128128
with trace("CODE_REVIEW_PLANNING"):
129-
result = await Runner.run(agent, input=prompt)
129+
result = await Runner.run(_agent, input=prompt)
130130
return result.final_output_as(CodeReviewPlanning)

codehawk/agents/code_review_writer_agent.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -23,14 +23,14 @@ class PRReviewOutput(BaseModel):
2323
description="The inline review comments to create on changed lines.")
2424

2525

26-
agent = Agent(
26+
_agent = Agent(
2727
name="Code Review Writer Agent",
2828
instructions=PR_CODE_REVIEW_WRITER_SYSTEM_PROMPT,
2929
output_type=PRReviewOutput
3030
)
3131

3232

33-
def construct_user_prompt(
33+
def _construct_user_prompt(
3434
sat_summary: str,
3535
pr_files_context: PullRequestFilesContext,
3636
structual_questions: Any,
@@ -64,12 +64,12 @@ def construct_user_prompt(
6464
),
6565
},
6666
"planned_questions": {
67-
"structural": to_jsonable(structual_questions),
68-
"convention": to_jsonable(convention_questions),
67+
"structural": _to_jsonable(structual_questions),
68+
"convention": _to_jsonable(convention_questions),
6969
},
7070
"evidence": {
71-
"structural": to_jsonable(structual_evidence),
72-
"convention": to_jsonable(conventional_evidence),
71+
"structural": _to_jsonable(structual_evidence),
72+
"convention": _to_jsonable(conventional_evidence),
7373
},
7474
}
7575
context_json = json.dumps(prompt_context, ensure_ascii=False, indent=2)
@@ -90,7 +90,7 @@ async def generate_code_review_comments(
9090
conventional_evidence,
9191
pr_ctx: PRReviewContext,
9292
) -> PRReviewOutput:
93-
prompt = construct_user_prompt(
93+
prompt = _construct_user_prompt(
9494
sat_summary,
9595
pr_files_context,
9696
structual_questions,
@@ -100,17 +100,17 @@ async def generate_code_review_comments(
100100
pr_ctx,
101101
)
102102
with trace("CODE_REVIEW_COMMENTS"):
103-
result = await Runner.run(agent, input=prompt, context=pr_ctx)
103+
result = await Runner.run(_agent, input=prompt, context=pr_ctx)
104104
return result.final_output_as(PRReviewOutput)
105105

106106

107-
def to_jsonable(value: Any) -> Any:
107+
def _to_jsonable(value: Any) -> Any:
108108
if hasattr(value, "model_dump"):
109109
return value.model_dump(mode="json")
110110
if isinstance(value, list):
111-
return [to_jsonable(item) for item in value]
111+
return [_to_jsonable(item) for item in value]
112112
if isinstance(value, tuple):
113-
return [to_jsonable(item) for item in value]
113+
return [_to_jsonable(item) for item in value]
114114
if isinstance(value, dict):
115-
return {key: to_jsonable(item) for key, item in value.items()}
115+
return {key: _to_jsonable(item) for key, item in value.items()}
116116
return value

codehawk/agents/convention_evidence_agent.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -58,15 +58,15 @@ async def find_similar_code_chunks(
5858
return json.dumps({"similar_code_chunks": results}, ensure_ascii=False)
5959

6060

61-
agent = Agent(
61+
_agent = Agent(
6262
name="Conventional Evidence Agent",
6363
instructions=PR_CONVENTION_EVIDENCE_SYSTEM_PROMPT,
6464
tools=[find_similar_code_chunks],
6565
output_type=CodeReviewEvidence,
6666
)
6767

6868

69-
def construct_user_prompt(
69+
def _construct_user_prompt(
7070
questions: list[CodeReviewQuestion],
7171
ctx: PRReviewContext,
7272
) -> str:
@@ -109,7 +109,7 @@ async def generate_conventional_evidence(
109109
questions: list[CodeReviewQuestion],
110110
ctx: PRReviewContext,
111111
) -> CodeReviewEvidence:
112-
prompt = construct_user_prompt(questions, ctx)
112+
prompt = _construct_user_prompt(questions, ctx)
113113
with trace("CONVENTIONAL_EVIDENCE"):
114-
result = await Runner.run(agent, input=prompt, context=ctx)
114+
result = await Runner.run(_agent, input=prompt, context=ctx)
115115
return result.final_output_as(CodeReviewEvidence)

codehawk/agents/pr_summary_agent.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,15 +10,15 @@
1010
PullRequestInfoContext,
1111
)
1212

13-
pr_summary_agent = Agent(
13+
_pr_summary_agent = Agent(
1414
name="Github Pull Request Summary Agent",
1515
instructions=PR_SUMMARY_SYSTEM_PROMPT,
1616
model="gpt-5.5",
1717
model_settings=ModelSettings(verbosity="low")
1818
)
1919

2020

21-
def construct_pr_full_prompt(
21+
def _construct_pr_full_prompt(
2222
pr_info_context: PullRequestInfoContext,
2323
pr_commits_context: PullRequestCommitsContext,
2424
pr_files_context: PullRequestFilesContext,
@@ -120,7 +120,7 @@ def construct_pr_full_prompt(
120120
)
121121

122122

123-
def construct_pr_incremental_prompt(
123+
def _construct_pr_incremental_prompt(
124124
pr_incremental_context: PullRequestIncrementalCompareContext
125125
) -> str:
126126
prompt_context = {
@@ -171,23 +171,23 @@ async def generate_pr_full_summary(
171171
pr_files_context: PullRequestFilesContext,
172172
reason: str | None = None,
173173
) -> str:
174-
pr_full_summary_prompt = construct_pr_full_prompt(
174+
pr_full_summary_prompt = _construct_pr_full_prompt(
175175
pr_info_context,
176176
pr_commits_context,
177177
pr_files_context,
178178
reason
179179
)
180180
with trace("PR Full Summary Workflow"):
181-
pr_summary = await Runner.run(pr_summary_agent, pr_full_summary_prompt)
181+
pr_summary = await Runner.run(_pr_summary_agent, pr_full_summary_prompt)
182182
return pr_summary.final_output_as(str)
183183

184184

185185
async def generate_pr_incremental_summary(
186186
pr_incremental_context: PullRequestIncrementalCompareContext
187187
) -> str:
188-
pr_incremental_summary_prompt = construct_pr_incremental_prompt(
188+
pr_incremental_summary_prompt = _construct_pr_incremental_prompt(
189189
pr_incremental_context,
190190
)
191191
with trace("PR Incremental Summary Workflow"):
192-
pr_summary = await Runner.run(pr_summary_agent, pr_incremental_summary_prompt)
192+
pr_summary = await Runner.run(_pr_summary_agent, pr_incremental_summary_prompt)
193193
return pr_summary.final_output_as(str)

codehawk/agents/prompts.py

Lines changed: 64 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -71,59 +71,6 @@
7171
"""
7272

7373

74-
PR_CODE_REVIEW_SYSTEM_PROMPT = """# Role
75-
You are a senior software engineer performing pull request code review.
76-
Your job is to identify concrete, actionable issues that a human reviewer should consider before merging.
77-
78-
# Goal
79-
Produce a structured pull request review that helps the author and reviewers understand whether the change is safe to merge.
80-
Prioritize correctness, regressions, security issues, data loss, concurrency problems, API contract breaks, edge cases, and missing tests.
81-
82-
# Success criteria
83-
Before finishing, the review must:
84-
- Ground every finding in the provided PR context, changed file patches, or tool results.
85-
- Prefer a small number of high-signal findings over exhaustive commentary.
86-
- Include inline comments only for actionable issues tied to specific changed lines.
87-
- Use the most recent PR head commit as the review commit when available.
88-
- Choose `REQUEST_CHANGES` only when at least one issue should block merge.
89-
- Choose `COMMENT` for all other outcomes, including non-blocking concerns, incomplete evidence, or no actionable issues.
90-
- Never approve the pull request.
91-
92-
# Evidence rules
93-
- Treat changed file patches as the primary evidence.
94-
- Treat `static_analysis.summary_markdown` as supporting evidence only. It can suggest areas to inspect, but it is not a substitute for changed file patches or fetched source context.
95-
- Do not create inline comments based only on static analysis summary text. Verify the issue against the changed patch or `get_full_file` first.
96-
- Use `get_full_file` when the patch omits context needed to verify a likely issue.
97-
- Use `find_code_chunks` to inspect indexed functions, classes, modules, signatures, and content previews for the current PR head commit.
98-
- Use `find_code_references` when call sites, imports, inheritance, decorators, or attribute access can confirm whether a changed symbol has affected callers or dependencies.
99-
- Use `find_usage_patterns` when a finding depends on how a symbol is normally called or consumed across the repository.
100-
- Use `find_exception_patterns` when a finding depends on repository exception handling conventions.
101-
- Use `find_naming_patterns` when a finding depends on similar function, method, or class naming patterns.
102-
- Do not present repository convention findings as confirmed unless a pattern tool returns concrete supporting examples.
103-
- Do not call tools just to browse casually; call them when extra context can change the review outcome.
104-
- Do not invent code behavior, tests, dependencies, or runtime guarantees that are not present in the provided context or tool output.
105-
- Distinguish confirmed issues from uncertainty. If evidence is incomplete, explain the uncertainty in the review body rather than presenting it as fact.
106-
107-
# Review guidance
108-
- Focus on issues introduced or exposed by this PR.
109-
- Do not comment on unchanged code unless it is necessary to explain a changed-line issue.
110-
- Avoid style, naming, formatting, or preference comments unless they create a real maintainability or correctness risk.
111-
- Avoid duplicate comments for the same underlying issue.
112-
- Keep comment bodies concise and specific: state the problem, why it matters, and what change would address it.
113-
- Do not mention hidden reasoning or internal process.
114-
115-
# Output
116-
Return the structured review object required by the application.
117-
Use the structured output schema for fields and types; do not include extra fields.
118-
119-
# Stop rules
120-
- If the context and tool results are sufficient, produce the review directly.
121-
- If no actionable issues are found, return a neutral `COMMENT` review with an empty comments list.
122-
- If a suspected issue cannot be verified from the available evidence, do not create an inline comment for it; mention the uncertainty in the review body if it is important.
123-
- Do not ask follow-up questions.
124-
"""
125-
126-
12774
PR_CODE_REVIEW_PLANNING_SYSTEM_PROMPT = """# Role
12875
You are a senior software engineer planning evidence gathering for pull request code review.
12976
Your job is to turn the provided PR context, changed file patches, and static analysis summary into a compact list of targeted review questions for downstream evidence agents.
@@ -403,3 +350,67 @@
403350
- If a detail is missing, state the limitation briefly and continue.
404351
- If the input is malformed or empty, say that no usable static analysis report was provided.
405352
"""
353+
354+
355+
SAT_SUMMARY_JSON_SYSTEM_PROMPT = """# Role
356+
You are a senior software engineer converting static analysis tool results into structured reviewer-facing findings.
357+
Your job is to normalize SAT tool output into a compact `Report` object for downstream code review workflows.
358+
359+
# Input
360+
You will receive one or more static analysis tool results from `ToolResultDTO` data.
361+
Each tool result may include:
362+
- `status`: `passed`, `findings`, or `failed`.
363+
- `tool`: the analyzer name, such as `ruff` or `bandit`.
364+
- `command`: the command that ran.
365+
- `repo_root`: the analyzed repository root.
366+
- `exitCode`: the analyzer exit code.
367+
- `findings`: raw tool findings.
368+
- `stderr`: analyzer stderr.
369+
370+
# Goal
371+
Return only concrete static-analysis findings as structured `Item` objects.
372+
The output is used by later review agents, so every field must be grounded in the provided SAT context.
373+
374+
# Output schema
375+
Return a `Report` object with:
376+
- `items`: a list of `Item` objects.
377+
378+
Each `Item` must contain:
379+
- `tool`: the tool that produced the finding.
380+
- `rule_id`: the tool rule, code, test ID, or check name when available; otherwise `null`.
381+
- `file_path`: the reported file path. Use an empty string only when the tool did not provide a file.
382+
- `description`: a concise reviewer-facing description of the finding.
383+
- `start_line`, `end_line`, `start_col`, `end_col`: reported location values when available; otherwise `null`.
384+
- `raw_severity`: the original severity value from the tool when available; otherwise `null`.
385+
- `normalized_severity`: one of `Low`, `Medium`, `High`, or `Unknown`.
386+
- `severity_reason`: a short explanation for the normalized severity.
387+
388+
# Severity rules
389+
- Preserve the original tool severity in `raw_severity` exactly when present. Do not rewrite it.
390+
- Use `normalized_severity` for CodeHawk's reviewer-facing severity.
391+
- If the tool provides a clear severity scale, map it conservatively into `Low`, `Medium`, or `High` using the tool's own meaning. Preserve the original value in `raw_severity`.
392+
- If different tools use different words for comparable severity, normalize by meaning rather than spelling. For example, values that mean fatal, blocker, critical, serious, or high-impact should generally map higher than warning, minor, info, or style.
393+
- Use `High` for findings whose message indicates the code may fail to parse, fail to import, fail to run, expose a security-sensitive behavior, or break a required runtime contract.
394+
- Use `Medium` for findings whose message indicates a reliability, maintainability, error-handling, data-handling, or security-review concern, but does not clearly show a build blocker, runtime failure, or high-confidence security issue.
395+
- Use `Low` for style, formatting, import ordering, unused imports, unused variables, naming, and other cleanup findings unless the message itself indicates a concrete correctness, reliability, or security risk.
396+
- Use `Unknown` when the input does not provide enough evidence to map severity responsibly.
397+
- The `severity_reason` must briefly explain the mapping using the tool's raw severity, rule ID, message, or finding category. Do not rely on a hardcoded rule ID alone.
398+
399+
# Evidence rules
400+
- Use only the provided SAT tool results.
401+
- Do not invent source code behavior, exploitability, test coverage, business impact, or merge impact.
402+
- Do not create items for tools that passed with no findings.
403+
- Do not create code finding items for tools that failed to run. Failed tools have no verified findings.
404+
- Preserve tool names, rule IDs, file paths, messages, line numbers, columns, and raw severity exactly when available.
405+
- If a finding is missing optional location or severity fields, set those output fields to `null` instead of guessing.
406+
- If the report is empty, malformed, or all tools passed with no findings, return `items: []`.
407+
408+
# Field extraction guidance
409+
- Ruff commonly uses `code`, `filename`, `message`, `location.row`, `location.column`, `end_location.row`, and `end_location.column`.
410+
- Bandit commonly uses `test_id`, `filename`, `issue_text`, `line_number`, `line_range`, `col_offset`, `end_col_offset`, `issue_severity`, and `issue_confidence`.
411+
- For unknown tools, use the closest obvious fields for rule ID, file, message, location, and severity, but do not fabricate missing values.
412+
413+
# Stop rules
414+
- Return only the structured `Report` output required by the application.
415+
- Do not include Markdown, prose outside the schema, raw JSON dumps, or follow-up questions.
416+
"""

codehawk/agents/sat_summary_agent.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,13 @@
55
from codehawk.agents.prompts import SAT_SUMMARY_SYSTEM_PROMPT
66
from codehawk.models.sat import SATReportDTO, ToolResultDTO
77

8-
sat_summary_agent = Agent(
8+
_agent = Agent(
99
name="Static Check Agent",
1010
instructions=SAT_SUMMARY_SYSTEM_PROMPT,
1111
)
12-
13-
14-
def construct_sat_summary_prompt(report: SATReportDTO | ToolResultDTO) -> str:
12+
13+
14+
def _construct_user_prompt(report: SATReportDTO | ToolResultDTO) -> str:
1515
if isinstance(report, SATReportDTO):
1616
prompt_context = {
1717
"report_type": "sat_report",
@@ -39,7 +39,7 @@ def construct_sat_summary_prompt(report: SATReportDTO | ToolResultDTO) -> str:
3939
async def run_sat_summary(report: SATReportDTO | ToolResultDTO) -> str:
4040
with trace("SAT Summary Workflow"):
4141
res = await Runner.run(
42-
sat_summary_agent,
43-
input=construct_sat_summary_prompt(report),
42+
_agent,
43+
input=_construct_user_prompt(report),
4444
)
4545
return res.final_output_as(str)

0 commit comments

Comments
 (0)