Skip to content

Commit a65b341

Browse files
Haiyue Zhang and cursoragent
authored and committed
feat: release v0.5.2 - Industrial-grade benchmark & OWASP Agentic Top 10 full coverage
## Core Features
- Complete OWASP Agentic Top 10 (ASI-01~10) coverage with 45+ detection rules
- Agent-Vuln-Bench v1.0: 12 KNOWN CVEs + 6 WILD patterns + 2 NOISE projects
- Ground Truth v2.2: 81 samples, 218 vulnerability annotations
- Precision 98.51%, Recall 100%, F1-Score 99.25%

## New Detection Rules (v0.5.x)
- AGENT-043: Daemon privilege escalation
- AGENT-044: Sudoers NOPASSWD configuration
- AGENT-045: Browser automation without sandbox
- AGENT-046: System credential store access
- AGENT-047: Subprocess execution without sandbox
- AGENT-048: Extension permission boundaries

## v0.5.2 Micro-Patch
- AGENT-043 tightened daemon detection
- AGENT-046 credential store deduplication
- AGENT-047 extended safe command list
- Risk Score v2 formula with natural log scaling

## Benchmark Infrastructure
- precision_recall.py: Per-ASI recall metrics
- quality_gates_v2.yaml: Layer 1/2 thresholds
- Agent-Vuln-Bench harness with SWE-bench style evaluation
- Multi-tool comparison (vs Bandit, Semgrep)

## Tests
- 716 tests passing
- Full ASI category coverage validation

Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent a03d5ba commit a65b341

250 files changed

Lines changed: 43594 additions & 241 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.
File renamed without changes.

.github/workflows/benchmark.yml

Lines changed: 266 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,266 @@
1+
# Workflow identity and triggers for the benchmark jobs defined under `jobs`.
name: Benchmark Quality Gate

on:
  # Pushes to main/master, but only when one of the listed paths changed.
  push:
    branches:
      - main
      - master
    paths:
      - "packages/audit/**"
      - "rules/**"
      - "tests/**"
  # Pull requests targeting main/master (no path filter is applied here).
  pull_request:
    branches:
      - main
      - master
  # Manual runs.
  workflow_dispatch:

jobs:
15+
# Runs the precision/recall evaluation, uploads its JSON output as the
# `layer1-results` artifact, and prints warning/notice annotations based on
# the metrics found in that file.
precision-recall:
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v4

    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: '3.11'

    - name: Install dependencies
      run: |
        pip install -e packages/audit/
        pip install pyyaml

    # continue-on-error keeps the job going so the later steps still run
    # when the evaluation script exits non-zero.
    - name: Run Precision/Recall Evaluation
      run: |
        mkdir -p results
        python tests/benchmark/precision_recall.py \
          --output-json results/layer1.json \
          --verbose
      continue-on-error: true

    - name: Upload Layer 1 Results
      uses: actions/upload-artifact@v4
      with:
        name: layer1-results
        path: results/layer1.json

    # Advisory gate: threshold misses are reported as ::warning:: annotations
    # and the script never exits non-zero on its own.
    - name: Layer 1 Quality Gate Check
      run: |
        python3 << 'EOF'
        import json
        import sys

        p = 'results/layer1.json'
        try:
            with open(p) as f:
                r = json.load(f)
        except FileNotFoundError:
            print('::warning::Layer 1 results not found (run may have failed)')
            sys.exit(0)

        # Read every metric exactly once with a default of 0. The warning
        # messages below reuse these locals, so a results file that lacks a
        # key produces a warning instead of a KeyError.
        precision = r.get('precision', 0)
        recall = r.get('recall', 0)
        f1_score = r.get('f1_score', 0)

        print(f'Precision: {precision:.2%}')
        print(f'Recall: {recall:.2%}')
        print(f'F1: {f1_score:.2%}')
        print(f'TP: {r.get("true_positives", 0)} | FP: {r.get("false_positives", 0)} | FN: {r.get("false_negatives", 0)}')

        warnings = []
        if precision < 0.90:
            warnings.append(f'Precision {precision:.2%} below 90% target')
        if recall < 0.85:
            warnings.append(f'Recall {recall:.2%} below 85% target')
        if f1_score < 0.87:
            warnings.append(f'F1 {f1_score:.2%} below 0.87 target')

        if warnings:
            for w in warnings:
                print(f'::warning::{w}')
        else:
            print('::notice::Layer 1 quality gate PASSED')
        EOF
73+
74+
# Generates the ATLAS coverage report and prints an annotation depending on
# how many entries the `mappings` key of rules/mappings/mitre_atlas.yaml has.
atlas-coverage:
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v4

    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: '3.11'

    - name: Install dependencies
      run: pip install pyyaml

    - name: Generate ATLAS Coverage Report
      run: python tests/benchmark/atlas_report.py

    - name: Check ATLAS Mapping Count
      run: |
        python3 << 'EOF'
        import yaml

        with open('rules/mappings/mitre_atlas.yaml') as f:
            # safe_load returns None for an empty document; treat that as an
            # empty mapping so the count check below reports 0 instead of
            # raising AttributeError.
            data = yaml.safe_load(f) or {}

        # A bare `mappings:` key loads as None, so fall back to {} there too.
        mappings = data.get('mappings') or {}
        print(f'Total rules mapped: {len(mappings)}')

        if len(mappings) < 20:
            print(f'::warning::Only {len(mappings)} rules mapped, target is 20+')
        else:
            print(f'::notice::ATLAS coverage good ({len(mappings)} rules)')
        EOF
106+
107+
# Counts Python fixture files and labeled ground-truth samples, and prints a
# warning annotation when either count is below its target.
fixture-count:
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v4

    # The ground-truth step imports PyYAML. Install it explicitly, as the
    # other jobs in this workflow do, rather than relying on whatever the
    # runner image's system Python happens to ship.
    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: '3.11'

    - name: Install dependencies
      run: pip install pyyaml

    - name: Count Fixtures
      run: |
        FIXTURE_COUNT=$(find tests/fixtures -name "*.py" | wc -l)
        echo "Python fixtures: $FIXTURE_COUNT"

        if [ "$FIXTURE_COUNT" -lt 50 ]; then
          echo "::warning::Only $FIXTURE_COUNT fixtures, target is 50+"
        else
          echo "::notice::Fixture count good ($FIXTURE_COUNT)"
        fi

    - name: Count Ground Truth Samples
      run: |
        python3 << 'EOF'
        import yaml

        with open('tests/ground_truth/labeled_samples.yaml') as f:
            # safe_load returns None for an empty document; use {} so the
            # lookup below cannot raise AttributeError.
            data = yaml.safe_load(f) or {}

        # A bare `samples:` key loads as None, so fall back to [] there too.
        samples = data.get('samples') or []
        print(f'Ground truth samples: {len(samples)}')

        if len(samples) < 30:
            print(f'::warning::Only {len(samples)} labeled samples, target is 30+')
        else:
            print(f'::notice::Ground truth coverage good ({len(samples)} samples)')
        EOF
139+
140+
# Runs the Agent-Vuln-Bench harness, optionally comparing against a baseline
# artifact from an earlier run, and uploads the results. On main/master it
# also regenerates and uploads the baseline.
agent-vuln-bench:
  name: Agent-Vuln-Bench
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v4

    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: '3.11'

    - name: Install dependencies
      run: pip install -e packages/audit/ pyyaml

    # Best-effort fetch of the `avb-baseline` artifact from a successful run
    # on `main`.
    # NOTE(review): the baseline is only looked up on `main`, although the
    # workflow also triggers on `master` - confirm this is intended.
    - name: Download baseline (if exists)
      uses: dawidd6/action-download-artifact@v3
      with:
        name: avb-baseline
        path: results/
        if_no_artifact_found: ignore
        workflow_conclusion: success
        branch: main
        # Releases of this action before v6 also search runs from forks by
        # default; restrict the lookup to this repository so a fork cannot
        # supply the baseline.
        allow_forks: false
      continue-on-error: true

    - name: Run Agent-Vuln-Bench
      run: |
        mkdir -p results
        # Optional arguments are collected in an array so they expand safely
        # (to nothing when no baseline file is present).
        BASELINE_ARGS=()
        if [ -f "results/baseline.json" ]; then
          echo "Using baseline from previous run"
          BASELINE_ARGS=(--baseline results/baseline.json)
        fi
        python tests/benchmark/agent-vuln-bench/harness/run_eval.py \
          --tool agent-audit \
          --dataset all \
          --output results/avb_results.json \
          --report results/avb_report.md \
          "${BASELINE_ARGS[@]}"
      continue-on-error: true

    - name: Generate baseline (main branch only)
      if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/master'
      run: |
        python tests/benchmark/scripts/save_avb_baseline.py \
          --eval-results results/avb_results.json \
          --output results/baseline.json
      continue-on-error: true

    - name: Upload AVB Results
      uses: actions/upload-artifact@v4
      with:
        name: avb-results
        path: results/

    - name: Upload baseline artifact (main branch only)
      if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/master'
      uses: actions/upload-artifact@v4
      with:
        name: avb-baseline
        path: results/baseline.json
      continue-on-error: true
201+
202+
# Collects the artifacts of the two upstream jobs into results/ and runs the
# v2 quality gate script against them.
quality-gate:
  name: Quality Gate Check (v2)
  runs-on: ubuntu-latest
  needs:
    - precision-recall
    - agent-vuln-bench
  steps:
    - uses: actions/checkout@v4

    # No artifact name is given, so every artifact of this run is fetched;
    # merge-multiple places their files together in results/.
    - name: Download artifacts
      uses: actions/download-artifact@v4
      with:
        path: results/
        merge-multiple: true

    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: "3.11"

    - name: Install dependencies
      run: pip install pyyaml

    # continue-on-error means a non-zero exit from the gate script does not
    # fail the job.
    - name: Run Quality Gate (v2)
      continue-on-error: true
      run: >-
        python tests/benchmark/quality_gate_check.py
        --config tests/benchmark/quality_gates_v2.yaml
        --results results/
229+
230+
# Optional comparison of agent-audit against other tools; uploads the
# comparison report and result files.
compare-tools:
  name: Multi-Tool Comparison (Optional)
  # NOTE(review): the `schedule` branch of this condition has no matching
  # trigger in the `on:` block of this workflow as shown - confirm whether a
  # schedule trigger is planned.
  if: github.event_name == 'workflow_dispatch' || github.event_name == 'schedule'
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v4

    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: "3.11"

    - name: Install dependencies
      run: |
        pip install -e packages/audit/ pyyaml
        # Install optional tools for comparison (continue on error)
        pip install bandit semgrep || true

    - name: Run Multi-Tool Evaluation
      continue-on-error: true
      run: |
        mkdir -p results
        python tests/benchmark/agent-vuln-bench/harness/run_eval.py --tool all --dataset all \
          --output results/ --comparison-report results/comparison_report.md

    # always() uploads whatever was produced even if an earlier step failed.
    - name: Upload Comparison Results
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: comparison-results
        path: |
          results/comparison_report.md
          results/comparison_results.json
          results/results.json

0 commit comments

Comments
 (0)