-
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbenchmark_quality_styx_vs_raw.json
More file actions
executable file
·37 lines (37 loc) · 1.27 KB
/
benchmark_quality_styx_vs_raw.json
File metadata and controls
executable file
·37 lines (37 loc) · 1.27 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
{
"benchmark": "STYX vs Raw Context Quality",
"version": "1.0",
"date": "2026-02-07",
"description": "Blind A/B quality comparison between STYX compressed context and full raw context",
"methodology": {
"type": "Blind A/B with position randomization",
"samples": 100,
"models_per_sample": 5,
"total_judgments": 500,
"judge_protocol": "Each model sees two responses to the same question - one generated from STYX context, one from full context. Position (A/B) is randomized per judgment. Neither the judge model nor the evaluator knows which is which.",
"hardware": "NVIDIA A100 GPU"
},
"models": [
"mistral:7b",
"llama3.2",
"deepseek-coder-v2:16b",
"phi3:mini",
"qwen2.5-coder:7b"
],
"results": {
"styx_wins": 239,
"raw_wins": 238,
"ties": 23,
"styx_win_rate": 0.478,
"raw_win_rate": 0.476,
"tie_rate": 0.046
},
"verdict": "Statistical parity - STYX compressed context produces answers of equivalent quality to those from full raw context",
"compression_context": {
"average_reduction": "98.27%",
"documents_validated": 60900,
"max_compression_ratio": "57.7x"
},
"patent": "Pending #63/975,190",
"note": "This file contains aggregate metrics only. No sample texts, STYX outputs, or raw contexts are included."
}