|
| 1 | +{ |
| 2 | + "reportSchema": "eliza.speculative-benchmark.v1", |
| 3 | + "generatedAt": "2026-05-20T18:06:45.856Z", |
| 4 | + "verifier": "plugins/plugin-local-inference/native/verify/dflash_drafter_runtime_smoke.mjs", |
| 5 | + "speculator": "dflash", |
| 6 | + "tier": "2b", |
| 7 | + "targetModel": "/Users/shawwalters/.eliza/local-inference/models/eliza-1-2b.bundle/text/eliza-1-2b-64k.gguf", |
| 8 | + "drafterModel": "evidence/speculative/rerun-20260520/true-dflash-2b-200step-q4_k_m.local-stamped.gguf", |
| 9 | + "specBinary": ".tmp/llama-mtp-build/bin/llama-cli", |
| 10 | + "benchTokens": 512, |
| 11 | + "available": true, |
| 12 | + "status": "pass", |
| 13 | + "failure": null, |
| 14 | + "backend": "unknown", |
| 15 | + "binary": { |
| 16 | + "path": ".tmp/llama-mtp-build/bin/llama-cli", |
| 17 | + "exists": true, |
| 18 | + "sha256": "ff9e4ed6bf303fe54095b5757fd66d04799ea28d3548db3f5dc5d6cae50d52f9" |
| 19 | + }, |
| 20 | + "drafted": 255, |
| 21 | + "accepted": 255, |
| 22 | + "acceptanceRate": 1, |
| 23 | + "speedup": 0.9285714285714287, |
| 24 | + "withDrafter": { |
| 25 | + "available": true, |
| 26 | + "binary": ".tmp/llama-mtp-build/bin/llama-cli", |
| 27 | + "withDrafter": true, |
| 28 | + "args": [ |
| 29 | + "-m", |
| 30 | + "/Users/shawwalters/.eliza/local-inference/models/eliza-1-2b.bundle/text/eliza-1-2b-64k.gguf", |
| 31 | + "-md", |
| 32 | + "evidence/speculative/rerun-20260520/true-dflash-2b-200step-q4_k_m.local-stamped.gguf", |
| 33 | + "-p", |
| 34 | + "Write a short paragraph about speculative decoding.", |
| 35 | + "-n", |
| 36 | + "512", |
| 37 | + "-c", |
| 38 | + "2048", |
| 39 | + "-ngl", |
| 40 | + "99", |
| 41 | + "-ngld", |
| 42 | + "99", |
| 43 | + "--single-turn", |
| 44 | + "--simple-io", |
| 45 | + "--no-display-prompt", |
| 46 | + "--log-disable", |
| 47 | + "--spec-draft-n-min", |
| 48 | + "1", |
| 49 | + "--spec-draft-n-max", |
| 50 | + "1", |
| 51 | + "--spec-draft-p-min", |
| 52 | + "0.1", |
| 53 | + "--spec-type", |
| 54 | + "dflash" |
| 55 | + ], |
| 56 | + "skippedCliFlags": [ |
| 57 | + { |
| 58 | + "flag": "--ctx-size-draft", |
| 59 | + "alternatives": [ |
| 60 | + "--spec-draft-ctx-size", |
| 61 | + "-cd" |
| 62 | + ], |
| 63 | + "values": [ |
| 64 | + "2048" |
| 65 | + ], |
| 66 | + "reason": "not advertised by binary --help" |
| 67 | + } |
| 68 | + ], |
| 69 | + "status": 0, |
| 70 | + "signal": null, |
| 71 | + "error": null, |
| 72 | + "wallMs": 206177, |
| 73 | + "contextTokens": 2048, |
| 74 | + "draftMin": 1, |
| 75 | + "draftMax": 1, |
| 76 | + "tokensRequested": 512, |
| 77 | + "drafted": 255, |
| 78 | + "accepted": 255, |
| 79 | + "acceptanceRate": 1, |
| 80 | + "tokensPerSecond": 2.6, |
| 81 | + "generation": { |
| 82 | + "encoded": null, |
| 83 | + "decoded": null, |
| 84 | + "tokensPerSecond": null |
| 85 | + }, |
| 86 | + "timings": { |
| 87 | + "targetPromptEval": null, |
| 88 | + "targetEval": null, |
| 89 | + "draftPromptEval": null, |
| 90 | + "draftEval": null |
| 91 | + }, |
| 92 | + "vocabIncompatibleWarning": false, |
| 93 | + "draftingActive": true, |
| 94 | + "dflashFailure": null, |
| 95 | + "tokenizerCompatible": true, |
| 96 | + "outputTail": " /read <file> add a text file\n /glob <pattern> add text files using globbing pattern\n\n\n> Write a short paragraph about speculative decoding.\n\n[Start thinking]\nThinking Process:\n\n1. **Analyze the Request:**\n * **Topic:** Speculative decoding.\n * **Format:** Short paragraph.\n * **Goal:** Explain the concept concisely.\n\n2. **Define Speculative Decoding:**\n * What is it? A technique used in AI/ML inference to speed up computation.\n * How does it work? Instead of running a complex decoding algorithm (e.g., search, decoding) from scratch for every token, it uses a fast \"guess\" (speculation) based on a simplified model or prior knowledge.\n * What happens next? If the guess is wrong, it's corrected (decoding correction).\n * Why is it good? Speedup, efficiency.\n\n3. **Drafting - Attempt 1 (Mental Outline):**\n Speculative decoding is a technique to speed up AI inference. It uses a fast model to guess the next token. If it's wrong, it fixes it. This saves time compared to pure decoding.\n\n4. **Refining - Attempt 2 (Adding detail):**\n Speculative decoding is a technique used to accelerate machine learning model inference by predicting the output before performing full decoding. It typically uses a lightweight, fast model to make an educated guess about the next token to generate a prediction. If the guess is incorrect, it corrects the model in a second pass, allowing for a speedup.\n\n5. **Polishing - Attempt 3 (Conciseness and flow):**\n Speculative decoding is a technique used to accelerate machine learning model inference by predicting the output before performing full decoding. Instead of running a complex decoding algorithm from scratch for every token, it uses a fast, lightweight model to make an educated guess or \"speculate\" on the next token. If that initial guess is incorrect, the system can quickly correct the error in a second pass, significantly reducing computation time and improving performance.\n\n6. **Review against constraints:**\n * Short paragraph? Yes.\n * About speculative decoding? Yes.\n\n7. **Final Polish:** Make it punchier.\n Speculative decoding is an inference optimization technique designed to accelerate machine learning models by predicting outputs before performing full decoding. Instead of running a computationally expensive decoding process from scratch for every token, it uses a lightweight, fast model to generate an educated guess based on prior knowledge. If the initial\n\n[ Prompt: 50.8 t/s | Generation: 2.6 t/s ]\n[ Draft: 255 accepted / 255 generated | Acceptance: 1.0000 ]\n\nExiting..." |
| 97 | + }, |
| 98 | + "withoutDrafter": { |
| 99 | + "available": true, |
| 100 | + "binary": ".tmp/llama-mtp-build/bin/llama-cli", |
| 101 | + "withDrafter": false, |
| 102 | + "args": [ |
| 103 | + "-m", |
| 104 | + "/Users/shawwalters/.eliza/local-inference/models/eliza-1-2b.bundle/text/eliza-1-2b-64k.gguf", |
| 105 | + "-md", |
| 106 | + "evidence/speculative/rerun-20260520/true-dflash-2b-200step-q4_k_m.local-stamped.gguf", |
| 107 | + "-p", |
| 108 | + "Write a short paragraph about speculative decoding.", |
| 109 | + "-n", |
| 110 | + "512", |
| 111 | + "-c", |
| 112 | + "2048", |
| 113 | + "-ngl", |
| 114 | + "99", |
| 115 | + "-ngld", |
| 116 | + "99", |
| 117 | + "--single-turn", |
| 118 | + "--simple-io", |
| 119 | + "--no-display-prompt", |
| 120 | + "--log-disable", |
| 121 | + "--spec-draft-n-min", |
| 122 | + "0", |
| 123 | + "--spec-draft-n-max", |
| 124 | + "0", |
| 125 | + "--spec-draft-p-min", |
| 126 | + "1", |
| 127 | + "--spec-type", |
| 128 | + "dflash" |
| 129 | + ], |
| 130 | + "skippedCliFlags": [ |
| 131 | + { |
| 132 | + "flag": "--ctx-size-draft", |
| 133 | + "alternatives": [ |
| 134 | + "--spec-draft-ctx-size", |
| 135 | + "-cd" |
| 136 | + ], |
| 137 | + "values": [ |
| 138 | + "2048" |
| 139 | + ], |
| 140 | + "reason": "not advertised by binary --help" |
| 141 | + } |
| 142 | + ], |
| 143 | + "status": 0, |
| 144 | + "signal": null, |
| 145 | + "error": null, |
| 146 | + "wallMs": 186843, |
| 147 | + "contextTokens": 2048, |
| 148 | + "draftMin": 0, |
| 149 | + "draftMax": 0, |
| 150 | + "tokensRequested": 512, |
| 151 | + "drafted": 18, |
| 152 | + "accepted": 18, |
| 153 | + "acceptanceRate": 1, |
| 154 | + "tokensPerSecond": 2.8, |
| 155 | + "generation": { |
| 156 | + "encoded": null, |
| 157 | + "decoded": null, |
| 158 | + "tokensPerSecond": null |
| 159 | + }, |
| 160 | + "timings": { |
| 161 | + "targetPromptEval": null, |
| 162 | + "targetEval": null, |
| 163 | + "draftPromptEval": null, |
| 164 | + "draftEval": null |
| 165 | + }, |
| 166 | + "vocabIncompatibleWarning": false, |
| 167 | + "draftingActive": true, |
| 168 | + "dflashFailure": null, |
| 169 | + "tokenizerCompatible": true, |
| 170 | + "outputTail": "\navailable commands:\n /exit or Ctrl+C stop or exit\n /regen regenerate the last response\n /clear clear the chat history\n /read <file> add a text file\n /glob <pattern> add text files using globbing pattern\n\n\n> Write a short paragraph about speculative decoding.\n\n[Start thinking]\nThinking Process:\n\n1. **Analyze the Request:**\n * **Topic:** Speculative decoding (a concept in machine learning/computing).\n * **Format:** Short paragraph.\n * **Goal:** Explain or describe the concept clearly and concisely.\n\n2. **Define Speculative Decoding:**\n * What is it? It's a technique in NLP/NLP model inference.\n * How does it work? Instead of running the full model (which is slow/expensive), it uses a faster, lighter model (speculative model) to generate a likely sequence of tokens. If the speculative model predicts the right sequence, it's a \"hit\" (success); otherwise, a \"miss\" (failure).\n * What happens on a hit? The original model uses that sequence to continue generating the rest of the text, potentially skipping the computation of the speculative model.\n * What happens on a miss? The original model re-runs the speculative model to re-generate the sequence or a fallback path.\n * Why use it? To speed up inference (latency).\n * Key idea: \"Speculative\" means betting on a correct path based on the speculative model.\n\n3. **Drafting - Attempt 1 (Mental):**\n Speculative decoding is a fast way to generate text. It uses a smaller model to guess the next tokens. If it guesses right, the original big model skips those tokens. This makes inference faster because it saves computation. It's like a cache hit.\n\n4. **Refining - Attempt 2 (Adding technical nuance):**\n Speculative decoding is an optimization technique for large language models (LLMs). Instead of processing every token one by one, it employs a lightweight, faster model to predict a likely sequence of tokens. If the prediction is correct, the heavy model skips processing those tokens, significantly reducing latency. This is known as a \"speculative\" step where a small model is used in place of the full model.\n\n5. **Polishing - Attempt 3 (Concise and smooth):**\n Speculative decoding is an optimization technique for language models that improves inference speed by predicting a likely sequence of tokens using a lightweight, faster model before the full model executes. If the prediction is correct, the original model skips processing those tokens, effectively reducing latency. This approach allows for real-time text generation by leveraging\n\n[ Prompt: 77.0 t/s | Generation: 2.8 t/s ]\n[ Draft: 18 accepted / 18 generated | Acceptance: 1.0000 ]\n\nExiting..." |
| 171 | + }, |
| 172 | + "summary": { |
| 173 | + "tokensPerSecondWithDrafter": 2.6, |
| 174 | + "tokensPerSecondBaseline": 2.8, |
| 175 | + "generationTokensPerSecondWithDrafter": null, |
| 176 | + "generationTokensPerSecondBaseline": null, |
| 177 | + "targetEvalTokensPerSecondWithDrafter": null, |
| 178 | + "draftEvalTokensPerSecondWithDrafter": null, |
| 179 | + "dflashDraftedTokens": 255, |
| 180 | + "dflashAcceptedTokens": 255, |
| 181 | + "dflashAcceptanceRate": 1, |
| 182 | + "dflashSpeedup": 0.9285714285714287, |
| 183 | + "dflashDraftingActive": true, |
| 184 | + "dflashFailure": null, |
| 185 | + "tokenizerCompatible": true, |
| 186 | + "speculator": "dflash", |
| 187 | + "tier": "2b", |
| 188 | + "backend": "unknown", |
| 189 | + "binarySha256": "ff9e4ed6bf303fe54095b5757fd66d04799ea28d3548db3f5dc5d6cae50d52f9", |
| 190 | + "drafted": 255, |
| 191 | + "accepted": 255, |
| 192 | + "acceptanceRate": 1, |
| 193 | + "speedup": 0.9285714285714287, |
| 194 | + "status": "pass", |
| 195 | + "failure": null |
| 196 | + }, |
| 197 | + "draftingActive": true, |
| 198 | + "dflashFailure": null |
| 199 | +} |
0 commit comments