Skip to content

Commit 0235dc7

Browse files
authored
Merge pull request #113 from beehive-lab/feat/performance-metrics
Improve collection of performance/throughput metrics
2 parents 41f0ee5 + e966ed1 commit 0235dc7

18 files changed

Lines changed: 462 additions & 152 deletions

LlamaTornadoCli.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
package org.beehive.gpullama3.cli;
2020

2121
import org.beehive.gpullama3.Options;
22-
import org.beehive.gpullama3.auxiliary.LastRunMetrics;
2322
import org.beehive.gpullama3.inference.sampler.Sampler;
2423
import org.beehive.gpullama3.model.Model;
2524

llama-tornado

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -96,8 +96,6 @@ class LlamaRunner:
9696

9797
if args.cuda_graphs:
9898
cmd.append("-Dllama.cudaGraphs=true")
99-
elif args.no_cuda_graphs:
100-
cmd.append("-Dllama.cudaGraphs=false")
10199

102100
# Debug options
103101
debug_config = []

src/main/java/org/beehive/gpullama3/LlamaApp.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
package org.beehive.gpullama3;
22

3-
import org.beehive.gpullama3.auxiliary.LastRunMetrics;
3+
import org.beehive.gpullama3.auxiliary.RunMetrics;
44
import org.beehive.gpullama3.inference.sampler.Sampler;
55
import org.beehive.gpullama3.model.Model;
66

@@ -18,7 +18,7 @@ private static void runSingleInstruction(Model model, Sampler sampler, Options o
1818
String response = model.runInstructOnce(sampler, options);
1919
System.out.println(response);
2020
if (SHOW_PERF_INTERACTIVE) {
21-
LastRunMetrics.printMetrics();
21+
RunMetrics.printMetrics();
2222
}
2323
}
2424

src/main/java/org/beehive/gpullama3/auxiliary/LastRunMetrics.java

Lines changed: 0 additions & 33 deletions
This file was deleted.
Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
package org.beehive.gpullama3.auxiliary;

import org.beehive.gpullama3.auxiliary.metrics.GitHubMetricsRenderer;
import org.beehive.gpullama3.auxiliary.metrics.HumanMetricsRenderer;
import org.beehive.gpullama3.auxiliary.metrics.JsonMetricsRenderer;
import org.beehive.gpullama3.auxiliary.metrics.MetricsRenderer;
import org.beehive.gpullama3.auxiliary.metrics.RunMetricsSnapshot;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Locale;

/**
 * Singleton that accumulates fine-grained performance metrics across one inference run.
 *
 * <p>Metrics are set incrementally by different layers of the stack:</p>
 * <ul>
 *   <li>{@link #setLoadDuration} — called from {@code ModelLoader}</li>
 *   <li>{@link #setTornadoMetrics} — called from TornadoVM plan constructors</li>
 *   <li>{@link #setInferenceMetrics} — called from InferenceEngine variants at end of generation</li>
 *   <li>{@link #setHasPrefillPhase} — called from prefill-decode engine variants</li>
 * </ul>
 *
 * <p>All durations are stored in nanoseconds. {@link #printMetrics()} builds an immutable
 * {@link RunMetricsSnapshot}, selects a {@link MetricsRenderer}, and writes to the configured sink.</p>
 *
 * <p>Configurable via system properties:</p>
 * <ul>
 *   <li>{@code llama.metrics.format} — {@code human} (default) | {@code json} | {@code github}</li>
 *   <li>{@code llama.metrics.output} — {@code stderr} (default) | {@code stdout} | {@code file}</li>
 *   <li>{@code llama.metrics.file} — target path when {@code output=file}</li>
 * </ul>
 *
 * <p>NOTE(review): the instance fields are plain (non-volatile) and unsynchronized, so this
 * class assumes metrics are collected and printed from a single thread — confirm against
 * the engine's threading model.</p>
 */
public final class RunMetrics {

    // ── Core metrics (nanoseconds) ────────────────────────────────────────────
    private long totalDurationNs;
    private long loadDurationNs;
    private int promptEvalCount;
    private long promptEvalDurationNs;
    private int evalCount;
    private long evalDurationNs;
    private boolean hasPrefillPhase;

    // ── TornadoVM-specific metrics (nanoseconds) ──────────────────────────────
    private long tornadoPlanCreationNs;
    private long tornadoJitNs;
    private long readOnlyWeightsCopyInNs;

    // ── Singleton ─────────────────────────────────────────────────────────────
    private static final RunMetrics INSTANCE = new RunMetrics();

    private RunMetrics() {}

    // ── Setters ───────────────────────────────────────────────────────────────

    /** Records the time spent loading the model file (not including TornadoVM initialisation). */
    public static void setLoadDuration(long ns) {
        INSTANCE.loadDurationNs = ns;
    }

    /**
     * Records TornadoVM-specific initialisation durations.
     *
     * @param planCreationNs task-graph construction ({@code createExecutionPlan()})
     * @param jitNs          JIT compilation ({@code withPreCompilation()})
     * @param weightCopyNs   first-execution weight upload ({@code forceCopyInReadOnlyData()})
     */
    public static void setTornadoMetrics(long planCreationNs, long jitNs, long weightCopyNs) {
        INSTANCE.tornadoPlanCreationNs = planCreationNs;
        INSTANCE.tornadoJitNs = jitNs;
        INSTANCE.readOnlyWeightsCopyInNs = weightCopyNs;
    }

    /**
     * Records inference-phase durations at the end of a generation run.
     *
     * @param promptCount    number of prompt tokens processed (prefill)
     * @param prefillNs      wall-clock time spent in the prefill phase
     * @param generatedCount number of tokens generated (decode)
     * @param decodeNs       wall-clock time spent in the decode phase
     * @param totalNs        total wall-clock time for the full inference call
     */
    public static void setInferenceMetrics(int promptCount, long prefillNs,
                                           int generatedCount, long decodeNs,
                                           long totalNs) {
        INSTANCE.promptEvalCount = promptCount;
        INSTANCE.promptEvalDurationNs = prefillNs;
        INSTANCE.evalCount = generatedCount;
        INSTANCE.evalDurationNs = decodeNs;
        INSTANCE.totalDurationNs = totalNs;
    }

    /**
     * Signals that prefill and decode are distinct timed phases.
     * Called by {@code InferenceEngineWithPrefillDecode} and
     * {@code InferenceEngineWithBatchPrefillDecode} before returning.
     */
    public static void setHasPrefillPhase(boolean value) {
        INSTANCE.hasPrefillPhase = value;
    }

    // ── Snapshot ──────────────────────────────────────────────────────────────

    /** Returns an immutable snapshot of all currently collected metrics. */
    public static RunMetricsSnapshot snapshot() {
        RunMetrics m = INSTANCE;
        return RunMetricsSnapshot.of(
                m.totalDurationNs, m.loadDurationNs,
                m.promptEvalCount, m.promptEvalDurationNs,
                m.evalCount, m.evalDurationNs,
                m.hasPrefillPhase,
                m.tornadoPlanCreationNs, m.tornadoJitNs,
                m.readOnlyWeightsCopyInNs);
    }

    // ── Output ────────────────────────────────────────────────────────────────

    /**
     * Builds a snapshot, selects a renderer based on {@code llama.metrics.format},
     * and writes the result to the sink configured by {@code llama.metrics.output}.
     */
    public static void printMetrics() {
        RunMetricsSnapshot snap = snapshot();

        // Locale.ROOT keeps the property match locale-independent: under the
        // Turkish default locale, "GITHUB".toLowerCase() yields "gıthub" and
        // would silently fall through to the default renderer.
        MetricsRenderer renderer = switch (System.getProperty("llama.metrics.format", "human").toLowerCase(Locale.ROOT)) {
            case "json" -> new JsonMetricsRenderer();
            case "github" -> new GitHubMetricsRenderer();
            default -> new HumanMetricsRenderer();
        };

        String rendered = renderer.render(snap);

        // Same locale-independence concern applies to the sink selector.
        switch (System.getProperty("llama.metrics.output", "stderr").toLowerCase(Locale.ROOT)) {
            case "stdout" -> System.out.print(rendered);
            case "file" -> writeToFile(rendered);
            default -> System.err.print(rendered);
        }
    }

    /**
     * Writes the rendered metrics to the path named by {@code llama.metrics.file},
     * creating parent directories as needed. Overwrites any existing file.
     *
     * @throws IllegalStateException if {@code llama.metrics.file} is unset or blank
     * @throws UncheckedIOException  if the write fails
     */
    private static void writeToFile(String content) {
        String filePath = System.getProperty("llama.metrics.file");
        if (filePath == null || filePath.isBlank()) {
            throw new IllegalStateException(
                    "llama.metrics.output=file requires llama.metrics.file to be set");
        }
        Path path = Path.of(filePath);
        try {
            Path parent = path.getParent();
            if (parent != null) Files.createDirectories(parent);
            Files.writeString(path, content);
        } catch (IOException e) {
            throw new UncheckedIOException("Failed to write metrics to " + filePath, e);
        }
    }
}
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
package org.beehive.gpullama3.auxiliary.metrics;

import java.util.Locale;

/**
 * Renders metrics as a Markdown table suitable for appending to {@code $GITHUB_STEP_SUMMARY}.
 * TornadoVM rows (compile, JIT, weight copy-in) are included only when plan-creation duration
 * is non-zero, i.e. on GPU runs.
 *
 * <p>Output is machine-consumed by CI, so formatting is pinned: {@code Locale.ROOT} forces a
 * dot decimal separator (a default locale such as de-DE would emit {@code 3,14}), and all rows
 * end with {@code \n}. The original mixed hard-coded {@code \n} header rows with
 * platform-dependent {@code %n} data rows, producing mixed line endings on Windows.</p>
 *
 * <p>Enable via system properties and append the output file to the step summary:</p>
 * <pre>
 *   -Dllama.metrics.format=github
 *   -Dllama.metrics.output=file
 *   -Dllama.metrics.file=/tmp/metrics.md
 * </pre>
 *
 * <p>In a GitHub Actions workflow step:</p>
 * <pre>
 *   cat /tmp/metrics.md &gt;&gt; $GITHUB_STEP_SUMMARY
 * </pre>
 */
public final class GitHubMetricsRenderer implements MetricsRenderer {

    @Override
    public String render(RunMetricsSnapshot s) {
        StringBuilder sb = new StringBuilder();
        sb.append("| metric | value |\n");
        sb.append("|---|---:|\n");
        sb.append(String.format(Locale.ROOT, "| eval tok/s | %.2f |\n", s.evalRate()));
        sb.append(String.format(Locale.ROOT, "| prompt eval tok/s | %.2f |\n", s.promptEvalRate()));
        sb.append(String.format(Locale.ROOT, "| total tok/s | %.2f |\n", s.totalRate()));
        sb.append(String.format(Locale.ROOT, "| load ms | %.2f |\n", s.loadDuration() / 1_000_000.0));
        sb.append(String.format(Locale.ROOT, "| eval tokens | %d |\n", s.evalCount()));
        sb.append(String.format(Locale.ROOT, "| prompt tokens | %d |\n", s.promptEvalCount()));
        sb.append(String.format(Locale.ROOT, "| total tokens | %d |\n", s.totalCount()));
        // Non-zero plan-creation time is used as the "this was a GPU run" signal.
        if (s.tornadoPlanCreationDuration() > 0) {
            sb.append(String.format(Locale.ROOT, "| compile ms | %.2f |\n",
                    s.tornadoPlanCreationDuration() / 1_000_000.0));
            sb.append(String.format(Locale.ROOT, "| jit ms | %.2f |\n",
                    s.tornadoJitDuration() / 1_000_000.0));
            sb.append(String.format(Locale.ROOT, "| weight copy-in ms | %.2f |\n",
                    s.tornadoReadOnlyWeightsCopyInDuration() / 1_000_000.0));
        }
        return sb.toString();
    }
}
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
package org.beehive.gpullama3.auxiliary.metrics;
2+
3+
/**
4+
* Renders metrics in human-readable format to {@code stderr}.
5+
*
6+
* <p>This is the default renderer — no configuration needed. To enable explicitly:</p>
7+
* <pre>
8+
* -Dllama.metrics.format=human (default, can be omitted)
9+
* -Dllama.metrics.output=stderr (default, can be omitted)
10+
* </pre>
11+
*
12+
* <p>To also print TornadoVM initialisation timings (plan creation, JIT, weight copy-in),
13+
* additionally set:</p>
14+
* <pre>
15+
* -Dllama.EnableTimingForTornadoVMInit=true
16+
* </pre>
17+
*/
18+
public final class HumanMetricsRenderer implements MetricsRenderer {
19+
20+
@Override
21+
public String render(RunMetricsSnapshot s) {
22+
StringBuilder sb = new StringBuilder();
23+
sb.append("\n==== Performance Metrics ====\n");
24+
25+
if (s.hasPrefillPhase()) {
26+
sb.append(String.format(
27+
"Total achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n" +
28+
"¬Prefill achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n" +
29+
"¬Decode achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n",
30+
s.totalRate(), s.totalCount(), s.totalDuration() / 1e9,
31+
s.promptEvalRate(), s.promptEvalCount(), s.promptEvalDuration() / 1e9,
32+
s.evalRate(), s.evalCount(), s.evalDuration() / 1e9));
33+
} else {
34+
sb.append(String.format("achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n",
35+
s.totalRate(), s.totalCount(), s.totalDuration() / 1e9));
36+
}
37+
38+
if (Boolean.parseBoolean(System.getProperty("llama.EnableTimingForTornadoVMInit", "false"))
39+
&& s.tornadoPlanCreationDuration() > 0) {
40+
sb.append(String.format(
41+
"GGUF Model Load: %.2f ms%n" +
42+
"Compilation & CodeGen: %.2f ms%n" +
43+
"Warmup: %.2f ms%n" +
44+
"Read-only weights Copy-in: %.2f ms%n",
45+
s.loadDuration() / 1_000_000.0,
46+
s.tornadoPlanCreationDuration() / 1_000_000.0,
47+
s.tornadoJitDuration() / 1_000_000.0,
48+
s.tornadoReadOnlyWeightsCopyInDuration() / 1_000_000.0));
49+
}
50+
51+
return sb.toString();
52+
}
53+
}
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
package org.beehive.gpullama3.auxiliary.metrics;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

/**
 * Renders metrics as an Ollama-compatible JSON object.
 * All duration fields are in nanoseconds; rate fields are in tokens per second.
 * The {@code tornadovm} nested object is always included (fields are zero on CPU runs).
 *
 * <p>Enable via system properties:</p>
 * <pre>
 *   -Dllama.metrics.format=json
 *   -Dllama.metrics.output=stdout   # pipe to jq or another tool
 * </pre>
 *
 * <p>Or write to a file:</p>
 * <pre>
 *   -Dllama.metrics.format=json
 *   -Dllama.metrics.output=file
 *   -Dllama.metrics.file=/path/to/metrics.json
 * </pre>
 */
public final class JsonMetricsRenderer implements MetricsRenderer {

    @Override
    public String render(RunMetricsSnapshot s) {
        List<String> fields = new ArrayList<>();
        fields.add(field("  ", "total_duration", s.totalDuration()));
        fields.add(field("  ", "load_duration", s.loadDuration()));
        fields.add(field("  ", "prompt_eval_count", s.promptEvalCount()));
        fields.add(field("  ", "prompt_eval_duration", s.promptEvalDuration()));
        fields.add(field("  ", "eval_count", s.evalCount()));
        fields.add(field("  ", "eval_duration", s.evalDuration()));
        fields.add(field("  ", "total_count", s.totalCount()));
        fields.add(field("  ", "prompt_eval_rate", s.promptEvalRate()));
        fields.add(field("  ", "eval_rate", s.evalRate()));
        fields.add(field("  ", "total_rate", s.totalRate()));
        fields.add(field("  ", "has_prefill_phase", s.hasPrefillPhase()));
        fields.add(tornadoObject(s));
        return "{\n" + String.join(",\n", fields) + "\n}";
    }

    /** Builds the nested {@code "tornadovm"} object holding GPU initialisation durations. */
    private static String tornadoObject(RunMetricsSnapshot s) {
        List<String> inner = new ArrayList<>();
        inner.add(field("    ", "plan_creation_duration", s.tornadoPlanCreationDuration()));
        inner.add(field("    ", "jit_duration", s.tornadoJitDuration()));
        inner.add(field("    ", "read_only_weights_copy_in_duration", s.tornadoReadOnlyWeightsCopyInDuration()));
        return "  \"tornadovm\": {\n" + String.join(",\n", inner) + "\n  }";
    }

    private static String field(String indent, String key, long value) {
        return indent + "\"" + key + "\": " + value;
    }

    private static String field(String indent, String key, int value) {
        return indent + "\"" + key + "\": " + value;
    }

    private static String field(String indent, String key, double value) {
        // Locale.ROOT is required for valid JSON: default-locale formatting emits
        // a comma decimal separator under locales such as de-DE ("0,1234").
        return indent + "\"" + key + "\": " + String.format(Locale.ROOT, "%.4f", value);
    }

    private static String field(String indent, String key, boolean value) {
        return indent + "\"" + key + "\": " + value;
    }
}

0 commit comments

Comments
 (0)