Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion LlamaTornadoCli.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
package org.beehive.gpullama3.cli;

import org.beehive.gpullama3.Options;
import org.beehive.gpullama3.auxiliary.LastRunMetrics;
import org.beehive.gpullama3.inference.sampler.Sampler;
import org.beehive.gpullama3.model.Model;

Expand Down
2 changes: 0 additions & 2 deletions llama-tornado
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,6 @@ class LlamaRunner:

if args.cuda_graphs:
cmd.append("-Dllama.cudaGraphs=true")
elif args.no_cuda_graphs:
cmd.append("-Dllama.cudaGraphs=false")

# Debug options
debug_config = []
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/org/beehive/gpullama3/LlamaApp.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package org.beehive.gpullama3;

import org.beehive.gpullama3.auxiliary.LastRunMetrics;
import org.beehive.gpullama3.auxiliary.RunMetrics;
import org.beehive.gpullama3.inference.sampler.Sampler;
import org.beehive.gpullama3.model.Model;

Expand All @@ -18,7 +18,7 @@ private static void runSingleInstruction(Model model, Sampler sampler, Options o
String response = model.runInstructOnce(sampler, options);
System.out.println(response);
if (SHOW_PERF_INTERACTIVE) {
LastRunMetrics.printMetrics();
RunMetrics.printMetrics();
}
}

Expand Down
33 changes: 0 additions & 33 deletions src/main/java/org/beehive/gpullama3/auxiliary/LastRunMetrics.java

This file was deleted.

157 changes: 157 additions & 0 deletions src/main/java/org/beehive/gpullama3/auxiliary/RunMetrics.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
package org.beehive.gpullama3.auxiliary;

import org.beehive.gpullama3.auxiliary.metrics.GitHubMetricsRenderer;
import org.beehive.gpullama3.auxiliary.metrics.HumanMetricsRenderer;
import org.beehive.gpullama3.auxiliary.metrics.JsonMetricsRenderer;
import org.beehive.gpullama3.auxiliary.metrics.MetricsRenderer;
import org.beehive.gpullama3.auxiliary.metrics.RunMetricsSnapshot;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.file.Files;
import java.nio.file.Path;

/**
 * Singleton that accumulates fine-grained performance metrics across one inference run.
 *
 * <p>Metrics are set incrementally by different layers of the stack:</p>
 * <ul>
 * <li>{@link #setLoadDuration} — called from {@code ModelLoader}</li>
 * <li>{@link #setTornadoMetrics} — called from TornadoVM plan constructors</li>
 * <li>{@link #setInferenceMetrics} — called from InferenceEngine variants at end of generation</li>
 * <li>{@link #setHasPrefillPhase} — called from prefill-decode engine variants</li>
 * </ul>
 *
 * <p>All durations are stored in nanoseconds. {@link #printMetrics()} builds an immutable
 * {@link RunMetricsSnapshot}, selects a {@link MetricsRenderer}, and writes to the configured sink.</p>
 *
 * <p>Configurable via system properties:</p>
 * <ul>
 * <li>{@code llama.metrics.format} — {@code human} (default) | {@code json} | {@code github}</li>
 * <li>{@code llama.metrics.output} — {@code stderr} (default) | {@code stdout} | {@code file}</li>
 * <li>{@code llama.metrics.file} — target path when {@code output=file}</li>
 * </ul>
 *
 * <p>Not thread-safe: fields are plain and unsynchronized, so setters and
 * {@link #snapshot()} assume a single inference thread.</p>
 */
public final class RunMetrics {

    // ── Core metrics (nanoseconds) ────────────────────────────────────────────
    private long totalDurationNs;          // wall-clock time of the full inference call
    private long loadDurationNs;           // model-file load time (excludes TornadoVM init)
    private int promptEvalCount;           // prompt tokens processed (prefill)
    private long promptEvalDurationNs;     // wall-clock time of the prefill phase
    private int evalCount;                 // tokens generated (decode)
    private long evalDurationNs;           // wall-clock time of the decode phase
    private boolean hasPrefillPhase;       // true when prefill and decode were timed separately

    // ── TornadoVM-specific metrics (nanoseconds) ──────────────────────────────
    private long tornadoPlanCreationNs;    // task-graph construction
    private long tornadoJitNs;             // JIT compilation
    private long readOnlyWeightsCopyInNs;  // first-execution weight upload

    // ── Singleton ─────────────────────────────────────────────────────────────
    private static final RunMetrics INSTANCE = new RunMetrics();

    private RunMetrics() {}

    // ── Setters ───────────────────────────────────────────────────────────────

    /** Records the time spent loading the model file (not including TornadoVM initialisation). */
    public static void setLoadDuration(long ns) {
        INSTANCE.loadDurationNs = ns;
    }

    /**
     * Records TornadoVM-specific initialisation durations.
     *
     * @param planCreationNs task-graph construction ({@code createExecutionPlan()})
     * @param jitNs JIT compilation ({@code withPreCompilation()})
     * @param weightCopyNs first-execution weight upload ({@code forceCopyInReadOnlyData()})
     */
    public static void setTornadoMetrics(long planCreationNs, long jitNs, long weightCopyNs) {
        INSTANCE.tornadoPlanCreationNs = planCreationNs;
        INSTANCE.tornadoJitNs = jitNs;
        INSTANCE.readOnlyWeightsCopyInNs = weightCopyNs;
    }

    /**
     * Records inference-phase durations at the end of a generation run.
     *
     * @param promptCount number of prompt tokens processed (prefill)
     * @param prefillNs wall-clock time spent in the prefill phase
     * @param generatedCount number of tokens generated (decode)
     * @param decodeNs wall-clock time spent in the decode phase
     * @param totalNs total wall-clock time for the full inference call
     */
    public static void setInferenceMetrics(int promptCount, long prefillNs,
                                           int generatedCount, long decodeNs,
                                           long totalNs) {
        INSTANCE.promptEvalCount = promptCount;
        INSTANCE.promptEvalDurationNs = prefillNs;
        INSTANCE.evalCount = generatedCount;
        INSTANCE.evalDurationNs = decodeNs;
        INSTANCE.totalDurationNs = totalNs;
    }

    /**
     * Signals that prefill and decode are distinct timed phases.
     * Called by {@code InferenceEngineWithPrefillDecode} and
     * {@code InferenceEngineWithBatchPrefillDecode} before returning.
     */
    public static void setHasPrefillPhase(boolean value) {
        INSTANCE.hasPrefillPhase = value;
    }

    // ── Snapshot ──────────────────────────────────────────────────────────────

    /** Returns an immutable snapshot of all currently collected metrics. */
    public static RunMetricsSnapshot snapshot() {
        RunMetrics m = INSTANCE;
        return RunMetricsSnapshot.of(
                m.totalDurationNs, m.loadDurationNs,
                m.promptEvalCount, m.promptEvalDurationNs,
                m.evalCount, m.evalDurationNs,
                m.hasPrefillPhase,
                m.tornadoPlanCreationNs, m.tornadoJitNs,
                m.readOnlyWeightsCopyInNs);
    }

    // ── Output ────────────────────────────────────────────────────────────────

    /**
     * Builds a snapshot, selects a renderer based on {@code llama.metrics.format},
     * and writes the result to the sink configured by {@code llama.metrics.output}.
     * Unrecognised format/output values silently fall back to the defaults
     * ({@code human} renderer, {@code stderr} sink).
     *
     * @throws IllegalStateException if {@code output=file} but {@code llama.metrics.file} is unset
     * @throws java.io.UncheckedIOException if writing the metrics file fails
     */
    public static void printMetrics() {
        RunMetricsSnapshot snap = snapshot();

        // Locale.ROOT: toLowerCase() with the default locale is not stable — e.g. in a
        // Turkish locale "GITHUB".toLowerCase() yields "gıthub" (dotless i) and the
        // switch would silently fall through to the default renderer/sink.
        MetricsRenderer renderer = switch (System.getProperty("llama.metrics.format", "human").toLowerCase(java.util.Locale.ROOT)) {
            case "json" -> new JsonMetricsRenderer();
            case "github" -> new GitHubMetricsRenderer();
            default -> new HumanMetricsRenderer();
        };

        String rendered = renderer.render(snap);

        switch (System.getProperty("llama.metrics.output", "stderr").toLowerCase(java.util.Locale.ROOT)) {
            case "stdout" -> System.out.print(rendered);
            case "file" -> writeToFile(rendered);
            default -> System.err.print(rendered);
        }
    }

    /**
     * Writes the rendered metrics to the path given by {@code llama.metrics.file},
     * creating parent directories as needed. The file is overwritten if it exists.
     *
     * @throws IllegalStateException if {@code llama.metrics.file} is unset or blank
     * @throws UncheckedIOException if the directory creation or file write fails
     */
    private static void writeToFile(String content) {
        String filePath = System.getProperty("llama.metrics.file");
        if (filePath == null || filePath.isBlank()) {
            throw new IllegalStateException(
                    "llama.metrics.output=file requires llama.metrics.file to be set");
        }
        Path path = Path.of(filePath);
        try {
            Path parent = path.getParent();
            if (parent != null) Files.createDirectories(parent);
            Files.writeString(path, content);
        } catch (IOException e) {
            throw new UncheckedIOException("Failed to write metrics to " + filePath, e);
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
package org.beehive.gpullama3.auxiliary.metrics;

/**
 * Renders metrics as a Markdown table suitable for appending to {@code $GITHUB_STEP_SUMMARY}.
 * TornadoVM rows (compile, JIT, weight copy-in) are included only when plan-creation duration
 * is non-zero, i.e. on GPU runs.
 *
 * <p>Enable via system properties and append the output file to the step summary:</p>
 * <pre>
 * -Dllama.metrics.format=github
 * -Dllama.metrics.output=file
 * -Dllama.metrics.file=/tmp/metrics.md
 * </pre>
 *
 * <p>In a GitHub Actions workflow step:</p>
 * <pre>
 * cat /tmp/metrics.md >> $GITHUB_STEP_SUMMARY
 * </pre>
 */
public final class GitHubMetricsRenderer implements MetricsRenderer {

    /**
     * Renders the snapshot as a two-column Markdown table (metric | value).
     * Durations are reported in milliseconds, rates in tokens per second.
     */
    @Override
    public String render(RunMetricsSnapshot s) {
        StringBuilder sb = new StringBuilder();
        sb.append("| metric | value |\n");
        sb.append("|---|---:|\n");
        // Locale.ROOT keeps the decimal separator a '.' in every locale (e.g. de_DE
        // would otherwise emit "3,14"), and '\n' instead of '%n' keeps line endings
        // consistent with the hard-coded header rows above — '%n' expands to "\r\n"
        // on Windows, producing a mixed-separator file.
        sb.append(String.format(java.util.Locale.ROOT, "| eval tok/s | %.2f |\n", s.evalRate()));
        sb.append(String.format(java.util.Locale.ROOT, "| prompt eval tok/s | %.2f |\n", s.promptEvalRate()));
        sb.append(String.format(java.util.Locale.ROOT, "| total tok/s | %.2f |\n", s.totalRate()));
        sb.append(String.format(java.util.Locale.ROOT, "| load ms | %.2f |\n", s.loadDuration() / 1_000_000.0));
        sb.append(String.format(java.util.Locale.ROOT, "| eval tokens | %d |\n", s.evalCount()));
        sb.append(String.format(java.util.Locale.ROOT, "| prompt tokens | %d |\n", s.promptEvalCount()));
        sb.append(String.format(java.util.Locale.ROOT, "| total tokens | %d |\n", s.totalCount()));
        // A non-zero plan-creation duration indicates a GPU (TornadoVM) run.
        if (s.tornadoPlanCreationDuration() > 0) {
            sb.append(String.format(java.util.Locale.ROOT, "| compile ms | %.2f |\n",
                    s.tornadoPlanCreationDuration() / 1_000_000.0));
            sb.append(String.format(java.util.Locale.ROOT, "| jit ms | %.2f |\n",
                    s.tornadoJitDuration() / 1_000_000.0));
            sb.append(String.format(java.util.Locale.ROOT, "| weight copy-in ms | %.2f |\n",
                    s.tornadoReadOnlyWeightsCopyInDuration() / 1_000_000.0));
        }
        return sb.toString();
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
package org.beehive.gpullama3.auxiliary.metrics;

/**
 * Renders metrics in human-readable format to {@code stderr}.
 *
 * <p>This is the default renderer — no configuration needed. To enable explicitly:</p>
 * <pre>
 * -Dllama.metrics.format=human (default, can be omitted)
 * -Dllama.metrics.output=stderr (default, can be omitted)
 * </pre>
 *
 * <p>To also print TornadoVM initialisation timings (plan creation, JIT, weight copy-in),
 * additionally set:</p>
 * <pre>
 * -Dllama.EnableTimingForTornadoVMInit=true
 * </pre>
 */
public final class HumanMetricsRenderer implements MetricsRenderer {

    /**
     * Formats the snapshot as a short plain-text report. When the run had a
     * distinct prefill phase, total/prefill/decode rates are printed on separate
     * lines; otherwise a single combined rate line is emitted.
     */
    @Override
    public String render(RunMetricsSnapshot s) {
        StringBuilder out = new StringBuilder("\n==== Performance Metrics ====\n");

        if (s.hasPrefillPhase()) {
            out.append(String.format("Total achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n",
                    s.totalRate(), s.totalCount(), s.totalDuration() / 1e9));
            // NOTE(review): the leading '¬' below is reproduced verbatim from the
            // original output — it looks like a mis-encoded character (perhaps an
            // intended indent); confirm before changing.
            out.append(String.format("¬Prefill achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n",
                    s.promptEvalRate(), s.promptEvalCount(), s.promptEvalDuration() / 1e9));
            out.append(String.format("¬Decode achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n",
                    s.evalRate(), s.evalCount(), s.evalDuration() / 1e9));
        } else {
            out.append(String.format("achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n",
                    s.totalRate(), s.totalCount(), s.totalDuration() / 1e9));
        }

        boolean timingRequested =
                Boolean.parseBoolean(System.getProperty("llama.EnableTimingForTornadoVMInit", "false"));
        // TornadoVM timings are shown only when explicitly requested AND the run
        // actually recorded a plan-creation duration (i.e. a GPU run).
        if (timingRequested && s.tornadoPlanCreationDuration() > 0) {
            out.append(String.format("GGUF Model Load: %.2f ms%n",
                    s.loadDuration() / 1_000_000.0));
            out.append(String.format("Compilation & CodeGen: %.2f ms%n",
                    s.tornadoPlanCreationDuration() / 1_000_000.0));
            out.append(String.format("Warmup: %.2f ms%n",
                    s.tornadoJitDuration() / 1_000_000.0));
            out.append(String.format("Read-only weights Copy-in: %.2f ms%n",
                    s.tornadoReadOnlyWeightsCopyInDuration() / 1_000_000.0));
        }

        return out.toString();
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
package org.beehive.gpullama3.auxiliary.metrics;

import java.util.ArrayList;
import java.util.List;

/**
 * Renders metrics as an Ollama-compatible JSON object.
 * All duration fields are in nanoseconds; rate fields are in tokens per second.
 * The {@code tornadovm} nested object is always included (fields are zero on CPU runs).
 *
 * <p>Enable via system properties:</p>
 * <pre>
 * -Dllama.metrics.format=json
 * -Dllama.metrics.output=stdout   # pipe to jq or another tool
 * </pre>
 *
 * <p>Or write to a file:</p>
 * <pre>
 * -Dllama.metrics.format=json
 * -Dllama.metrics.output=file
 * -Dllama.metrics.file=/path/to/metrics.json
 * </pre>
 */
public final class JsonMetricsRenderer implements MetricsRenderer {

    /** Serialises the snapshot into a pretty-printed JSON object (fixed key set, no escaping needed). */
    @Override
    public String render(RunMetricsSnapshot s) {
        List<String> fields = new ArrayList<>();
        fields.add(field("  ", "total_duration", s.totalDuration()));
        fields.add(field("  ", "load_duration", s.loadDuration()));
        fields.add(field("  ", "prompt_eval_count", s.promptEvalCount()));
        fields.add(field("  ", "prompt_eval_duration", s.promptEvalDuration()));
        fields.add(field("  ", "eval_count", s.evalCount()));
        fields.add(field("  ", "eval_duration", s.evalDuration()));
        fields.add(field("  ", "total_count", s.totalCount()));
        fields.add(field("  ", "prompt_eval_rate", s.promptEvalRate()));
        fields.add(field("  ", "eval_rate", s.evalRate()));
        fields.add(field("  ", "total_rate", s.totalRate()));
        fields.add(field("  ", "has_prefill_phase", s.hasPrefillPhase()));
        fields.add(tornadoObject(s));
        return "{\n" + String.join(",\n", fields) + "\n}";
    }

    /** Builds the nested {@code "tornadovm"} object with GPU initialisation timings. */
    private static String tornadoObject(RunMetricsSnapshot s) {
        List<String> inner = new ArrayList<>();
        inner.add(field("    ", "plan_creation_duration", s.tornadoPlanCreationDuration()));
        inner.add(field("    ", "jit_duration", s.tornadoJitDuration()));
        inner.add(field("    ", "read_only_weights_copy_in_duration", s.tornadoReadOnlyWeightsCopyInDuration()));
        return "  \"tornadovm\": {\n" + String.join(",\n", inner) + "\n  }";
    }

    private static String field(String indent, String key, long value) {
        return indent + "\"" + key + "\": " + value;
    }

    private static String field(String indent, String key, int value) {
        return indent + "\"" + key + "\": " + value;
    }

    private static String field(String indent, String key, double value) {
        // Locale.ROOT: the default locale may format "%.4f" with a ',' decimal
        // separator (e.g. de_DE → "3,1400"), which is invalid JSON. Non-finite
        // values (a zero-duration run can yield NaN/Infinity rates) would render
        // as bare "NaN"/"Infinity" tokens, also invalid JSON — emit 0 instead.
        double safe = Double.isFinite(value) ? value : 0.0;
        return indent + "\"" + key + "\": " + String.format(java.util.Locale.ROOT, "%.4f", safe);
    }

    private static String field(String indent, String key, boolean value) {
        return indent + "\"" + key + "\": " + value;
    }
}
Loading
Loading