Skip to content

Commit 8585dd6

Browse files
[prf/dec][refactor] Move Q8_0 unsupported-weight checks out of the GPU prefill-decode and batched-prefill-decode inference engines into the corresponding TornadoVM master plans, rejecting unsupported weight types at execution-plan creation
1 parent d99a888 commit 8585dd6

4 files changed

Lines changed: 30 additions & 15 deletions

File tree

src/main/java/org/beehive/gpullama3/inference/InferenceEngineWithBatchPrefillDecode.java

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,6 @@
33
import org.beehive.gpullama3.auxiliary.LastRunMetrics;
44
import org.beehive.gpullama3.inference.sampler.Sampler;
55
import org.beehive.gpullama3.inference.state.State;
6-
import org.beehive.gpullama3.inference.weights.tornado.TornadoWeights;
7-
import org.beehive.gpullama3.tensor.GGMLType;
86
import org.beehive.gpullama3.model.Configuration;
97
import org.beehive.gpullama3.model.Model;
108
import org.beehive.gpullama3.tokenizer.Tokenizer;
@@ -156,11 +154,6 @@ public static List<Integer> generateTokensGPULlama(
156154
int maxTokens, Sampler sampler, boolean echo,
157155
IntConsumer onTokenGenerated, TornadoVMMasterPlan tornadoVMPlan) {
158156

159-
if (((TornadoWeights) model.weights()).getWeightType() == GGMLType.Q8_0) {
160-
throw new UnsupportedOperationException(
161-
"GPU batched prefill/decode path not yet implemented for Q8_0 weights");
162-
}
163-
164157
long startNanos = System.nanoTime();
165158

166159
final Configuration config = model.configuration();

src/main/java/org/beehive/gpullama3/inference/InferenceEngineWithPrefillDecode.java

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,6 @@
33
import org.beehive.gpullama3.auxiliary.LastRunMetrics;
44
import org.beehive.gpullama3.inference.sampler.Sampler;
55
import org.beehive.gpullama3.inference.state.State;
6-
import org.beehive.gpullama3.inference.weights.tornado.TornadoWeights;
7-
import org.beehive.gpullama3.tensor.GGMLType;
86
import org.beehive.gpullama3.model.Configuration;
97
import org.beehive.gpullama3.model.Model;
108
import org.beehive.gpullama3.tokenizer.Tokenizer;
@@ -129,11 +127,6 @@ public static List<Integer> generateTokensGPULlama(
129127
int maxTokens, Sampler sampler, boolean echo,
130128
IntConsumer onTokenGenerated, TornadoVMMasterPlan tornadoVMPlan) {
131129

132-
if (((TornadoWeights) model.weights()).getWeightType() == GGMLType.Q8_0) {
133-
throw new UnsupportedOperationException(
134-
"GPU prefill/decode path not yet implemented for Q8_0 weights");
135-
}
136-
137130
long startNanos = System.nanoTime();
138131

139132
final Configuration config = model.configuration();

src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanWithBatchPrefillDecode.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import org.beehive.gpullama3.inference.state.LlamaState;
44
import org.beehive.gpullama3.inference.state.State;
5+
import org.beehive.gpullama3.tensor.GGMLType;
56
import org.beehive.gpullama3.inference.weights.tornado.LlamaTornadoWeights;
67
import org.beehive.gpullama3.model.Model;
78
import org.beehive.gpullama3.model.llama.LlamaConfiguration;
@@ -142,9 +143,21 @@ private TaskGraph buildDecodeActivationGraph(KernelContext ctx, String lastBatch
142143

143144
/**
144145
* Creates the {@link TornadoExecutionPlan} for forward pass with *prefill in batches and separated decode*.
146+
*
147+
* TODO: support Q8_0 weights
148+
* To implement this, consult how {@link TornadoVMMasterPlanStandard} uses the {@link QuantizationPlannerFactory}
145149
*/
146150
@Override
147151
public TornadoExecutionPlan createExecutionPlan() {
152+
GGMLType weightType = model.weights().getWeightType();
153+
switch (weightType) {
154+
case F16 -> { /* supported — continue below */ }
155+
case Q8_0 -> throw new UnsupportedOperationException(
156+
"Batched prefill/decode GPU path not yet implemented for Q8_0 weights");
157+
default -> throw new UnsupportedOperationException(
158+
"Batched prefill/decode GPU path not supported for weight type: " + weightType);
159+
}
160+
148161
LlamaTornadoWeights weights = (LlamaTornadoWeights) model.weights();
149162
SchedulerType schedulerType = SchedulerDetectionService.determineSchedulerType(model);
150163

src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanWithPrefillDecode.java

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import org.beehive.gpullama3.inference.state.LlamaState;
44
import org.beehive.gpullama3.inference.state.State;
5+
import org.beehive.gpullama3.tensor.GGMLType;
56
import org.beehive.gpullama3.inference.weights.tornado.LlamaTornadoWeights;
67
import org.beehive.gpullama3.model.Model;
78
import org.beehive.gpullama3.model.llama.LlamaConfiguration;
@@ -121,9 +122,24 @@ private TaskGraph buildActivationGraph(KernelContext ctx) {
121122
}
122123

123124
// ── Plan construction ─────────────────────────────────────────────────────
124-
125+
/**
126+
* Creates the {@link TornadoExecutionPlan} for forward pass with *prefill/decode separation*.
127+
* Prefill is token-by-token but does not compute logits.
128+
*
129+
* TODO: support Q8_0 weights
130+
* To implement this, consult how {@link TornadoVMMasterPlanStandard} uses the {@link QuantizationPlannerFactory}
131+
*/
125132
@Override
126133
public TornadoExecutionPlan createExecutionPlan() {
134+
GGMLType weightType = model.weights().getWeightType();
135+
switch (weightType) {
136+
case F16 -> { /* supported — continue below */ }
137+
case Q8_0 -> throw new UnsupportedOperationException(
138+
"Prefill/decode GPU path not yet implemented for Q8_0 weights");
139+
default -> throw new UnsupportedOperationException(
140+
"Prefill/decode GPU path not supported for weight type: " + weightType);
141+
}
142+
127143
LlamaTornadoWeights weights = (LlamaTornadoWeights) model.weights();
128144
SchedulerType schedulerType = SchedulerDetectionService.determineSchedulerType(model);
129145

0 commit comments

Comments (0)