Merge pull request #108 from AdamBien/main

orionpapadakis · web-flow · commit b94b20f9c2e0 · 2026-05-29T14:01:39.000+03:00
Add Q4_K/Q5_K/Q6_K GPU support via Q8_0 dequantization
diff --git a/llamaTornado b/llamaTornado
@@ -12,7 +12,7 @@ record Config(
     double temperature, double topP, long seed, int maxTokens,
     boolean stream, boolean echo, boolean interactive, boolean instruct,
     boolean useGpu, Backend backend, String gpuMemory,
-    String heapMin, String heapMax,
+    String heapMin, String heapMax, String directMemory,
     boolean debug, boolean profiler, String profilerDumpDir,
     boolean printBytecodes, boolean threads, boolean printKernel,
     boolean fullDump, boolean verboseInit,
@@ -37,6 +37,7 @@ Config parseArgs(String[] args) {
     String gpuMemory = "14GB";
     String heapMin = "20g";
     String heapMax = "20g";
+    String directMemory = null;
     boolean debug = false;
     boolean profiler = false;
     String profilerDumpDir = null;
@@ -71,6 +72,7 @@ Config parseArgs(String[] args) {
             case "--gpu-memory" -> gpuMemory = args[++i];
             case "--heap-min" -> heapMin = args[++i];
             case "--heap-max" -> heapMax = args[++i];
+            case "--direct-memory" -> directMemory = args[++i];
             case "--debug" -> debug = true;
             case "--profiler" -> profiler = true;
             case "--profiler-dump-dir" -> profilerDumpDir = args[++i];
@@ -101,12 +103,27 @@ Config parseArgs(String[] args) {
         profilerDumpDir = System.getenv("LLAMA_ROOT") + "/profiler-log.json";
     }
 
+    // Default direct memory to 3x heap to accommodate K-quant dequantization
+    if (directMemory == null) {
+        directMemory = parseAndScale(heapMax, 3);
+    }
+
     return new Config(modelPath, prompt, systemPrompt, temperature, topP, seed, maxTokens,
-            stream, echo, interactive, instruct, useGpu, backend, gpuMemory, heapMin, heapMax,
+            stream, echo, interactive, instruct, useGpu, backend, gpuMemory, heapMin, heapMax, directMemory,
             debug, profiler, profilerDumpDir, printBytecodes, threads, printKernel, fullDump,
             verboseInit, showCommand, executeAfterShow, openclFlags, maxWaitEvents, verbose);
 }
 
+/**
+ * Multiplies a memory size string such as {@code "20g"} by {@code multiplier}, keeping its unit suffix.
+ *
+ * @param memoryValue size written as decimal digits optionally followed by one of k, m or g (either case)
+ * @param multiplier  factor applied to the numeric part
+ * @return the scaled size followed by the original suffix, or {@code memoryValue} unchanged when it
+ *         is not in that form
+ */
+String parseAndScale(String memoryValue, int multiplier) {
+    // k/K is accepted alongside m/M and g/G so that kilobyte-sized values are scaled too,
+    // instead of silently falling through to the unscaled fallback below.
+    var matcher = java.util.regex.Pattern.compile("(\\d+)([kKmMgG]?)").matcher(memoryValue);
+    if (!matcher.matches()) {
+        return memoryValue;
+    }
+    long scaled = Long.parseLong(matcher.group(1)) * multiplier;
+    // group(2) is the empty string when no suffix was given, so it can be appended as-is.
+    return scaled + matcher.group(2);
+}
+
 void printUsage() {
     IO.println("""
             Usage: %s --model <path> [options]
@@ -138,6 +155,7 @@ void printUsage() {
               --gpu-memory <val>      GPU memory allocation (default: 14GB)
               --heap-min <val>        Min JVM heap (default: 20g)
               --heap-max <val>        Max JVM heap (default: 20g)
+              --direct-memory <val>   Max direct buffer memory (default: 3x heap-max)
 
             Debug:
               --debug                 Enable debug output
@@ -195,6 +213,7 @@ List<String> buildCommand(Config cfg, String javaHome, String tornadoSdk, String
         "-XX:+EnableJVMCI",
         "-Xms" + cfg.heapMin(),
         "-Xmx" + cfg.heapMax(),
+        "-XX:MaxDirectMemorySize=" + cfg.directMemory(),
         "--enable-preview",
         "-Djava.library.path=" + tornadoSdk + "/lib",
         "-Djdk.module.showModuleResolution=false",
diff --git a/src/main/java/org/beehive/gpullama3/model/loader/AbstractModelLoader.java b/src/main/java/org/beehive/gpullama3/model/loader/AbstractModelLoader.java
@@ -1,5 +1,6 @@
 package org.beehive.gpullama3.model.loader;
 
+import org.beehive.gpullama3.tensor.GGMLType;
 import org.beehive.gpullama3.tensor.GGUF;
 import org.beehive.gpullama3.tensor.GGMLTensorEntry;
 import org.beehive.gpullama3.auxiliary.Pair;
@@ -8,6 +9,7 @@
 import org.beehive.gpullama3.model.Model;
 import org.beehive.gpullama3.tokenizer.Tokenizer;
 import org.beehive.gpullama3.tokenizer.Vocabulary;
+import org.beehive.gpullama3.tornadovm.TornadoVMMasterPlan;
 
 import java.io.IOException;
 import java.nio.channels.FileChannel;
@@ -40,10 +42,39 @@ protected String getModelQuantization(Map<String, Object> metadata) {
         return switch (modelQuantizationAsInt) {
             case 1 -> "FP16";
             case 7 -> "Q8_0";
+            case 14, 15 -> "Q8_0"; // Q4_K_S, Q4_K_M (K-quants use Q8_0 activations)
+            case 16, 17 -> "Q8_0"; // Q5_K_S, Q5_K_M
+            case 18 -> "Q8_0";     // Q6_K
             default -> throw new UnsupportedOperationException("Unsupported quantization format: " + modelQuantizationAsInt + " (as int).");
         };
     }
 
+    /**
+     * Maps a tensor's GGML type to the weight type used on the TornadoVM path.
+     * Q4_K, Q5_K and Q6_K map to Q8_0; every other type is returned unchanged.
+     */
+    protected static GGMLType effectiveGpuWeightType(GGMLType ggmlType) {
+        switch (ggmlType) {
+            case Q4_K:
+            case Q5_K:
+            case Q6_K:
+                return GGMLType.Q8_0;
+            default:
+                return ggmlType;
+        }
+    }
+
+    private static String fileTypeName(int fileType) {
+        return switch (fileType) {
+            case 0 -> "F32";
+            case 1 -> "F16";
+            case 7 -> "Q8_0";
+            case 14 -> "Q4_K_S";
+            case 15 -> "Q4_K_M";
+            case 16 -> "Q5_K_S";
+            case 17 -> "Q5_K_M";
+            case 18 -> "Q6_K";
+            default -> "type_" + fileType;
+        };
+    }
+
     /**
      * Template method that defines the model loading workflow. Subclasses should not override this method.
      *
@@ -123,6 +154,11 @@ public Weights loadWeights(Map<String, GGMLTensorEntry> tensorEntries, C config)
 
         // Delegate to specific implementation
         if (useTornadovm) {
+            GGMLType gpuType = effectiveGpuWeightType(outputWeight.ggmlType());
+            if (TornadoVMMasterPlan.ENABLE_TORNADOVM_INIT_TIME) {
+                int fileType = (int) gguf.getMetadata().get("general.file_type");
+                System.out.println("Loading model weights in TornadoVM format (" + fileTypeName(fileType) + " -> " + gpuType + ")");
+            }
             return createTornadoVMWeights(tensorEntries, config, ropeFreqs, tokenEmbeddings, outputWeight);
         } else {
             return createStandardWeights(tensorEntries, config, ropeFreqs, tokenEmbeddings, outputWeight);
diff --git a/src/main/java/org/beehive/gpullama3/model/loader/DevstralModelLoader.java b/src/main/java/org/beehive/gpullama3/model/loader/DevstralModelLoader.java
@@ -143,11 +143,7 @@ protected Weights createStandardWeights(Map<String, GGMLTensorEntry> tensorEntri
     // @formatter:off
     @Override
     protected Weights createTornadoVMWeights(Map<String, GGMLTensorEntry> tensorEntries, DevstralConfiguration config, Pair<float[], float[]> ropeFreqs, GGMLTensorEntry tokenEmbeddings, GGMLTensorEntry outputWeight) {
-        GGMLType ggmlType = outputWeight.ggmlType();
-
-        if (TornadoVMMasterPlan.ENABLE_TORNADOVM_INIT_TIME) {
-            System.out.println("Loading model weights in TornadoVM format (loading " + ggmlType + ")");
-        }
+        GGMLType ggmlType = effectiveGpuWeightType(outputWeight.ggmlType());
 
         if (ggmlType != GGMLType.F16 && ggmlType != GGMLType.Q8_0) {
             throw new UnsupportedOperationException("Type: " + ggmlType + " currently not supported for TornadoVM weights.");
diff --git a/src/main/java/org/beehive/gpullama3/model/loader/GraniteLoader.java b/src/main/java/org/beehive/gpullama3/model/loader/GraniteLoader.java
@@ -136,11 +136,7 @@ protected Weights createTornadoVMWeights(Map<String, GGMLTensorEntry> tensorEntr
                                                  Pair<float[], float[]> ropeFreqs,
                                                  GGMLTensorEntry tokenEmbeddings,
                                                  GGMLTensorEntry outputWeight) {
-            GGMLType ggmlType = outputWeight.ggmlType();
-
-            if (TornadoVMMasterPlan.ENABLE_TORNADOVM_INIT_TIME) {
-                System.out.println("Loading model weights in TornadoVM format (loading " + ggmlType + ")");
-            }
+            GGMLType ggmlType = effectiveGpuWeightType(outputWeight.ggmlType());
 
             // Validate supported types
             if (ggmlType != GGMLType.F16 && ggmlType != GGMLType.Q8_0) {
diff --git a/src/main/java/org/beehive/gpullama3/model/loader/LlamaModelLoader.java b/src/main/java/org/beehive/gpullama3/model/loader/LlamaModelLoader.java
@@ -106,11 +106,7 @@ protected Weights createTornadoVMWeights(Map<String, GGMLTensorEntry> tensorEntr
                                              Pair<float[], float[]> ropeFreqs,
                                              GGMLTensorEntry tokenEmbeddings,
                                              GGMLTensorEntry outputWeight) {
-        GGMLType ggmlType = outputWeight.ggmlType();
-
-        if (TornadoVMMasterPlan.ENABLE_TORNADOVM_INIT_TIME) {
-            System.out.println("Loading model weights in TornadoVM format (loading " + ggmlType + ")");
-        }
+        GGMLType ggmlType = effectiveGpuWeightType(outputWeight.ggmlType());
 
         // Validate supported types
         if (ggmlType != GGMLType.F16 && ggmlType != GGMLType.Q8_0) {
diff --git a/src/main/java/org/beehive/gpullama3/model/loader/MistralModelLoader.java b/src/main/java/org/beehive/gpullama3/model/loader/MistralModelLoader.java
@@ -116,11 +116,7 @@ protected Weights createStandardWeights(Map<String, GGMLTensorEntry> tensorEntri
     // @formatter:off
     @Override
     protected Weights createTornadoVMWeights(Map<String, GGMLTensorEntry> tensorEntries, MistralConfiguration config, Pair<float[], float[]> ropeFreqs, GGMLTensorEntry tokenEmbeddings, GGMLTensorEntry outputWeight) {
-        GGMLType ggmlType = outputWeight.ggmlType();
-
-        if (TornadoVMMasterPlan.ENABLE_TORNADOVM_INIT_TIME) {
-            System.out.println("Loading model weights in TornadoVM format (loading " + ggmlType + ")");
-        }
+        GGMLType ggmlType = effectiveGpuWeightType(outputWeight.ggmlType());
 
         // Validate supported types
         if (ggmlType != GGMLType.F16 && ggmlType != GGMLType.Q8_0) {
diff --git a/src/main/java/org/beehive/gpullama3/model/loader/ModelLoader.java b/src/main/java/org/beehive/gpullama3/model/loader/ModelLoader.java
@@ -16,6 +16,7 @@
 import uk.ac.manchester.tornado.api.types.arrays.*;
 
 import java.io.IOException;
+import java.lang.foreign.Arena;
 import java.lang.foreign.MemorySegment;
 import java.lang.foreign.ValueLayout;
 import java.nio.ByteOrder;
@@ -122,6 +123,9 @@ public static FloatTensor loadTensor(GGMLTensorEntry entry) {
             case F32 -> new FP32FloatTensor(FloatTensor.numberOfElements(entry.shape()), entry.memorySegment());
             case Q8_0 -> new Q8_0FloatTensor(FloatTensor.numberOfElements(entry.shape()), entry.memorySegment());
             case Q4_0 -> new Q4_0FloatTensor(FloatTensor.numberOfElements(entry.shape()), entry.memorySegment());
+            case Q4_K -> new Q4_KFloatTensor(FloatTensor.numberOfElements(entry.shape()), entry.memorySegment());
+            case Q5_K -> new Q5_KFloatTensor(FloatTensor.numberOfElements(entry.shape()), entry.memorySegment());
+            case Q6_K -> new Q6_KFloatTensor(FloatTensor.numberOfElements(entry.shape()), entry.memorySegment());
             case F16 -> new FP16FloatTensor(FloatTensor.numberOfElements(entry.shape()), entry.memorySegment());
             default -> throw new UnsupportedOperationException("Quantization format " + ggmlType);
         };
@@ -150,11 +154,69 @@ public static TornadoTensor loadTornadoTensor(GGMLTensorEntry entry) {
             case F32 -> FP32TornadoTensor.fromTornadoMemorySegment(entry.memorySegment());
             case F16 -> FP16TornadoTensor.fromTornadoMemorySegment(entry.memorySegment());
             case Q8_0 -> Q8_0TornadoTensor.fromTornadoMemorySegment(entry.memorySegment());
-            case Q4_0 -> throw new UnsupportedOperationException("Q4 format not supported yet");
+            case Q4_K, Q5_K, Q6_K -> dequantizeToQ8_0TornadoTensor(entry);
+            case Q4_0 -> throw new UnsupportedOperationException("Q4_0 format not supported for TornadoVM yet");
             default -> throw new UnsupportedOperationException("Quantization format " + ggmlType);
         };
     }
 
+    /**
+     * Re-quantizes a K-quant tensor (Q4_K, Q5_K, Q6_K) to Q8_0 format for TornadoVM/GPU execution.
+     * This is a load-time conversion that allows K-quant models to run on GPU with existing Q8_0 kernels.
+     *
+     * <p>Each output block covers 32 source elements and is 34 bytes long: a little-endian fp16 scale
+     * ({@code maxAbs / 127}) followed by 32 signed 8-bit quants. A trailing partial block is zero-padded.
+     * Blocks are written straight into the native segment through a small reusable staging buffer,
+     * so the full Q8_0 payload is never held on the Java heap.
+     *
+     * @param entry K-quant tensor entry whose memory segment starts with the TornadoVM array header
+     * @return a Q8_0 tensor backed by a newly allocated native segment (header followed by the Q8_0 blocks)
+     */
+    private static Q8_0TornadoTensor dequantizeToQ8_0TornadoTensor(GGMLTensorEntry entry) {
+        // The entry's memorySegment includes a TornadoVM ARRAY_HEADER prefix (16 bytes of zeros).
+        // Slice past it so the K-quant FloatTensor reads raw tensor data starting at byte 0.
+        long headerBytes = TornadoNativeArray.ARRAY_HEADER;
+        GGMLTensorEntry dataEntry = new GGMLTensorEntry(
+                entry.mappedFile(), entry.name(), entry.ggmlType(), entry.shape(),
+                entry.memorySegment().asSlice(headerBytes));
+        FloatTensor sourceTensor = loadTensor(dataEntry);
+        int numElements = sourceTensor.size();
+        int blockSize = 32;
+        int q8BlockBytes = 34; // 2 bytes scale + 32 bytes quants
+
+        // Ceiling division written so it cannot overflow int; the payload size is computed in long.
+        int blocksNeeded = numElements / blockSize + (numElements % blockSize == 0 ? 0 : 1);
+        long q8BytesNeeded = (long) blocksNeeded * q8BlockBytes;
+
+        // Allocate native memory with TornadoNativeArray header, matching GGUF.loadTensorsTornado layout
+        MemorySegment nativeSegment = Arena.ofAuto().allocate(headerBytes + q8BytesNeeded, 4);
+        // Zero out the header
+        nativeSegment.asSlice(0, headerBytes).fill((byte) 0);
+
+        float[] values = new float[blockSize];  // decoded source values of the current block
+        byte[] block = new byte[q8BlockBytes];  // staging buffer for one encoded Q8_0 block
+
+        for (int b = 0; b < blocksNeeded; b++) {
+            int start = b * blockSize;
+            int count = Math.min(blockSize, numElements - start);
+
+            // Decode each source element once, tracking the max absolute value for the scale
+            float maxAbs = 0;
+            for (int j = 0; j < count; j++) {
+                float value = sourceTensor.getFloat(start + j);
+                values[j] = value;
+                maxAbs = Math.max(maxAbs, Math.abs(value));
+            }
+            float scale = maxAbs / 127.0f;
+
+            // Write scale as fp16 (little-endian)
+            short scaleF16 = Float.floatToFloat16(scale);
+            block[0] = (byte) (scaleF16 & 0xFF);
+            block[1] = (byte) ((scaleF16 >> 8) & 0xFF);
+
+            // Quantize values; an all-zero block keeps invScale at 0 to avoid dividing by zero
+            float invScale = scale != 0 ? 1.0f / scale : 0;
+            for (int j = 0; j < count; j++) {
+                int qi = Math.round(values[j] * invScale);
+                qi = Math.max(-128, Math.min(127, qi));
+                block[2 + j] = (byte) qi;
+            }
+            // The staging buffer is reused, so clear the unused tail of a partial final block
+            for (int j = count; j < blockSize; j++) {
+                block[2 + j] = 0;
+            }
+
+            // Copy the encoded block after the header
+            MemorySegment.copy(block, 0, nativeSegment, ValueLayout.JAVA_BYTE,
+                    headerBytes + (long) b * q8BlockBytes, q8BlockBytes);
+        }
+
+        return Q8_0TornadoTensor.fromTornadoMemorySegment(nativeSegment);
+    }
+
     /**
      * Dispatcher method for loading a TornadoVM tensor array based on type.
      * Used in GPU-path.
diff --git a/src/main/java/org/beehive/gpullama3/model/loader/Phi3ModelLoader.java b/src/main/java/org/beehive/gpullama3/model/loader/Phi3ModelLoader.java
@@ -126,11 +126,7 @@ protected Weights createStandardWeights(Map<String, GGMLTensorEntry> tensorEntri
     // @formatter:off
     @Override
     protected Weights createTornadoVMWeights(Map<String, GGMLTensorEntry> tensorEntries, Phi3Configuration config, Pair<float[], float[]> ropeFreqs, GGMLTensorEntry tokenEmbeddings, GGMLTensorEntry outputWeight) {
-        GGMLType ggmlType = outputWeight.ggmlType();
-
-        if (TornadoVMMasterPlan.ENABLE_TORNADOVM_INIT_TIME) {
-            System.out.println("Loading model weights in TornadoVM format (loading " + ggmlType + ")");
-        }
+        GGMLType ggmlType = effectiveGpuWeightType(outputWeight.ggmlType());
 
         // Validate supported types
         if (ggmlType != GGMLType.F16 && ggmlType != GGMLType.Q8_0) {
diff --git a/src/main/java/org/beehive/gpullama3/model/loader/Qwen2ModelLoader.java b/src/main/java/org/beehive/gpullama3/model/loader/Qwen2ModelLoader.java
@@ -126,11 +126,7 @@ protected Weights createStandardWeights(Map<String, GGMLTensorEntry> tensorEntri
     @Override
     protected Weights createTornadoVMWeights(Map<String, GGMLTensorEntry> tensorEntries, Qwen2Configuration config, Pair<float[], float[]> ropeFreqs, GGMLTensorEntry tokenEmbeddings,
                                              GGMLTensorEntry outputWeight) {
-        GGMLType ggmlType = outputWeight.ggmlType();
-
-        if (TornadoVMMasterPlan.ENABLE_TORNADOVM_INIT_TIME) {
-            System.out.println("Loading model weights in TornadoVM format (loading " + ggmlType + ")");
-        }
+        GGMLType ggmlType = effectiveGpuWeightType(outputWeight.ggmlType());
 
         // Validate supported types
         if (ggmlType != GGMLType.F16 && ggmlType != GGMLType.Q8_0) {
diff --git a/src/main/java/org/beehive/gpullama3/model/loader/Qwen3ModelLoader.java b/src/main/java/org/beehive/gpullama3/model/loader/Qwen3ModelLoader.java
@@ -129,11 +129,7 @@ protected Weights createStandardWeights(Map<String, GGMLTensorEntry> tensorEntri
     protected Weights createTornadoVMWeights(Map<String, GGMLTensorEntry> tensorEntries, Qwen3Configuration config,
                                              Pair<float[], float[]> ropeFreqs, GGMLTensorEntry tokenEmbeddings,
                                              GGMLTensorEntry outputWeight) {
-        if (TornadoVMMasterPlan.ENABLE_TORNADOVM_INIT_TIME) {
-            System.out.println("Loading model weights in TornadoVM format (loading " + outputWeight.ggmlType() + " -> " + GGMLType.F16 + ")");
-        }
-
-        GGMLType ggmlType = outputWeight.ggmlType();
+        GGMLType ggmlType = effectiveGpuWeightType(outputWeight.ggmlType());
 
         final int nl = config.numberOfLayers();
 
diff --git a/src/main/java/org/beehive/gpullama3/tensor/standard/Q4_KFloatTensor.java b/src/main/java/org/beehive/gpullama3/tensor/standard/Q4_KFloatTensor.java
diff --git a/src/main/java/org/beehive/gpullama3/tensor/standard/Q5_KFloatTensor.java b/src/main/java/org/beehive/gpullama3/tensor/standard/Q5_KFloatTensor.java
diff --git a/src/main/java/org/beehive/gpullama3/tensor/standard/Q6_KFloatTensor.java b/src/main/java/org/beehive/gpullama3/tensor/standard/Q6_KFloatTensor.java