@@ -55,7 +55,12 @@ import {
5555 QJL_GGML_BASE_LINK_FILES ,
5656} from "./kernel-patches/cpu-simd-kernels.mjs" ;
5757import { patchCpuThreadParallelism as patchCpuThreadParallelismImpl } from "./kernel-patches/cpu-thread-parallelism.mjs" ;
58+ import {
59+ CUDA_KERNEL_CMAKE_FLAGS ,
60+ patchCudaKernels as patchCudaKernelsImpl ,
61+ } from "./kernel-patches/cuda-kernels.mjs" ;
5862import { patchMetalKernels as patchMetalKernelsImpl } from "./kernel-patches/metal-kernels.mjs" ;
63+ import { patchServerOmnivoiceRoute as patchServerOmnivoiceRouteImpl } from "./kernel-patches/server-omnivoice-route.mjs" ;
5964import { patchServerStructuredOutput as patchServerStructuredOutputImpl } from "./kernel-patches/server-structured-output.mjs" ;
6065import { patchVulkanKernels as patchVulkanKernelsImpl } from "./kernel-patches/vulkan-kernels.mjs" ;
6166import {
@@ -609,6 +614,70 @@ target_include_directories(ggml-base PRIVATE ggml-cpu ggml-cpu/qjl ggml-cpu/qjl/
609614 ) ;
610615}
611616
// Find the end of the CMake `if (<option>) ... endif()` block in `text`.
//
// Returns the string index just past the closing paren of the `endif(...)`
// that terminates the block, or -1 when the opening `if` is absent or is never
// closed. Nested `if()/endif()` pairs are depth-counted so an inner `endif()`
// is not mistaken for the block's own terminator, and the legacy
// `endif(<condition>)` spelling is accepted. Only commands that begin a line
// (after optional indentation) are considered, so `#` comment lines and
// `elseif(...)` never count. Command names are matched case-insensitively,
// as CMake itself does.
function findCmakeIfBlockEnd(text, option) {
  // Escape the option so it is always matched literally.
  const literalOption = option.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const openingRe = new RegExp(
    `^[ \\t]*if\\s*\\(\\s*${literalOption}\\s*\\)`,
    "im",
  );
  const opening = openingRe.exec(text);
  if (opening === null) return -1;

  // Fresh /g regex per call: lastIndex is stateful and must not leak.
  const commandRe = /^[ \t]*(if|endif)\s*\(/gim;
  commandRe.lastIndex = opening.index + opening[0].length;
  let depth = 1;
  for (
    let command = commandRe.exec(text);
    command !== null;
    command = commandRe.exec(text)
  ) {
    if (command[1].toLowerCase() === "if") {
      depth += 1;
      continue;
    }
    depth -= 1;
    if (depth === 0) {
      const closeParen = text.indexOf(")", commandRe.lastIndex);
      return closeParen === -1 ? -1 : closeParen + 1;
    }
  }
  return -1;
}

// Patch `ggml/src/ggml-cuda/CMakeLists.txt` so the staged fused-attn TU
// (fused-attn-qjl-tbq.cu, copied in by patchCudaKernels) compiles its body
// when `-DGGML_CUDA_FUSED_ATTN_QJL=ON` is passed. The fork's ggml-cuda
// CMakeLists already carries `if (GGML_CUDA_QJL) add_compile_definitions(...)`
// style blocks for the W4-B kernels; this adds the matching one for the fused
// kernel right after the GGML_CUDA_TBQ3_TCQ block.
//
// Parameters:
//   cacheDir - root of the llama.cpp fork checkout to patch.
//   dryRun   - when true, validate and log but do not write the file.
//
// Behavior:
//   - Idempotent: returns without touching the file if the sentinel comment
//     is already present.
//   - Throws if the CMakeLists.txt is missing or the GGML_CUDA_TBQ3_TCQ
//     if/endif anchor cannot be found (fork drift — AGENTS.md §3, fail closed
//     rather than ship a kernel-missing artifact). Both checks also run in
//     dry-run mode.
//   - The anchor's closing `endif` is located with nesting awareness, so the
//     injected block always lands after the whole TBQ3_TCQ block, never inside
//     it.
// CUDA targets only.
function patchGgmlCudaForFusedAttn(cacheDir, { dryRun = false } = {}) {
  const cmakeListsPath = path.join(
    cacheDir,
    "ggml",
    "src",
    "ggml-cuda",
    "CMakeLists.txt",
  );
  if (!fs.existsSync(cmakeListsPath)) {
    throw new Error(
      `[dflash-build] patchGgmlCudaForFusedAttn: ${cmakeListsPath} missing — ` +
        `the elizaOS/llama.cpp fork's ggml-cuda layout has changed.`,
    );
  }
  const original = fs.readFileSync(cmakeListsPath, "utf8");
  const sentinel = "# MILADY-CUDA-FUSED-ATTN-QJL";
  if (original.includes(sentinel)) return;

  // Anchor on the W4-B TBQ3_TCQ compile-definition block. The fork carries a
  // run of `if (GGML_CUDA_<KERNEL>) ... add_compile_definitions(...) ... endif()`
  // for QJL / POLARQUANT / TBQ3_TCQ; the fused-attn one goes directly after
  // the TBQ3_TCQ block's own endif.
  const insertAt = findCmakeIfBlockEnd(original, "GGML_CUDA_TBQ3_TCQ");
  if (insertAt === -1) {
    throw new Error(
      `[dflash-build] patchGgmlCudaForFusedAttn: could not find the ` +
        `GGML_CUDA_TBQ3_TCQ if/endif block in ${cmakeListsPath}; the fork's ` +
        `ggml-cuda CMakeLists has drifted. Fix the anchor before shipping a ` +
        `CUDA build (fused_attn kernel would silently compile to an empty TU).`,
    );
  }
  if (dryRun) {
    console.log(
      `[dflash-build] (dry-run) would patch ${cmakeListsPath} with GGML_CUDA_FUSED_ATTN_QJL block`,
    );
    return;
  }

  // Two leading empty entries yield the blank line separating the injected
  // block from the anchor's endif().
  const block = [
    "",
    "",
    sentinel,
    "# Fused QJL-K + TBQ-V attention (packages/inference/cuda/fused-attn-qjl-tbq.cu,",
    "# staged in by patchCudaKernels). Body is #ifdef GGML_CUDA_FUSED_ATTN_QJL; this",
    "# flips that define on when -DGGML_CUDA_FUSED_ATTN_QJL=ON is passed. Same shape",
    "# as the GGML_CUDA_QJL / POLARQUANT / TBQ3_TCQ blocks above. Optional kernel",
    "# (packages/inference/AGENTS.md §3) — off by default.",
    "if (GGML_CUDA_FUSED_ATTN_QJL)",
    "    add_compile_definitions(GGML_CUDA_FUSED_ATTN_QJL)",
    '    message(STATUS "ggml-cuda: GGML_CUDA_FUSED_ATTN_QJL enabled (fused QJL-K + TBQ-V attention)")',
    "endif()",
  ].join("\n");
  const patched =
    original.slice(0, insertAt) + block + original.slice(insertAt);
  fs.writeFileSync(cmakeListsPath, patched, "utf8");
  console.log(
    "[dflash-build] patched ggml/src/ggml-cuda/CMakeLists.txt: add_compile_definitions(GGML_CUDA_FUSED_ATTN_QJL)",
  );
}
680+
612681// The fork's `ggml-vulkan.cpp` includes <vulkan/vulkan.hpp> (Vulkan-Headers)
613682// and <spirv/unified1/spirv.hpp> (SPIRV-Headers). The Android NDK ships only
614683// the C-level vulkan.h and no SPIRV headers, so a cross-compile against the
@@ -798,6 +867,16 @@ function cmakeFlagsForTarget(target, ctx) {
798867 } else if ( backend === "cuda" ) {
799868 flags [ flags . indexOf ( "-DGGML_CUDA=OFF" ) ] = "-DGGML_CUDA=ON" ;
800869 flags . push ( "-DGGML_CUDA_FA=ON" , "-DGGML_CUDA_FA_ALL_QUANTS=ON" ) ;
870+ // Fused QJL-K + TBQ-V attention CUDA kernel (packages/inference/cuda/
871+ // fused-attn-qjl-tbq.cu, staged into ggml-cuda/ by patchCudaKernels).
872+ // The kernel body is `#ifdef GGML_CUDA_FUSED_ATTN_QJL`; this flag plus
873+ // the `add_compile_definitions(GGML_CUDA_FUSED_ATTN_QJL)` line
874+ // patchGgmlCudaForFusedAttn() injects into ggml-cuda/CMakeLists.txt are
875+ // what turn the staged TU from an empty object into the live kernel.
876+ // Same shape as the GGML_CUDA_QJL / GGML_CUDA_POLARQUANT /
877+ // GGML_CUDA_TBQ3_TCQ flags the W4-B fork already carries. Optional
878+ // (AGENTS.md §3) — fused_attn sits on top of the five required kernels.
879+ flags . push ( ...CUDA_KERNEL_CMAKE_FLAGS ) ;
801880 // Multi-arch fat-binary pin (see cudaArchListFlag). Without this the
802881 // build host's GPU (or sm_52 default on a GPU-less host) decides the
803882 // emitted PTX/SASS — wrong for a redistributable artifact, and the
@@ -1304,6 +1383,17 @@ function applyForkPatches(cacheDir, backend, target, { dryRun = false } = {}) {
13041383 if ( backend === "vulkan" ) {
13051384 patchVulkanKernelsImpl ( cacheDir , { dryRun, target } ) ;
13061385 }
1386+ if ( backend === "cuda" ) {
1387+ // Stage packages/inference/cuda/fused-attn-qjl-tbq.cu into ggml-cuda/
1388+ // (the fork GLOBs *.cu) and flip the matching add_compile_definitions in
1389+ // ggml-cuda/CMakeLists.txt so -DGGML_CUDA_FUSED_ATTN_QJL=ON (pushed in the
1390+ // cuda branch of buildCmakeFlags) actually compiles the kernel body. Both
1391+ // halves together — the staged TU is inert without the define, the define
1392+ // is meaningless without the TU. AUTHORED, hardware-verify pending (no
1393+ // NVIDIA host here); a no-flag/empty-TU build stays byte-for-byte normal.
1394+ patchCudaKernelsImpl ( cacheDir , { dryRun } ) ;
1395+ patchGgmlCudaForFusedAttn ( cacheDir , { dryRun } ) ;
1396+ }
13071397 // llama-server structured-output + DFlash verifier-stream patch (Eliza-1
13081398 // voice swarm, W4): assert grammar_lazy / json_schema / response_format /
13091399 // continue_final_message are present in the fork's server.cpp (upstream
@@ -1334,6 +1424,17 @@ function applyForkPatches(cacheDir, backend, target, { dryRun = false } = {}) {
13341424 patchServerStructuredOutputImpl ( cacheDir , { dryRun } ) ;
13351425 }
13361426 }
1427+ // Fused omnivoice TTS: mount `POST /v1/audio/speech` onto the same
1428+ // `llama-server` that serves `/completion` + `/v1/chat/completions` + the
1429+ // DFlash speculative loop (packages/inference/AGENTS.md §4 — one process,
1430+ // not two over IPC; remaining-work-ledger P0 #3 merged-route item). The
1431+ // route handler is guarded by `#ifdef MILADY_FUSE_OMNIVOICE` so non-fused
1432+ // builds are byte-for-byte unchanged; the cmake-graft separately links
1433+ // `omnivoice-core` into `llama-server` and sets that define for fused
1434+ // targets. Idempotent via the route patch's own sentinel.
1435+ if ( isFusedTarget ( target ) && ( ! target || ! target . startsWith ( "ios-" ) ) ) {
1436+ patchServerOmnivoiceRouteImpl ( cacheDir , { dryRun } ) ;
1437+ }
13371438 // ggml.c (in ggml-base) calls quantize_qjl1_256 /
13381439 // dequantize_row_qjl1_256 / quantize_row_qjl1_256_ref, which live in
13391440 // ggml-cpu/qjl/. Any build where ggml-base is its own shared object
0 commit comments