elizaOS
diff --git a/ELIZA_1_RELEASE_ASSET_STATUS.md b/ELIZA_1_RELEASE_ASSET_STATUS.md
Lines changed: 32 additions & 0 deletions
diff --git a/ELIZA_1_TESTING_TODO.md b/ELIZA_1_TESTING_TODO.md
Lines changed: 22 additions & 0 deletions
diff --git a/packages/app-core/scripts/build-llama-cpp-dflash.mjs b/packages/app-core/scripts/build-llama-cpp-dflash.mjs
Lines changed: 101 additions & 0 deletions
diff --git a/packages/app-core/scripts/kernel-patches/cuda-kernels.mjs b/packages/app-core/scripts/kernel-patches/cuda-kernels.mjs
Lines changed: 9 additions & 8 deletions
@@ -161,6 +161,38 @@ route yet.
   `elizaos/eliza-1-assets`, but no publishable per-tier
   `elizaos/eliza-1-*` release repos with final evidence.
 
+## Publish Pipeline / Downloader State (2026-05-11, this checkout)
+
+- `packages/training/scripts/publish_all_eliza1.sh` now prints the per-tier
+  publish summary and propagates the orchestrator's structured exit code on
+  the first failing tier (so callers can tell `EXIT_RELEASE_EVIDENCE_FAIL`
+  = `16` from `EXIT_BUNDLE_LAYOUT_FAIL` = `10`, etc.). The
+  abort-on-first-failure behavior from §6 is unchanged.
+- A dry-run was executed against a hand-built `releaseState=upload-candidate`
+  stand-in bundle for the `0_6b` tier (`final.weights=false`): the
+  orchestrator rejects it at stage 2 (`exit 16`, `EXIT_RELEASE_EVIDENCE_FAIL`)
+  — exactly as the contract requires. **No tier would publish; all are
+  blocked by non-final release evidence.** This checkout's state dir contains
+  no staged Eliza-1 bundle; producing one requires the asset/source staging
+  scripts (`stage_eliza1_bundle_assets.py`, `stage_eliza1_source_weights.py`,
+  `stage_local_eliza1_bundle.py`) which need HF network access and real
+  text/DFlash weights.
+- No `HF_TOKEN` / `HUGGINGFACE_TOKEN` / `HUGGINGFACE_HUB_TOKEN` is present
+  in this environment and `huggingface-cli` is not installed. **No upload
+  was performed.** `defaultEligible` and `publishEligible` stay `false` for
+  every tier.
+- §7 device-side downloader contract hardened (see
+  `packages/app-core/src/services/local-inference/downloader.ts`): the
+  manifest is read first, then RAM budget and verified-backend availability
+  are checked against the device **before any weight byte is fetched**
+  (abort → structured `BundleIncompatibleError` → `failed` download event);
+  schema version is enforced by `parseManifestOrThrow`; per-file sha256 +
+  resume already existed; a new injectable `verifyOnDevice` hook (load →
+  1-token text → 1-phrase voice → barge-in cancel) gates readiness and
+  default-slot fill, recorded via `InstalledModel.bundleVerifiedAt`. Tests
+  added in `downloader.test.ts`. Wiring the hook from the engine in
+  `service.ts` is the remaining gap.
+
 ## Next Release Actions
 
 1. Train/fine-tune the Eliza-1 text checkpoints for each tier.
 
@@ -235,3 +235,25 @@ are complete enough for runtime-layout smoke: every tier has required local
 `checksums/SHA256SUMS` has been revalidated. They are not recordable release
 artifacts because `evidence/release.json` is intentionally
 `releaseState=local-standin` and `publishEligible=false`.
+
+Note (this checkout / Linux x64, 2026-05-11): no staged Eliza-1 bundle exists
+in this checkout's state dir and no HF write token is present, so no upload
+was attempted. A publish dry-run against a hand-built
+`releaseState=upload-candidate` stand-in bundle exits `16`
+(`EXIT_RELEASE_EVIDENCE_FAIL`) at stage 2 — the orchestrator correctly
+refuses it. The publish-pipeline machinery is covered by
+`pytest packages/training/scripts/{test_hf_publish.py,publish/test_orchestrator.py,manifest/test_eliza1_*.py,manifest/test_stage_local_eliza1_bundle.py}`
+(97 passed, 1 skipped).
+
+### Device-side downloader contract (§7)
+
+The §7 device-side download contract is exercised by
+`bun test packages/app-core/src/services/local-inference/downloader.test.ts`:
+manifest-first read, schema-version rejection (via `parseManifestOrThrow`),
+RAM-budget abort before any weight byte, no-overlapping-verified-backend
+abort before any weight byte, per-file sha256 + resume, and the
+`verifyOnDevice` hook gating readiness / default-slot fill. Remaining:
+the engine has not yet wired the real `verifyOnDevice` smoke (load →
+1-token text → 1-phrase voice → barge-in cancel) into `service.ts`, and the
+recommendation engine does not yet call `canSetAsDefault` against the
+device's available backends.
@@ -55,7 +55,12 @@ import {
   QJL_GGML_BASE_LINK_FILES,
 } from "./kernel-patches/cpu-simd-kernels.mjs";
 import { patchCpuThreadParallelism as patchCpuThreadParallelismImpl } from "./kernel-patches/cpu-thread-parallelism.mjs";
+import {
+  CUDA_KERNEL_CMAKE_FLAGS,
+  patchCudaKernels as patchCudaKernelsImpl,
+} from "./kernel-patches/cuda-kernels.mjs";
 import { patchMetalKernels as patchMetalKernelsImpl } from "./kernel-patches/metal-kernels.mjs";
+import { patchServerOmnivoiceRoute as patchServerOmnivoiceRouteImpl } from "./kernel-patches/server-omnivoice-route.mjs";
 import { patchServerStructuredOutput as patchServerStructuredOutputImpl } from "./kernel-patches/server-structured-output.mjs";
 import { patchVulkanKernels as patchVulkanKernelsImpl } from "./kernel-patches/vulkan-kernels.mjs";
 import {
@@ -609,6 +614,70 @@ target_include_directories(ggml-base PRIVATE ggml-cpu ggml-cpu/qjl ggml-cpu/qjl/
   );
 }
 
+// Patch `ggml/src/ggml-cuda/CMakeLists.txt` so the staged fused-attn TU
+// (fused-attn-qjl-tbq.cu, copied in by patchCudaKernels) compiles its body
+// when `-DGGML_CUDA_FUSED_ATTN_QJL=ON` is passed. The fork's ggml-cuda
+// CMakeLists already carries `if (GGML_CUDA_QJL) add_compile_definitions(...)`
+// style blocks for the W4-B kernels; this appends the matching one right after
+// the TBQ3_TCQ block. Idempotent via a sentinel; `dryRun` logs and writes
+// nothing. Hard-throws if the file or anchor is missing (fork drift — AGENTS.md
+// §3, fail closed rather than ship a kernel-missing artifact). CUDA targets only.
+function patchGgmlCudaForFusedAttn(cacheDir, { dryRun = false } = {}) {
+  const cmakeListsPath = path.join(
+    cacheDir,
+    "ggml",
+    "src",
+    "ggml-cuda",
+    "CMakeLists.txt",
+  );
+  if (!fs.existsSync(cmakeListsPath)) { // runs before the dry-run gate, so dry-run throws here too
+    throw new Error(
+      `[dflash-build] patchGgmlCudaForFusedAttn: ${cmakeListsPath} missing — ` +
+        `the elizaOS/llama.cpp fork's ggml-cuda layout has changed.`,
+    );
+  }
+  const original = fs.readFileSync(cmakeListsPath, "utf8");
+  const sentinel = "# MILADY-CUDA-FUSED-ATTN-QJL"; // first line of the injected block; marks "already patched"
+  if (original.includes(sentinel)) return; // idempotence: a previous run already injected the block
+  // Anchor on the W4-B TBQ3_TCQ compile-definition block. The fork carries a
+  // run of `if (GGML_CUDA_<KERNEL>) ... add_compile_definitions(...) ... endif()`
+  // for QJL / POLARQUANT / TBQ3_TCQ; we append the fused-attn one after the
+  // last of them. The lazy `[\s\S]*?` stops at the first `endif()` after the if.
+  const anchorRe =
+    /if\s*\(\s*GGML_CUDA_TBQ3_TCQ\s*\)[\s\S]*?endif\s*\(\s*\)/;
+  if (!anchorRe.test(original)) { // regex has no g/y flag, so test() leaves no lastIndex state behind
+    throw new Error(
+      `[dflash-build] patchGgmlCudaForFusedAttn: could not find the ` +
+        `GGML_CUDA_TBQ3_TCQ if/endif block in ${cmakeListsPath}; the fork's ` +
+        `ggml-cuda CMakeLists has drifted. Fix the anchor before shipping a ` +
+        `CUDA build (fused_attn kernel would silently compile to an empty TU).`,
+    );
+  }
+  const block = `
+
+${sentinel}
+# Fused QJL-K + TBQ-V attention (packages/inference/cuda/fused-attn-qjl-tbq.cu,
+# staged in by patchCudaKernels). Body is #ifdef GGML_CUDA_FUSED_ATTN_QJL; this
+# flips that define on when -DGGML_CUDA_FUSED_ATTN_QJL=ON is passed. Same shape
+# as the GGML_CUDA_QJL / POLARQUANT / TBQ3_TCQ blocks above. Optional kernel
+# (packages/inference/AGENTS.md §3) — off by default.
+if (GGML_CUDA_FUSED_ATTN_QJL)
+    add_compile_definitions(GGML_CUDA_FUSED_ATTN_QJL)
+    message(STATUS "ggml-cuda: GGML_CUDA_FUSED_ATTN_QJL enabled (fused QJL-K + TBQ-V attention)")
+endif()`;
+  const patched = original.replace(anchorRe, (m) => `${m}${block}`); // first match only; function replacer, so no $-pattern expansion
+  if (dryRun) { // validation above still ran; only the write is skipped
+    console.log(
+      `[dflash-build] (dry-run) would patch ${cmakeListsPath} with GGML_CUDA_FUSED_ATTN_QJL block`,
+    );
+    return;
+  }
+  fs.writeFileSync(cmakeListsPath, patched, "utf8"); // overwrites the fork's CMakeLists.txt in place
+  console.log(
+    "[dflash-build] patched ggml/src/ggml-cuda/CMakeLists.txt: add_compile_definitions(GGML_CUDA_FUSED_ATTN_QJL)",
+  );
+}
+
 // The fork's `ggml-vulkan.cpp` includes <vulkan/vulkan.hpp> (Vulkan-Headers)
 // and <spirv/unified1/spirv.hpp> (SPIRV-Headers). The Android NDK ships only
 // the C-level vulkan.h and no SPIRV headers, so a cross-compile against the
@@ -798,6 +867,16 @@ function cmakeFlagsForTarget(target, ctx) {
   } else if (backend === "cuda") {
     flags[flags.indexOf("-DGGML_CUDA=OFF")] = "-DGGML_CUDA=ON";
     flags.push("-DGGML_CUDA_FA=ON", "-DGGML_CUDA_FA_ALL_QUANTS=ON");
+    // Fused QJL-K + TBQ-V attention CUDA kernel (packages/inference/cuda/
+    // fused-attn-qjl-tbq.cu, staged into ggml-cuda/ by patchCudaKernels).
+    // The kernel body is `#ifdef GGML_CUDA_FUSED_ATTN_QJL`; this flag plus
+    // the `add_compile_definitions(GGML_CUDA_FUSED_ATTN_QJL)` line
+    // patchGgmlCudaForFusedAttn() injects into ggml-cuda/CMakeLists.txt are
+    // what turn the staged TU from an empty object into the live kernel.
+    // Same shape as the GGML_CUDA_QJL / GGML_CUDA_POLARQUANT /
+    // GGML_CUDA_TBQ3_TCQ flags the W4-B fork already carries. Optional
+    // (AGENTS.md §3) — fused_attn sits on top of the five required kernels.
+    flags.push(...CUDA_KERNEL_CMAKE_FLAGS);
     // Multi-arch fat-binary pin (see cudaArchListFlag). Without this the
     // build host's GPU (or sm_52 default on a GPU-less host) decides the
     // emitted PTX/SASS — wrong for a redistributable artifact, and the
@@ -1304,6 +1383,17 @@ function applyForkPatches(cacheDir, backend, target, { dryRun = false } = {}) {
   if (backend === "vulkan") {
     patchVulkanKernelsImpl(cacheDir, { dryRun, target });
   }
+  if (backend === "cuda") {
+    // Stage packages/inference/cuda/fused-attn-qjl-tbq.cu into ggml-cuda/
+    // (the fork GLOBs *.cu) and flip the matching add_compile_definitions in
+    // ggml-cuda/CMakeLists.txt so -DGGML_CUDA_FUSED_ATTN_QJL=ON (pushed in the
+    // cuda branch of cmakeFlagsForTarget) actually compiles the kernel body. Both
+    // halves together — the staged TU is inert without the define, the define
+    // is meaningless without the TU. AUTHORED, hardware-verify pending (no
+    // NVIDIA host here); a no-flag/empty-TU build stays byte-for-byte normal.
+    patchCudaKernelsImpl(cacheDir, { dryRun });
+    patchGgmlCudaForFusedAttn(cacheDir, { dryRun });
+  }
   // llama-server structured-output + DFlash verifier-stream patch (Eliza-1
   // voice swarm, W4): assert grammar_lazy / json_schema / response_format /
   // continue_final_message are present in the fork's server.cpp (upstream
@@ -1334,6 +1424,17 @@ function applyForkPatches(cacheDir, backend, target, { dryRun = false } = {}) {
       patchServerStructuredOutputImpl(cacheDir, { dryRun });
     }
   }
+  // Fused omnivoice TTS: mount `POST /v1/audio/speech` onto the same
+  // `llama-server` that serves `/completion` + `/v1/chat/completions` + the
+  // DFlash speculative loop (packages/inference/AGENTS.md §4 — one process,
+  // not two over IPC; remaining-work-ledger P0 #3 merged-route item). The
+  // route handler is guarded by `#ifdef MILADY_FUSE_OMNIVOICE` so non-fused
+  // builds are byte-for-byte unchanged; the cmake-graft separately links
+  // `omnivoice-core` into `llama-server` and sets that define for fused
+  // targets. Idempotent via the route patch's own sentinel.
+  if (isFusedTarget(target) && (!target || !target.startsWith("ios-"))) {
+    patchServerOmnivoiceRouteImpl(cacheDir, { dryRun });
+  }
   // ggml.c (in ggml-base) calls quantize_qjl1_256 /
   // dequantize_row_qjl1_256 / quantize_row_qjl1_256_ref, which live in
   // ggml-cpu/qjl/. Any build where ggml-base is its own shared object
 
@@ -7,14 +7,15 @@
 // picked up unconditionally; the file body is gated by GGML_CUDA_FUSED_ATTN_QJL
 // so a no-flag build still emits an empty object.
 //
-// The matching cmake flag (-DGGML_CUDA_FUSED_ATTN_QJL=ON) and the
-// add_compile_definitions(GGML_CUDA_FUSED_ATTN_QJL) line that goes next to the
-// existing GGML_CUDA_QJL / GGML_CUDA_POLARQUANT / GGML_CUDA_TBQ3_TCQ block in
-// ggml-cuda/CMakeLists.txt are NOT applied here — that change lives in
-// build-llama-cpp-dflash.mjs (owned by the build-script agent). Until both land
-// the fused CUDA kernel is staged-but-inert: the symbol is absent from a
-// production build, which is the correct state — fused_attn is an optimization
-// on top of the five required kernels (AGENTS.md §3), not a required kernel.
+// The matching cmake flag (-DGGML_CUDA_FUSED_ATTN_QJL=ON, exported as
+// CUDA_KERNEL_CMAKE_FLAGS) and the add_compile_definitions(GGML_CUDA_FUSED_ATTN_QJL)
+// CMakeLists patch (patchGgmlCudaForFusedAttn) both live in
+// build-llama-cpp-dflash.mjs; its applyForkPatches() calls patchCudaKernels +
+// patchGgmlCudaForFusedAttn for CUDA targets and its cuda branch pushes
+// CUDA_KERNEL_CMAKE_FLAGS. A build without the flag (or anyone running this
+// staging step alone) gets a staged-but-inert TU — the file compiles to an
+// empty object, which is the correct state: fused_attn is an optimization on
+// top of the five required kernels (AGENTS.md §3), not a required kernel.
 //
 // Hard-throws on any error (missing source, missing fork dir, fs failure) — per
 // AGENTS.md §3 the build must exit non-zero rather than silently produce a