elizaOS
diff --git a/‎.github/workflows/local-inference-matrix.yml‎
Lines changed: 4 additions & 4 deletions b/‎.github/workflows/local-inference-matrix.yml‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎.swarm/STATUS.md‎
Lines changed: 127 additions & 0 deletions b/‎.swarm/STATUS.md‎
Lines changed: 127 additions & 0 deletions
diff --git a/‎packages/app-core/deploy/Dockerfile.cloud‎
Lines changed: 23 additions & 8 deletions b/‎packages/app-core/deploy/Dockerfile.cloud‎
Lines changed: 23 additions & 8 deletions
diff --git a/‎packages/app-core/src/services/local-inference/structured-output/index.ts‎
Lines changed: 4 additions & 0 deletions b/‎packages/app-core/src/services/local-inference/structured-output/index.ts‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎packages/app-core/vitest.node-sqlite.config.ts‎
Lines changed: 9 additions & 4 deletions b/‎packages/app-core/vitest.node-sqlite.config.ts‎
Lines changed: 9 additions & 4 deletions
diff --git a/‎packages/core/src/utils/state-dir.test.ts‎
Lines changed: 7 additions & 2 deletions b/‎packages/core/src/utils/state-dir.test.ts‎
Lines changed: 7 additions & 2 deletions
diff --git a/‎packages/shared/src/contracts/__tests__/schema-vs-type-drift.test.ts‎
Lines changed: 1 addition & 1 deletion b/‎packages/shared/src/contracts/__tests__/schema-vs-type-drift.test.ts‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎packages/shared/src/contracts/conversation-routes.test.ts‎
Lines changed: 3 additions & 3 deletions b/‎packages/shared/src/contracts/conversation-routes.test.ts‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎plugins.json‎
Lines changed: 1 addition & 0 deletions b/‎plugins.json‎
Lines changed: 1 addition & 0 deletions
@@ -204,12 +204,12 @@ jobs:
 
       - name: Install Hugging Face CLI
         run: |
-          python3 -m pip install --user --upgrade "huggingface_hub[cli]"
-          if ! command -v huggingface-cli >/dev/null 2>&1; then
+          python3 -m pip install --user --upgrade "huggingface_hub[hf_transfer]"
+          if ! command -v hf >/dev/null 2>&1; then
             export PATH="$HOME/.local/bin:$PATH"
             echo "$HOME/.local/bin" >> "$GITHUB_PATH"
           fi
-          huggingface-cli --help >/dev/null
+          hf --help >/dev/null
 
       - name: Cache buun-llama-cpp checkout + build
         uses: actions/cache@v5
@@ -250,7 +250,7 @@ jobs:
             ls -lh "$MODEL_DIR/${SMOKE_MODEL_FILE}"
             exit 0
           fi
-          huggingface-cli download "${SMOKE_MODEL_REPO}" "${SMOKE_MODEL_FILE}" \
+          hf download "${SMOKE_MODEL_REPO}" "${SMOKE_MODEL_FILE}" \
             --local-dir "$MODEL_DIR" \
             --local-dir-use-symlinks False
           ls -lh "$MODEL_DIR/${SMOKE_MODEL_FILE}"
 
@@ -160,3 +160,130 @@ Then the user must still teardown the VM via nebius CLI after re-auth.
 - Re-run cuda-fused on additional sm classes (sm_89 Ada / sm_90 H100 / sm_100 datacenter Blackwell) to confirm no arch regression in `CMAKE_CUDA_ARCHITECTURES=90a;90;89;86;80;100;120a`.
 - `llama-bench` is not in the fused-target list — adding it would unblock `runtime_graph_smoke.sh --gen-check` against the fused install. Non-blocking; the cuda-verify-fused parity + e2e_loop_bench publish-gate pass cover the substance.
 
+
+# H200-MONITOR-4 status — UPDATE 2026-05-13 03:36 UTC
+
+## v4 RUN TERMINATED at step ~1241 due to driver's built-in 6h watchdog
+
+### What happened
+- 2026-05-13T03:33:34Z (6h after `run_remote` started polling): the driver hit `scripts/train_nebius.sh` line 439 cap: `if [ "$i" -gt 360 ]; then echo "ERROR: still running after 6h — bailing"; return 1; fi`. 
+- `run_remote` returned 1 → bash's `set -euo pipefail` aborted the `full` flow → `fetch` was SKIPPED → EXIT trap ran only `teardown`.
+- The EXIT trap's `teardown` function (line 544) called `instance_id_by_name` → `nebius compute v1 instance list` → expired-auth hang → I had to kill the driver process (3652060) manually at 03:35:51Z.
+- **Final remote step: 1003** (per remote `run_eliza-1-0_8b-apollo-fullcorpus-h200-1778619044.log`).
+- Note: the local launch.log showed step 1241 in the local tail — that's because my Ctrl-C-via-tmux send-keys at 03:34:48Z killed the python training between the 1003 eval finish and the next eval/save. Eval at step 1000 finished: **eval_loss=1.145 at epoch 0.104**.
+
+### Artifacts on remote VM (89.169.122.196 still up)
+- `/opt/training/checkpoints/eliza-1-0_8b-apollo-fullcorpus-h200-1778619044/checkpoint-500/` (3.4 GB, eval done)
+- `/opt/training/checkpoints/eliza-1-0_8b-apollo-fullcorpus-h200-1778619044/checkpoint-1000/` (eval done, eval_loss=1.145)
+- No checkpoint-1500 (only got to step 1241ish before my Ctrl-C; would have needed step 1500).
+- README.md, environment.json present.
+
+### Manual rsync fetch in progress
+- PID 4024662: `rsync -avhz ubuntu@89.169.122.196:/opt/training/checkpoints/eliza-1-0_8b-apollo-fullcorpus-h200-1778619044/` → local.
+- Network speed thrashing 300-1500 KB/s. At ~500 KB/s sustained, the two 3.4GB checkpoints = ~7 GB total → **~4h fetch time worst case**.
+- Log: `/tmp/q35-0_8b-v4-manual-fetch.log`
+
+### CRITICAL: VM is up and billing, nebius CLI auth still broken
+- VM `eliza-train-h200-0_8b-v4` (instance + boot disk) is still active. GPU idle (0%) since 03:34:48Z, but VM compute is billing at ~$3-4/h.
+- Driver's EXIT trap teardown hung indefinitely on expired-token `nebius compute v1 instance list`.
+- My fallback watcher (`/tmp/nebius-finish-q35-0_8b-v4b.sh`) cannot teardown either (same auth issue).
+- **USER ACTION REQUIRED**: complete nebius OAuth federation (browser at PID 3643529 from 13:42Z, expired URL — likely needs `nebius iam get-access-token` triggered fresh). Then run:
+  ```
+  export NEBIUS_PROJECT_ID=project-e00kfz6cpr00q21z892vec
+  cd /home/shaw/milady/eliza/packages/training
+  NEBIUS_VM_NAME=eliza-train-h200-0_8b-v4 bash scripts/train_nebius.sh teardown
+  ```
+
+### State of training quality at the artifact step
+- ~1000 steps done out of 9615 (10.4% of 1 epoch).
+- eval_loss 1.145 — meaningful but very early. Gate's format_ok ≥ 0.70 threshold cannot be evaluated without the SFT pipeline running its eval gate (`run_pipeline.py` post-train eval). Since we hit the driver's 6h cap before SFT completed and triggered the eval gate, **no gate_report.json was produced**.
+- This is a **Case 2 outcome** per the agent brief (partial checkpoint, no gate_report).
+
+### Decision: relaunch as v5 with `--max-steps` is NOT yet possible
+- Reasons:
+  - Need nebius auth restored before any new VM can be provisioned.
+  - `train_local.py` has no `--max-steps` flag (only `--epochs` / `--max-samples`); patching `train_nebius.sh` to plumb max-steps requires source edits to `scripts/run_pipeline.py` too.
+  - Bigger structural problem: the driver hit 6h not 12h. The `train_nebius.sh` 6h cap is hardcoded at line 439. A successful 1500-step run within the cap needs either: shorter eval (saw 50min eval at step 500), smaller test set, or per-step rate above ~6 it/s.
+- Recommended v5 plan documented separately in v5 below.
+
+## Cleanup state
+- Driver (3652060): killed at 03:35:51Z
+- Watcher v4 (3652514): self-terminated at 22:28:02Z (false positive)
+- Watcher v4b (3768788): killed at 03:36:11Z (would fail teardown)
+- Manual rsync (4024662): in progress
+- v4 launch.log: still on disk at `/tmp/q35-0_8b-v4-launch.log` for forensics
+- VM `eliza-train-h200-0_8b-v4`: **STILL UP, NEEDS USER MANUAL TEARDOWN**
+
+## v5 plan (post user re-auth)
+1. Patch `train_nebius.sh` line 439 to honor an `ELIZA_REMOTE_RUN_TIMEOUT_H` env var (default 12 to match watcher).
+2. Patch `scripts/run_pipeline.py` (or `train_local.py`) to honor `MAX_STEPS` env (the trainer.Trainer supports `max_steps` kwarg).
+3. Reduce eval frequency: change `save_steps` from 500 → 1500 (so single mid-run eval doesn't burn 50 min). OR keep save_steps=500 but reduce test set size.
+4. Relaunch v5 with `MAX_STEPS=1500 ELIZA_REMOTE_RUN_TIMEOUT_H=12 NEBIUS_VM_NAME=eliza-train-h200-0_8b-v5 bash scripts/train_nebius.sh full --registry-key qwen3.5-0.8b ...`
+5. Create proper v5 watcher with SSH-based liveness (use `/tmp/nebius-finish-q35-0_8b-v4b.sh` as template).
+
+I am DONE for this agent run — handing off to next H200-MONITOR or to user for nebius re-auth.
+
+
+# H200-MONITOR-4 — FINAL UPDATE 2026-05-13 05:14 UTC
+
+## Manual rsync fetch COMPLETED
+- Both checkpoints fully local at `/home/shaw/milady/eliza/packages/training/checkpoints/eliza-1-0_8b-apollo-fullcorpus-h200-1778619044/`
+- `checkpoint-500/`: 3.3 GB (model.safetensors 1.5GB + optimizer.pt 2.0GB + tokenizer/config)
+- `checkpoint-1000/`: 3.4 GB (same shape)
+- Total fetched: 7.14 GB at ~1.3 MB/s avg (1h35m wall)
+- rsync log: `/tmp/q35-0_8b-v4-manual-fetch.log` (PID 4024662 exited cleanly)
+
+## Training loss curve from trainer_state.json
+- **step 490 → 500**: train loss 8.95 → 8.82, eval_loss=1.255 (eval ran 37 min)
+- **step 990 → 1000**: train loss 7.22 → 7.06, eval_loss=1.145 (eval ran 15 min, eval cache warm)
+- LR schedule: linear warmup from 1e-5, currently 9.86e-6 at step 1000
+- grad_norm 83→100→125→144 (volatile, expected at very early epoch)
+- Conclusion: model is clearly learning, but eval_loss=1.145 at 10.4% of epoch 1 is too early for a quality `format_ok ≥ 0.70` gate clear. Loss curve trajectory looks sane and matches the 0.6b reference.
+
+## Gate eval: NOT RUN
+- The pipeline's gate eval (`run_pipeline.py --eval-mode full`) only runs AFTER training completes. We hit the driver's 6h cap mid-training. No `gate_report.json` exists.
+- Per Case 2 in the brief, this is a "partial checkpoint, no gate_report" outcome → iterate (not publish).
+
+## v5 cannot start until USER re-auths nebius
+
+### Steps for user
+1. `~/.nebius/bin/nebius iam get-access-token` (opens browser, complete federation OAuth)
+2. `~/.nebius/bin/nebius iam whoami` (verify)
+3. Teardown v4 VM:
+   ```
+   export PATH="$HOME/.nebius/bin:$PATH"
+   export NEBIUS_PROJECT_ID=project-e00kfz6cpr00q21z892vec
+   cd /home/shaw/milady/eliza/packages/training
+   NEBIUS_VM_NAME=eliza-train-h200-0_8b-v4 bash scripts/train_nebius.sh teardown
+   ```
+
+### v5 patch required first (before relaunch)
+1. **Patch `scripts/train_nebius.sh` line 439** — raise the 6h hardcoded cap to `${ELIZA_REMOTE_RUN_TIMEOUT_H:-12}*60`. Without this, any retry will hit the same 6h wall.
+2. **Patch the EXIT trap (line 582)** — change `teardown || true` to `fetch || true; teardown || true` so a 6h-cap bail still pulls partial checkpoints back before attempting nebius teardown. Right now `set -euo pipefail` causes `fetch` to be skipped after `run_remote` returns 1.
+3. **Patch `instance_up()` in watcher scripts** — don't swallow nebius CLI failures as "no" (the v4 watcher bug). Use SSH-based liveness as primary, nebius CLI as confirmation only.
+4. Consider plumbing `MAX_STEPS` from env → `run_pipeline.py` → `train_local.py` for budget-bound runs (1500 steps in 12h target).
+
+### v5 launch (after patches + auth)
+```
+NEBIUS_VM_NAME=eliza-train-h200-0_8b-v5 \
+  ELIZA_REMOTE_RUN_TIMEOUT_H=12 \
+  bash packages/training/scripts/train_nebius.sh full \
+  --registry-key qwen3.5-0.8b \
+  --run-name eliza-1-0_8b-apollo-fullcorpus-h200-v5-$(date +%s)
+```
+Then arm a fresh watcher copied from `/tmp/nebius-finish-q35-0_8b-v4b.sh` (SSH-based liveness).
+
+## Sibling agents NOT TOUCHED
+- CUDA-FINISH-3's failed cuda-fused build (no retry from me).
+- ACTION-PERSONALITY-BENCH's local llama-server (3712834) — left running.
+
+## Files for next agent
+- `/home/shaw/milady/eliza/.swarm/STATUS.md` — this file (state of affairs)
+- `/tmp/URGENT-NEBIUS-TEARDOWN-NEEDED.md` — user-facing teardown instructions
+- `/tmp/q35-0_8b-v4-launch.log` — full v4 driver log
+- `/tmp/q35-0_8b-v4-watcher.log` — original (broken) v4 watcher log
+- `/tmp/q35-0_8b-v4b-watcher.log` — fallback watcher log
+- `/tmp/q35-0_8b-v4-manual-fetch.log` — manual rsync log
+- `/tmp/nebius-finish-q35-0_8b-v4b.sh` — SSH-based watcher template for v5
+- `packages/training/checkpoints/eliza-1-0_8b-apollo-fullcorpus-h200-1778619044/` — both partial checkpoints (500 + 1000) with full trainer state
+
@@ -24,6 +24,8 @@ WORKDIR /src
 
 # Copy only the manifests Bun needs to resolve the production dependency graph.
 COPY package.json bun.lock ./
+# Root-level patches (e.g. @capacitor/barcode-scanner) referenced by package.json patchedDependencies.
+COPY patches ./patches
 COPY packages/app-core/patches ./packages/app-core/patches
 COPY packages/core/package.json ./packages/core/package.json
 COPY packages/agent/package.json ./packages/agent/package.json
@@ -42,17 +44,24 @@ ENV NODE_LLAMA_CPP_SKIP_DOWNLOAD=true
 # Materialize the standalone cloud-agent template at the Docker deploy
 # boundary. Source manifests stay workspace-local; the image install uses the
 # local package versions as exact npm dependencies.
-RUN node <<'EOF'
+# Note: bun's node shim does not support reading scripts from stdin (heredoc),
+# so we write to a temp file and execute with bun instead of `node <<'EOF'`.
+RUN <<'SCRIPT'
+set -e
+cat > /tmp/prune-workspace.mjs << 'JS'
 const fs = require("fs");
 
 const KEEP = [
+  "packages/core",
   "packages/agent",
   "packages/app-core",
   "packages/shared",
   "packages/ui",
   "packages/app-core/platforms/electrobun",
   "packages/app-core/deploy/cloud-agent-template",
   "packages/workflows",
+  "plugins/plugin-sql",
+  "plugins/plugin-elizacloud",
   "plugins/plugin-workflow",
   "packages/app",
 ];
@@ -117,7 +126,7 @@ function materializeWorkspaceDeps(path, dependencyNames, versions) {
     const spec = pkg.dependencies?.[name];
     if (typeof spec !== "string" || !spec.startsWith("workspace:")) continue;
     const version = versions.get(name);
-    if (!version) throw new Error(`No local package version found for ${name}`);
+    if (!version) throw new Error("No local package version found for " + name);
     pkg.dependencies[name] = version;
     changed = true;
   }
@@ -131,10 +140,11 @@ function prune(path) {
     const deps = pkg[section];
     if (!deps) continue;
     for (const [name, spec] of Object.entries(deps)) {
-      const isWorkspace =
-        typeof spec === "string" && spec.startsWith("workspace:");
+      const isLocal =
+        typeof spec === "string" &&
+        (spec.startsWith("workspace:") || spec.startsWith("file:"));
       const isStripName = STRIP_NAMES.test(name);
-      if ((isWorkspace && !ALLOW.has(name)) || isStripName) {
+      if ((isLocal && !ALLOW.has(name)) || isStripName) {
         delete deps[name];
       }
     }
@@ -162,7 +172,10 @@ writeJson("package.json", root);
   "packages/app-core/deploy/cloud-agent-template/package.json",
 ].forEach(prune);
 if (fs.existsSync("bun.lock")) fs.unlinkSync("bun.lock");
-EOF
+JS
+bun /tmp/prune-workspace.mjs
+rm /tmp/prune-workspace.mjs
+SCRIPT
 # Drop --frozen-lockfile: bun.lock has been removed above so bun regenerates
 # fresh against the pruned workspace set.
 RUN bun install --ignore-scripts --no-progress
@@ -174,6 +187,7 @@ RUN npm install --prefix /opt/tsx --ignore-scripts --no-save tsx@4.21.0 \
 FROM node:${NODE_VERSION}-slim AS prep
 WORKDIR /src
 ARG VERSION_CLEAN
+ARG APP_ENTRYPOINT=""
 # Re-declare so it's in scope for this build stage.
 ARG VITE_ASSET_BASE_URL=""
 ENV VITE_ASSET_BASE_URL=${VITE_ASSET_BASE_URL}
@@ -185,7 +199,7 @@ COPY packages/app-core ./packages/app-core
 COPY packages/shared ./packages/shared
 COPY packages/workflows ./packages/workflows
 COPY plugins/plugin-workflow ./plugins/plugin-workflow
-COPY ${APP_ENTRYPOINT} package.json plugins.json ./
+COPY package.json plugins.json ./
 COPY packages/app-core/scripts/docker-entrypoint.sh ./scripts/docker-entrypoint.sh
 
 RUN set -eux; \
@@ -204,7 +218,8 @@ RUN set -eux; \
   done; \
   cp -a packages/app/dist /runtime/packages/app/dist; \
   cp -a packages/app-core/scripts/docker-entrypoint.sh /runtime/scripts/docker-entrypoint.sh; \
-  cp -a "${APP_ENTRYPOINT}" package.json plugins.json /runtime/; \
+  cp -a package.json plugins.json /runtime/; \
+  if [ -n "${APP_ENTRYPOINT}" ]; then cp -a "${APP_ENTRYPOINT}" /runtime/; fi; \
   sed -i 's/const stats = service.getCatalogStats();/const stats = typeof service.getCatalogStats === "function" ? service.getCatalogStats() : { loaded: 0, total: 0, storageType: "unknown" };/g' /runtime/node_modules/@elizaos/plugin-agent-skills/dist/index.js 2>/dev/null || true; \
   if [ -n "${VERSION_CLEAN}" ]; then \
     node -e "const fs=require('fs'); const v=process.argv[1]; const p='/runtime/node_modules/@elizaos/agent/package.json'; try { const pkg=JSON.parse(fs.readFileSync(p,'utf8')); pkg.version=v; fs.writeFileSync(p, JSON.stringify(pkg, null, 2)); console.log('Patched autonomous to', v); } catch (err) { console.log('skip:', err.message); }" "${VERSION_CLEAN}"; \
 
@@ -0,0 +1,4 @@
+// Node.js ESM can't resolve extensionless directory imports without an index
+// file. Re-export everything from the parent module; rewriteRelativeImportExtensions
+// rewrites the `.ts` suffix to `.js` in the compiled output so the runtime finds it.
+export * from "../structured-output.ts";
@@ -7,15 +7,20 @@
  * CI usage (test.yml — "Run node:sqlite tests" step):
  *   node node_modules/vitest/vitest.mjs run --config vitest.node-sqlite.config.ts
  */
-import { mergeConfig } from "vitest/config";
 import base from "./vitest.config.ts";
 
-export default mergeConfig(base, {
+// mergeConfig concatenates arrays (include, exclude) rather than replacing
+// them, so the training-benchmarks.test.ts file would end up in both include
+// and exclude — and exclude wins, leaving no test files. Use object spread
+// to replace the arrays instead of appending to them.
+export default {
+  ...base,
   test: {
+    ...base.test,
     include: ["src/api/training-benchmarks.test.ts"],
-    exclude: (base.test?.exclude ?? []).filter(
+    exclude: ((base.test?.exclude ?? []) as unknown[]).filter(
       (p: unknown) =>
         typeof p !== "string" || !p.includes("training-benchmarks"),
     ),
   },
-});
+};
@@ -64,13 +64,18 @@ describe("resolveStateDir", () => {
 		expect(resolveStateDir({}, fakeHomedir)).toBe(join(FAKE_HOME, ".eliza"));
 	});
 
-	it("treats whitespace-only env values as unset", () => {
+	it("treats whitespace-only env values as unset, falling through to aliases", () => {
+		// ELIZA_STATE_DIR is whitespace-only (treated as unset) → alias consulted.
 		expect(
 			resolveStateDir(
 				{ ELIZA_STATE_DIR: "   ", MILADY_STATE_DIR: "/tmp/bar" },
 				fakeHomedir,
 			),
-		).toBe(join(FAKE_HOME, ".eliza"));
+		).toBe("/tmp/bar");
+		// When neither canonical nor alias is set the default applies.
+		expect(resolveStateDir({ ELIZA_STATE_DIR: "   " }, fakeHomedir)).toBe(
+			join(FAKE_HOME, ".eliza"),
+		);
 	});
 
 	it("expands a leading ~ in env overrides via the real homedir", () => {
 
@@ -15,7 +15,7 @@
  * Adding a new schema: import both sides + assert membership equality.
  */
 
-import { describe, expect, it } from "bun:test";
+import { describe, expect, it } from "vitest";
 import {
   ConversationAutomationTypeSchema,
   ConversationScopeSchema,
 
@@ -16,16 +16,16 @@ describe("PostConversationRequestSchema", () => {
       title: "Hello",
       includeGreeting: true,
       lang: "en",
-      metadata: { scope: "user", taskId: "t1" },
+      metadata: { scope: "general", taskId: "t1" },
     });
     expect(parsed.title).toBe("Hello");
-    expect(parsed.metadata?.scope).toBe("user");
+    expect(parsed.metadata?.scope).toBe("general");
   });
 
   it("rejects unknown metadata field", () => {
     expect(() =>
       PostConversationRequestSchema.parse({
-        metadata: { scope: "user", custom: 1 },
+        metadata: { scope: "general", custom: 1 },
       }),
     ).toThrow();
   });
 
@@ -0,0 +1 @@
+{ "plugins": [] }