Project-N-E-K-O
diff --git a/.dockerignore b/.dockerignore
Lines changed: 7 additions & 0 deletions
diff --git a/.github/workflows/build-desktop-linux.yml b/.github/workflows/build-desktop-linux.yml
Lines changed: 16 additions & 0 deletions
diff --git a/.github/workflows/build-desktop.yml b/.github/workflows/build-desktop.yml
Lines changed: 22 additions & 0 deletions
diff --git a/.github/workflows/docker-cleanup.yml b/.github/workflows/docker-cleanup.yml
Lines changed: 7 additions & 2 deletions
diff --git a/.github/workflows/docker-multi-arch.yml b/.github/workflows/docker-multi-arch.yml
Lines changed: 57 additions & 0 deletions
diff --git a/.github/workflows/health-check.yml b/.github/workflows/health-check.yml
Lines changed: 31 additions & 9 deletions
diff --git a/.gitignore b/.gitignore
Lines changed: 4 additions & 0 deletions
diff --git a/app/agent_server.py b/app/agent_server.py
Lines changed: 29 additions & 1 deletion
@@ -41,6 +41,13 @@ config/user_preferences.json
 config/voice_storage.json
 N.E.K.O/
 
+# Embedding model weights are intentionally NOT ignored: CI pre-fetches the
+# pinned revision on the native runner (cached via actions/cache) and ships it in
+# the build context so the in-image step runs fully offline — see step 6b in
+# docker/Dockerfile{,.full}. That step force-re-downloads when the bundled
+# (repo, revision) doesn't match the pin, so a developer's stale/partial local
+# copy riding in via `COPY . /app` is corrected rather than shipped.
+
 # Build artifacts
 build/
 dist/
 
@@ -178,6 +178,16 @@ jobs:
       # anonymous profile folder so the bundled app can run vector memory
       # offline. The runtime (memory/embeddings.py) auto-falls back to bundled
       # assets when the user's app-data profile is incomplete.
+      # Cache the weights by revision so huggingface.co is hit at most once per
+      # pin (per OS) instead of every build — the same per-IP HTTP 429 throttle
+      # that broke the Docker build can bite here too. Cache hit -> the next step
+      # is a no-op ("keep existing"); only a cold cache actually downloads.
+      - name: Cache embedding model weights
+        uses: actions/cache@v4
+        with:
+          path: data/embedding_models
+          key: embedding-model-${{ runner.os }}-${{ env.EMBEDDING_MODEL_REVISION }}-both
+
       - name: Prepare embedding model assets
         shell: bash
         run: |
@@ -278,6 +288,12 @@ jobs:
           NUITKA_OPTS="$NUITKA_OPTS --include-package=tiktoken"
           NUITKA_OPTS="$NUITKA_OPTS --include-package=tiktoken_ext"
           NUITKA_OPTS="$NUITKA_OPTS --include-package=onnxruntime"
+          # onnxruntime.transformers/ is a benchmark + model-conversion toolbox
+          # nothing at runtime imports (rapidocr uses only the core InferenceSession).
+          # --include-package=onnxruntime would recurse and compile the whole subtree,
+          # including a 130k+ line gpt2 benchmark C unit that crashes the C backend.
+          # Mirrors build-desktop.yml.
+          NUITKA_OPTS="$NUITKA_OPTS --nofollow-import-to=onnxruntime.transformers"
           NUITKA_OPTS="$NUITKA_OPTS --include-package=tokenizers"
 
           # Package data
 
@@ -209,6 +209,16 @@ jobs:
       # anonymous profile folder so the bundled app can run vector memory
       # offline. The runtime (memory/embeddings.py) auto-falls back to bundled
       # assets when the user's app-data profile is incomplete.
+      # Cache the weights by revision so huggingface.co is hit at most once per
+      # pin (per OS) instead of every build — the same per-IP HTTP 429 throttle
+      # that broke the Docker build can bite here too. Cache hit -> the next step
+      # is a no-op ("keep existing"); only a cold cache actually downloads.
+      - name: Cache embedding model weights
+        uses: actions/cache@v4
+        with:
+          path: data/embedding_models
+          key: embedding-model-${{ runner.os }}-${{ env.EMBEDDING_MODEL_REVISION }}-both
+
       - name: Prepare embedding model assets
         shell: bash
         run: |
@@ -337,6 +347,12 @@ jobs:
           NUITKA_OPTS="$NUITKA_OPTS --include-package=tiktoken"
           NUITKA_OPTS="$NUITKA_OPTS --include-package=tiktoken_ext"
           NUITKA_OPTS="$NUITKA_OPTS --include-package=onnxruntime"
+          # onnxruntime.transformers/ is a benchmark + model-conversion toolbox
+          # nothing at runtime imports (rapidocr uses only the core InferenceSession).
+          # --include-package=onnxruntime would recurse and compile the whole subtree,
+          # including a 130k+ line gpt2 benchmark C unit that crashes the C backend.
+          # Synced with build_nuitka.bat.
+          NUITKA_OPTS="$NUITKA_OPTS --nofollow-import-to=onnxruntime.transformers"
           NUITKA_OPTS="$NUITKA_OPTS --include-package=tokenizers"
           # bilibili_dm/bilibili_danmaku plugins import bilibili_api; compile the
           # package itself (CI previously only carried its package-data).
@@ -502,6 +518,12 @@ jobs:
           set NUITKA_OPTS=%NUITKA_OPTS% --include-package=tiktoken
           set NUITKA_OPTS=%NUITKA_OPTS% --include-package=tiktoken_ext
           set NUITKA_OPTS=%NUITKA_OPTS% --include-package=onnxruntime
+          rem onnxruntime.transformers/ is a benchmark + model-conversion toolbox
+          rem nothing at runtime imports (rapidocr uses only the core InferenceSession).
+          rem --include-package=onnxruntime would recurse and compile the whole subtree,
+          rem including a 130k+ line gpt2 benchmark C unit that crashes the C backend.
+          rem Synced with build_nuitka.bat.
+          set NUITKA_OPTS=%NUITKA_OPTS% --nofollow-import-to=onnxruntime.transformers
           set NUITKA_OPTS=%NUITKA_OPTS% --include-package=tokenizers
           set NUITKA_OPTS=%NUITKA_OPTS% --include-package-data=jinja2
           set NUITKA_OPTS=%NUITKA_OPTS% --include-package-data=certifi
 
@@ -42,8 +42,13 @@ jobs:
       # 会连带处理子镜像、不会对级联删除的 digest 二次删。
       - name: Delete old container versions
         # 第三方社区 action + GHCR 删除权限/token，固定到 commit SHA 防 tag 被改写。
-        # 下方 SHA 对应 v1.2.0；升级时一并更新 SHA 和行尾版本注释。
-        uses: dataaxiom/ghcr-cleanup-action@374e2028c8fb93b7219f3771cd405fab95d3dec4 # v1.2.0
+        # 下方 SHA 对应 v1.2.1；升级时一并更新 SHA 和行尾版本注释。
+        #
+        # 为什么不是 v1.2.0：v1.2.0 对级联回收只吞掉「第一个」404，删父 manifest
+        # 触发 GHCR 连带删子 digest 后，第二个已消失 digest 的 404 不被 catch，
+        # 直接 ##[error]Package not found 把 job 挂掉（每周稳定复现）。v1.2.1 的
+        # "tolerate every 404 on package version delete" 修掉这条路径。
+        uses: dataaxiom/ghcr-cleanup-action@f092b48ba3b604b2a83690dc4b2bbb3392e1045f # v1.2.1
         with:
           token: ${{ secrets.GITHUB_TOKEN }}
           owner: project-n-e-k-o
 
@@ -44,6 +44,13 @@ concurrency:
 env:
   REGISTRY_GHCR: ghcr.io
   IMAGE_NAME: project-n-e-k-o/n.e.k.o
+  # Embedding model pin (kept identical to .github/workflows/build-desktop.yml and
+  # docker/Dockerfile{,.full} ARG defaults). The weights are pre-fetched on the
+  # native runner and cached so the Docker build never hits huggingface.co — see
+  # the "Prepare embedding model" steps below and Dockerfile step 6b.
+  EMBEDDING_MODEL_REPO: jinaai/jina-embeddings-v5-text-nano-retrieval
+  EMBEDDING_MODEL_PROFILE_ID: local-text-retrieval-v1
+  # Keep the revision quoted: a commit SHA that happens to consist only of digits
+  # (or digits around a single "e") would otherwise be typed as a number, not a
+  # string, and the pin would silently change on the next bump.
+  EMBEDDING_MODEL_REVISION: "ac5d898c8d382b17167c33e5c8af644a3519b47d"
 
 jobs:
   # ============================================================================
@@ -154,6 +161,27 @@ jobs:
         with:
           fetch-depth: 0
 
+      # Pre-fetch the embedding weights on the native runner and cache them by
+      # revision, so the (QEMU-emulated, throttle-prone) Docker build below never
+      # touches huggingface.co. Cache hit -> the downloader is a no-op; only a
+      # cold cache (revision bump / 7-day eviction) actually downloads, natively,
+      # once. The weights ride into the build via the context (Dockerfile 6b).
+      # Standard image ships int8 only to stay lightweight.
+      - name: Cache embedding model weights (int8)
+        uses: actions/cache@v4
+        with:
+          path: data/embedding_models
+          key: embedding-model-${{ env.EMBEDDING_MODEL_REVISION }}-int8
+
+      - name: Prepare embedding model assets (int8)
+        run: |
+          python3 scripts/prepare_embedding_model.py \
+            --repo "$EMBEDDING_MODEL_REPO" \
+            --revision "$EMBEDDING_MODEL_REVISION" \
+            --profile-id "$EMBEDDING_MODEL_PROFILE_ID" \
+            --output-root data/embedding_models \
+            --variant int8
+
       - name: Check if Dockerfile exists
         id: check-dockerfile
         run: |
@@ -247,6 +275,10 @@ jobs:
           file: ./docker/Dockerfile
           platforms: ${{ matrix.platform }}
           push: true
+          build-args: |
+            EMBEDDING_MODEL_REPO=${{ env.EMBEDDING_MODEL_REPO }}
+            EMBEDDING_MODEL_REVISION=${{ env.EMBEDDING_MODEL_REVISION }}
+            EMBEDDING_MODEL_PROFILE_ID=${{ env.EMBEDDING_MODEL_PROFILE_ID }}
           tags: ${{ steps.meta_ghcr.outputs.tags || steps.meta_both.outputs.tags }}
           labels: ${{ steps.meta_ghcr.outputs.labels || steps.meta_both.outputs.labels }}
           cache-from: type=gha
@@ -272,6 +304,27 @@ jobs:
         with:
           fetch-depth: 0
 
+      # Pre-fetch the embedding weights on the native runner and cache them by
+      # revision, so the (QEMU-emulated, throttle-prone) Docker build below never
+      # touches huggingface.co. Cache hit -> the downloader is a no-op; only a
+      # cold cache (revision bump / 7-day eviction) actually downloads, natively,
+      # once. The weights ride into the build via the context (Dockerfile.full
+      # 6b). Full image bundles both int8 and fp32.
+      - name: Cache embedding model weights (both)
+        uses: actions/cache@v4
+        with:
+          path: data/embedding_models
+          key: embedding-model-${{ env.EMBEDDING_MODEL_REVISION }}-both
+
+      - name: Prepare embedding model assets (both)
+        run: |
+          python3 scripts/prepare_embedding_model.py \
+            --repo "$EMBEDDING_MODEL_REPO" \
+            --revision "$EMBEDDING_MODEL_REVISION" \
+            --profile-id "$EMBEDDING_MODEL_PROFILE_ID" \
+            --output-root data/embedding_models \
+            --variant both
+
       - name: Set up QEMU
         uses: docker/setup-qemu-action@v3
 
@@ -350,6 +403,10 @@ jobs:
           file: ./docker/Dockerfile.full
           platforms: ${{ matrix.platform }}
           push: true
+          build-args: |
+            EMBEDDING_MODEL_REPO=${{ env.EMBEDDING_MODEL_REPO }}
+            EMBEDDING_MODEL_REVISION=${{ env.EMBEDDING_MODEL_REVISION }}
+            EMBEDDING_MODEL_PROFILE_ID=${{ env.EMBEDDING_MODEL_PROFILE_ID }}
           tags: ${{ steps.meta_ghcr.outputs.tags || steps.meta_both.outputs.tags }}
           labels: ${{ steps.meta_ghcr.outputs.labels || steps.meta_both.outputs.labels }}
           cache-from: type=gha
 
@@ -15,15 +15,17 @@ jobs:
       - name: Check lanlan.app (old)
         id: check_app
         run: |
+          CURL_EXIT=0
           HTTP_CODE=$(curl -s -o /tmp/resp_app.json -w "%{http_code}" \
             --max-time 30 --connect-timeout 10 \
             -X POST "https://lanlan.app/text/v1/chat/completions" \
             -H "Content-Type: application/json" \
             -H "Authorization: Bearer free-access" \
             -d '{"model":"free-mini-model","messages":[{"role":"user","content":"sends some useful information"}],"max_completion_tokens":5}' \
-            2>/dev/null || echo "000")
+            2>/dev/null) || CURL_EXIT=$?
           BODY=$(cat /tmp/resp_app.json 2>/dev/null || echo "{}")
           echo "http_code=$HTTP_CODE" >> "$GITHUB_OUTPUT"
+          echo "curl_exit=$CURL_EXIT" >> "$GITHUB_OUTPUT"
 
           if [ "$HTTP_CODE" = "200" ]; then
             if echo "$BODY" | jq -e '(.choices | type) == "array" and (.choices | length) > 0 and (.choices[0].message | type) == "object" and .choices[0].message.role == "assistant" and (.choices[0].message | has("content"))' >/dev/null 2>&1; then
@@ -39,15 +41,17 @@ jobs:
       - name: Check www.lanlan.app (new)
         id: check_api_app
         run: |
+          CURL_EXIT=0
           HTTP_CODE=$(curl -s -o /tmp/resp_api_app.json -w "%{http_code}" \
             --max-time 30 --connect-timeout 10 \
             -X POST "https://www.lanlan.app/text/v1/chat/completions" \
             -H "Content-Type: application/json" \
             -H "Authorization: Bearer free-access" \
             -d '{"model":"free-mini-model","messages":[{"role":"user","content":"sends some useful information"}],"max_completion_tokens":5}' \
-            2>/dev/null || echo "000")
+            2>/dev/null) || CURL_EXIT=$?
           BODY=$(cat /tmp/resp_api_app.json 2>/dev/null || echo "{}")
           echo "http_code=$HTTP_CODE" >> "$GITHUB_OUTPUT"
+          echo "curl_exit=$CURL_EXIT" >> "$GITHUB_OUTPUT"
 
           if [ "$HTTP_CODE" = "200" ]; then
             if echo "$BODY" | jq -e '(.choices | type) == "array" and (.choices | length) > 0 and (.choices[0].message | type) == "object" and .choices[0].message.role == "assistant" and (.choices[0].message | has("content"))' >/dev/null 2>&1; then
@@ -63,15 +67,17 @@ jobs:
       - name: Check www.lanlan.tech (new)
         id: check_api_tech
         run: |
+          CURL_EXIT=0
           HTTP_CODE=$(curl -s -o /tmp/resp_api_tech.json -w "%{http_code}" \
             --max-time 30 --connect-timeout 10 \
             -X POST "https://www.lanlan.tech/text/v1/chat/completions" \
             -H "Content-Type: application/json" \
             -H "Authorization: Bearer free-access" \
             -d '{"model":"free-mini-model","messages":[{"role":"user","content":"sends some useful information"}],"max_completion_tokens":5}' \
-            2>/dev/null || echo "000")
+            2>/dev/null) || CURL_EXIT=$?
           BODY=$(cat /tmp/resp_api_tech.json 2>/dev/null || echo "{}")
           echo "http_code=$HTTP_CODE" >> "$GITHUB_OUTPUT"
+          echo "curl_exit=$CURL_EXIT" >> "$GITHUB_OUTPUT"
 
           if [ "$HTTP_CODE" = "200" ]; then
             if echo "$BODY" | jq -e '(.choices | type) == "array" and (.choices | length) > 0 and (.choices[0].message | type) == "object" and .choices[0].message.role == "assistant" and (.choices[0].message | has("content"))' >/dev/null 2>&1; then
@@ -99,6 +105,22 @@ jobs:
           API_APP_CODE="${{ steps.check_api_app.outputs.http_code }}"
           API_TECH_CODE="${{ steps.check_api_tech.outputs.http_code }}"
 
+          APP_EXIT="${{ steps.check_app.outputs.curl_exit }}"
+          API_APP_EXIT="${{ steps.check_api_app.outputs.curl_exit }}"
+          API_TECH_EXIT="${{ steps.check_api_tech.outputs.curl_exit }}"
+
+          # When curl exits non-zero (no response received, i.e. http_code=000), append the exit
+          # code (28=timeout, 7=connection refused, 6=DNS, 35=TLS) so failure types can be told apart.
+          fmt_code() {
+            case "$2" in
+              "" | 0) echo "$1" ;;
+              *) echo "$1, curl exit $2" ;;
+            esac
+          }
+          APP_CODE_DISP=$(fmt_code "$APP_CODE" "$APP_EXIT")
+          API_APP_CODE_DISP=$(fmt_code "$API_APP_CODE" "$API_APP_EXIT")
+          API_TECH_CODE_DISP=$(fmt_code "$API_TECH_CODE" "$API_TECH_EXIT")
+
           ALL_HEALTHY="true"
           ANY_DOWN="false"
           for h in "$APP_HEALTHY" "$API_APP_HEALTHY" "$API_TECH_HEALTHY"; do
@@ -120,21 +142,21 @@ jobs:
           fi
 
           if [ "$APP_HEALTHY" = "true" ]; then
-            APP_STATUS=":white_check_mark: OK (HTTP $APP_CODE)"
+            APP_STATUS=":white_check_mark: OK (HTTP $APP_CODE_DISP)"
           else
-            APP_STATUS=":x: DOWN (HTTP $APP_CODE)"
+            APP_STATUS=":x: DOWN (HTTP $APP_CODE_DISP)"
           fi
 
           if [ "$API_APP_HEALTHY" = "true" ]; then
-            API_APP_STATUS=":white_check_mark: OK (HTTP $API_APP_CODE)"
+            API_APP_STATUS=":white_check_mark: OK (HTTP $API_APP_CODE_DISP)"
           else
-            API_APP_STATUS=":x: DOWN (HTTP $API_APP_CODE)"
+            API_APP_STATUS=":x: DOWN (HTTP $API_APP_CODE_DISP)"
           fi
 
           if [ "$API_TECH_HEALTHY" = "true" ]; then
-            API_TECH_STATUS=":white_check_mark: OK (HTTP $API_TECH_CODE)"
+            API_TECH_STATUS=":white_check_mark: OK (HTTP $API_TECH_CODE_DISP)"
           else
-            API_TECH_STATUS=":x: DOWN (HTTP $API_TECH_CODE)"
+            API_TECH_STATUS=":x: DOWN (HTTP $API_TECH_CODE_DISP)"
           fi
 
           # 只在有故障时 @everyone
 
@@ -58,6 +58,9 @@ __pycache__/
 .venv_monitor/
 .matplotlib/
 .pyinstaller-config/
+.hypothesis/
+.agent-logs/
+.kiro/
 # Local Windows Nuitka build script — maintainer-only, not for repo
 build_nuitka.bat
 # Playwright browsers cache (used by build_nuitka.bat, not for repo)
@@ -94,6 +97,7 @@ node_modules/
 dist/
 build/
 *.tsbuildinfo
+docs/.vitepress/cache/
 static/react/neko-chat/
 
 # Python venvs and wheels
 
@@ -1342,7 +1342,9 @@ def _check_agent_api_gate() -> Dict[str, Any]:
     try:
         cm = get_config_manager()
         ok, reasons = cm.is_agent_api_ready()
-        return {"ready": ok, "reasons": reasons, "is_free_version": cm.is_free_version()}
+        # 字段名保留 is_free_version（前端/下游 gate 消费者沿用），值取 agent 维度的
+        # is_agent_free()：判 agent 是否走内置免费模型，而非 core/assist 的版本免费。
+        return {"ready": ok, "reasons": reasons, "is_free_version": cm.is_agent_free()}
     except Exception as e:
         return {"ready": False, "reasons": [f"Agent API check failed: {e}"], "is_free_version": False}
 
@@ -2245,7 +2247,15 @@ async def _on_plugin_progress(
                 async def _run_user_plugin_dispatch():
                     try:
                         from utils.instrument import counter as _ic
+                        # agent_invoked 只按 agent_type 分，保持单 key 即"plugin
+                        # 总计"——本地 admin 视图 get_top_counters 按完整 metric_key
+                        # GROUP BY、不做 dim 聚合，若把 plugin_id 塞进这里会把该
+                        # 总计行打散成 per-plugin 行、丢掉聚合。per-plugin 细分另发
+                        # 独立指标 plugin_invoked，其全量之和恒等于本行，互不重复
+                        # 计数。plugin_id 基数由已安装插件数限定，截断兜底防异常长
+                        # id 撑爆 counter key 空间。
                         _ic("agent_invoked", agent_type="plugin")
+                        _ic("plugin_invoked", plugin_id=str(plugin_id or "unknown")[:48])
                     except Exception:
                         pass  # 埋点 best-effort，不阻塞 plugin 分派
                     # Default delivery mode; overridden after the plugin result
@@ -3401,6 +3411,24 @@ async def _http_plugin_provider(force_refresh: bool = False):
         await Modules.agent_bridge.start()
     except Exception as e:
         logger.warning(f"[Agent] Event bridge startup failed: {e}")
+    # 免费版 Agent 每日配额耗尽 → 节流通知前端弹提示（最多每 10 秒一次）。
+    # consume_agent_daily_quota 跑在 worker 线程里调这个回调，用 run_coroutine_threadsafe
+    # 把异步 ZeroMQ emit 调度回 agent_server 的事件循环；不 .result()，保持非阻塞。
+    try:
+        _quota_notify_loop = asyncio.get_running_loop()
+
+        def _notify_agent_quota_exceeded(used: int, limit: int) -> None:
+            try:
+                asyncio.run_coroutine_threadsafe(
+                    _emit_main_event("agent_quota_exceeded", None, used=used, limit=limit),
+                    _quota_notify_loop,
+                )
+            except Exception as e:
+                logger.debug("[Agent] schedule agent_quota_exceeded emit failed: %s", e)
+
+        get_config_manager().register_quota_exceeded_notifier(_notify_agent_quota_exceeded)
+    except Exception as e:
+        logger.warning(f"[Agent] register quota-exceeded notifier failed: {e}")
     # Push initial server status so frontend can render Agent popup without waiting.
     _bump_state_revision()