ok Merge branch 'main' of github.com:modelscope/sirchmunk into release/0.0.6

wangxingjun778 · wangxingjun778 · commit 65e8e2a7737f · 2026-03-12T02:48:27.000+08:00
diff --git a/README.md b/README.md
@@ -462,6 +462,8 @@ docker pull modelscope-registry.cn-beijing.cr.aliyuncs.com/modelscope-repo/sirch
 # Start the service
 docker run -d \
   --name sirchmunk \
+  --cpus="4" \
+  --memory="2g" \
   -p 8584:8584 \
   -e LLM_API_KEY="your-api-key-here" \
   -e LLM_BASE_URL="https://api.openai.com/v1" \
diff --git a/README_zh.md b/README_zh.md
@@ -461,6 +461,8 @@ docker pull modelscope-registry.cn-beijing.cr.aliyuncs.com/modelscope-repo/sirch
 # 启动服务
 docker run -d \
   --name sirchmunk \
+  --cpus="4" \
+  --memory="2g" \
   -p 8584:8584 \
   -e LLM_API_KEY="your-api-key-here" \
   -e LLM_BASE_URL="https://api.openai.com/v1" \
diff --git a/src/sirchmunk/api/main.py b/src/sirchmunk/api/main.py
@@ -75,6 +75,19 @@
     allow_headers=["*"],
 )
 
+
+@app.on_event("startup")
+def _prewarm_chat_search():
+    """Best-effort startup hook: create the chat search singleton so the embedding model starts loading early.
+    Failures are logged and swallowed so that a prewarm problem never prevents the API from starting."""
+    try:
+        from .chat import get_search_instance
+        get_search_instance()
+    except Exception:
+        import logging  # local import keeps this hook self-contained
+        logging.getLogger(__name__).warning("Chat search prewarm failed", exc_info=True)
+
+
 # Include all API routers (registered before static mount so they take priority)
 app.include_router(knowledge_router)
 app.include_router(settings_router)
diff --git a/src/sirchmunk/search.py b/src/sirchmunk/search.py
@@ -351,13 +351,13 @@ async def _try_reuse_cluster(self, query: str) -> Optional[KnowledgeCluster]:
             return None
 
         try:
-            # Wait for the model (non-blocking via executor) instead of
-            # returning None immediately — this ensures reuse works even
-            # on the very first search call.
+            # Wait briefly for the model so reuse can work when it's already loading.
+            # Use a short timeout to avoid blocking the first request (e.g. in Docker
+            # the model may take 30–60s to load; we skip reuse and do full search instead).
             if not self.embedding_client.is_ready():
                 self.embedding_client.start_loading()
                 try:
-                    await self.embedding_client._ensure_model_async(timeout=60)
+                    await self.embedding_client._ensure_model_async(timeout=5)
                 except Exception:
                     await self._logger.debug(
                         "Embedding model not ready yet, skipping cluster reuse"
@@ -486,34 +486,42 @@ async def _save_cluster_with_embedding(self, cluster: KnowledgeCluster) -> None:
                 await self._logger.warning(f"Failed to save knowledge cluster: {update_error}")
                 return
 
-        # Compute and store embedding for the cluster.
-        # embed() internally awaits model readiness via _ensure_model_async(),
-        # so even if the background loading thread hasn't finished yet, we
-        # block (non-blocking async) until the model is ready rather than
-        # silently skipping the embedding — which would make the cluster
-        # invisible to future similarity searches.
+        # Compute and store embedding for the cluster when the model is ready.
+        # Use a short wait to avoid blocking the response if the model is still
+        # loading (e.g. first request in Docker). If not ready, skip embedding
+        # so the cluster is still saved and can be reused after the next load.
         if self.embedding_client:
             try:
-                from sirchmunk.utils.embedding_util import compute_text_hash
-
-                combined_text = self.knowledge_storage.combine_cluster_fields(
-                    cluster.queries
-                )
-                text_hash = compute_text_hash(combined_text)
+                if not self.embedding_client.is_ready():
+                    try:
+                        await self.embedding_client._ensure_model_async(timeout=3)
+                    except Exception:
+                        pass
+                if self.embedding_client.is_ready():
+                    from sirchmunk.utils.embedding_util import compute_text_hash
+
+                    combined_text = self.knowledge_storage.combine_cluster_fields(
+                        cluster.queries
+                    )
+                    text_hash = compute_text_hash(combined_text)
 
-                embedding_vector = (await self.embedding_client.embed([combined_text]))[0]
+                    embedding_vector = (await self.embedding_client.embed([combined_text]))[0]
 
-                await self.knowledge_storage.store_embedding(
-                    cluster_id=cluster.id,
-                    embedding_vector=embedding_vector,
-                    embedding_model=self.embedding_client.model_id,
-                    embedding_text_hash=text_hash,
-                )
+                    await self.knowledge_storage.store_embedding(
+                        cluster_id=cluster.id,
+                        embedding_vector=embedding_vector,
+                        embedding_model=self.embedding_client.model_id,
+                        embedding_text_hash=text_hash,
+                    )
 
-                await self._logger.info(
-                    f"Stored embedding for cluster {cluster.id} "
-                    f"(dim={len(embedding_vector)}, model={self.embedding_client.model_id})"
-                )
+                    await self._logger.info(
+                        f"Stored embedding for cluster {cluster.id} "
+                        f"(dim={len(embedding_vector)}, model={self.embedding_client.model_id})"
+                    )
+                else:
+                    await self._logger.debug(
+                        f"Embedding model not ready — skipping embedding for cluster {cluster.id}"
+                    )
 
             except Exception as e:
                 await self._logger.warning(f"Failed to compute embedding for cluster {cluster.id}: {e}")