Optimize websearch pipeline (#259)

JCHAVEROT · web-flow · commit 8ea7a40800d4 · 2026-04-20T12:18:06.000+02:00
* feat: replace snippet truncation with token-aware accumulation in websearch

* refactor: make torch optional for API-only websearch

* chore: apply review changes

* fix: improve generate summary and evaluate subquery relevance prompts

* fix: correct the `all` extra in pyproject

* fix: use LLM tokenizer to retrieve exact token count and use it when truncating

* feat: improve and standardize prompts

* feat: prevent duplicated results

* chore: refactor and apply review changes

* chore: apply review changes

* chore: apply last review changes

* feat: add first version of websearch tests

* fix: improve tests and remove useless ones

* fix: add websearch extra to CI

* fix: apply review changes

* chore: clean tests

* chore: add back heuristic 4 chars/token to count tokens

* fix: add warning in case no local tokenizer found

* fix: use margin in no-tokenizer fallback when truncating

* chore: apply copilot feedback

* feat: add fast_tokenizer parameter in config file

* chore: update websearch doc

* chore: apply changes following copilot review

* fix: apply review changes

* tests: apply fixes to make tests independent of prompts in src code

* fix: apply review changes
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -31,7 +31,7 @@ jobs:
       - name: Install dependencies (using uv)
         run: |
           source .venv/bin/activate
-          uv pip install -e ".[process,index,rag,api,cpu,dev]"
+          uv pip install -e ".[process,index,rag,api,cpu,dev,websearch]"
 
       - name: Show installed cohere and langchain-cohere versions
         run: |
diff --git a/docs/websearch.md b/docs/websearch.md
@@ -48,6 +48,8 @@ Users can adjust the pipeline according to their [requirements](/examples/websea
 - `use_summary`: Activates summarization of retrieved web snippets.
 - `n_loops`: Defines the number of search iterations to refine results.
 - `n_subqueries`: Specifies the number of subqueries generated for each input query.
+- `max_context_tokens`: Maximum token budget for prompts (default: 2048).
+- `fast_tokenizer`: If true, estimates tokens as ~4 chars/token instead of using the LLM tokenizer, faster but approximate (default: false).
 
 ### Workflow
 
@@ -56,7 +58,7 @@ Users can adjust the pipeline according to their [requirements](/examples/websea
 1. **Input Query Processing:**
    - The pipeline processes the user query and generates subqueries for web searches in order to complete the current knowledge.
 2. **WebSearch Execution:**
-   - DuckDuckGo searches are performed for each subquery
+   - Web searches are performed for each subquery using the configured provider
 3. **Summarization:**
    - Retrieved web snippets are summarized using an LLM if `use_summary` is enabled.
 4. **Integration with RAG (if use_rag):**
diff --git a/examples/websearchRAG/config.yaml b/examples/websearchRAG/config.yaml
@@ -11,6 +11,7 @@ websearch:
   search_provider: duckduckgo
   max_retries: 3
   max_context_tokens: 2048
+  fast_tokenizer: false
   mode: local
   llm_config:
     llm_name: OpenMeditron/meditron3-8b # Qwen/Qwen2-0.5B 
diff --git a/pyproject.toml b/pyproject.toml
@@ -126,7 +126,7 @@ api = [
 # --- Composite + variant extras ---
 
 all = [
-    "mmore[process,rag,api]",
+    "mmore[process,rag,api,websearch]",
 ]
 
 cpu = [
diff --git a/src/mmore/rag/llm.py b/src/mmore/rag/llm.py
@@ -4,7 +4,11 @@
 # from getpass import getpass
 from typing import ClassVar, Optional, cast
 
-import torch
+try:
+    import torch
+except ImportError:
+    torch = None
+
 from langchain_anthropic import ChatAnthropic
 from langchain_cohere import ChatCohere
 from langchain_core.language_models.chat_models import BaseChatModel
@@ -146,9 +150,16 @@ class LLM(BaseChatModel):
     """Class parsing the model name and arguments to load the correct LangChain model"""
 
     device_count: ClassVar[int] = 0
-    nb_devices: ClassVar[int] = (
-        torch.cuda.device_count() if torch.cuda.is_available() else 1
-    )
+    nb_devices: ClassVar[Optional[int]] = None
+
+    @classmethod
+    def _get_nb_devices(cls) -> int:
+        if cls.nb_devices is None:
+            if torch is not None and torch.cuda.is_available():
+                cls.nb_devices = torch.cuda.device_count()
+            else:
+                cls.nb_devices = 1
+        return cls.nb_devices
 
     @staticmethod
     def _check_key(provider):
@@ -165,6 +176,11 @@ def from_config(cls, config: str | LLMConfig) -> BaseChatModel:
             config = load_config(config, LLMConfig)
 
         if config.provider == "HF":
+            if torch is None:
+                raise ImportError(
+                    "torch is required for HuggingFace models. "
+                    "Install it with: uv pip install 'mmore[cpu]' or uv pip install 'mmore[cu126]'"
+                )
             if torch.backends.mps.is_available():
                 return ChatHuggingFace(
                     llm=HuggingFacePipeline.from_model_id(
@@ -176,7 +192,7 @@ def from_config(cls, config: str | LLMConfig) -> BaseChatModel:
                 )
             if torch.cuda.is_available():
                 current_device = cls.device_count
-                cls.device_count = (cls.device_count + 1) % cls.nb_devices
+                cls.device_count = (cls.device_count + 1) % cls._get_nb_devices()
             else:
                 current_device = -1
 
diff --git a/src/mmore/run_websearch.py b/src/mmore/run_websearch.py
@@ -5,7 +5,11 @@
 from dataclasses import dataclass
 from typing import Optional, Union
 
-import torch
+try:
+    import torch
+except ImportError:
+    torch = None
+
 import uvicorn
 from dotenv import load_dotenv
 from fastapi import FastAPI
@@ -24,9 +28,10 @@
 
 
 # CUDA tweaks for best perf
-torch.backends.cuda.enable_mem_efficient_sdp(False)
-torch.backends.cuda.enable_flash_sdp(False)
-torch.backends.cuda.enable_math_sdp(True)
+if torch is not None and torch.cuda.is_available():
+    torch.backends.cuda.enable_mem_efficient_sdp(False)
+    torch.backends.cuda.enable_flash_sdp(False)
+    torch.backends.cuda.enable_math_sdp(True)
 
 
 @dataclass
diff --git a/src/mmore/websearchRAG/config.py b/src/mmore/websearchRAG/config.py
@@ -26,6 +26,7 @@ class WebsearchConfig:
       max_retries:        (int) Max retries for search on rate limit (default: 3).
       search_provider:    (str) Search provider: 'duckduckgo' (default, free) or 'tavily' (requires TAVILY_API_KEY, pip install "mmore[rag,websearch]").
       max_context_tokens: (int) Maximum number of context tokens for constructing prompts (default: 2048).
+      fast_tokenizer:     (bool) If True, use a fast heuristic (~4 chars/token) instead of the LLM tokenizer (default: False).
       llm_config:         (dict) Passed to rag.llm.LLMConfig (keys: llm_name, max_new_tokens, temperature, etc.)
       mode:               (str) Mode of operation ("local" or "api").
     """
@@ -43,6 +44,7 @@ class WebsearchConfig:
     max_retries: int = 3
     search_provider: Literal["duckduckgo", "tavily"] = "duckduckgo"
     max_context_tokens: int = 2048
+    fast_tokenizer: bool = False
 
     llm_config: LLMConfig = field(
         default_factory=lambda: LLMConfig(
diff --git a/src/mmore/websearchRAG/pipeline.py b/src/mmore/websearchRAG/pipeline.py
diff --git a/tests/test_websearch_pipeline.py b/tests/test_websearch_pipeline.py

Original file line number	Diff line number	Diff line change
`@@ -126,7 +126,7 @@ api = [`
`126`	`126`	`# --- Composite + variant extras ---`
`127`	`127`
`128`	`128`	`all = [`
`129`		`- "mmore[process,rag,api]",`
	`129`	`+ "mmore[process,rag,api,websearch]",`
`130`	`130`	`]`
`131`	`131`
`132`	`132`	`cpu = [`