nuxeo · bdelbosc · May 29, 2026 · May 29, 2026 · May 29, 2026
diff --git a/Dockerfile b/Dockerfile
@@ -48,5 +48,5 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
   CMD curl -f http://localhost:${MCP_PORT}/health || exit 1
 
 # Copy the entrypoint, and use it
-COPY --chown=$NUXEO_USER:0 --chmod=+x Dockerfile-entrypoint.sh entrypoint.sh
+COPY --chown=$NUXEO_USER:0 --chmod=0755 Dockerfile-entrypoint.sh entrypoint.sh
 ENTRYPOINT ["/app/entrypoint.sh"]
diff --git a/USAGE.md b/USAGE.md
@@ -164,6 +164,22 @@ for doc in result['results']:
     print(f"Path: {doc['path']}")
     if 'highlights' in doc:
         print(f"Highlights: {doc['highlights']}")
+
+# Use larger highlight fragments to get meaningful content context
+# without downloading the full ecm:binarytext source field (up to 1 MiB)
+result = use_tool("nuxeo", "search_repository", {
+    "query": "priming sugar carbonation temperature",
+    "limit": 5,
+    "highlight_fragment_size": 2000,       # chars per fragment (default 150, max ~9000000)
+    "highlight_number_of_fragments": 3,    # fragments per doc (default 3, 0=entire field)
+})
+
+# Request extra _source fields when raw content is needed
+result = use_tool("nuxeo", "search_repository", {
+    "query": "water chemistry pH",
+    "limit": 3,
+    "source_fields": ["ecm:binarytext"],   # include full extracted text in results
+})
 ```
 
 ### Elasticsearch Audit Search (NEW - Admin Only)

diff --git a/nuxeo_mcp_config.md b/nuxeo_mcp_config.md
@@ -4,6 +4,12 @@
 
     docker-compose -f docker-compose-demo.yml up
 
+> **Note:** When running the MCP server as a Docker container alongside Nuxeo, set
+> `NUXEO_URL` to the Nuxeo service hostname (e.g. `http://nuxeo:8080/nuxeo`) rather
+> than `localhost`. Inside the MCP container, `localhost` refers to the container
+> itself, so Nuxeo is only reachable through its Docker network service name. This is
+> required for `search_repository` and `search_audit` (Elasticsearch passthrough) to work.
+
 # Nuxeo MCP Server Configuration Examples
 
 This document provides examples of how to configure the Nuxeo MCP server in your Cline MCP settings file.

diff --git a/src/nuxeo_mcp/es_passthrough.py b/src/nuxeo_mcp/es_passthrough.py
@@ -59,6 +59,8 @@ def search_repository(
         limit: int = 20,
         offset: int = 0,
         source_fields: Optional[List[str]] = None,
+        highlight_fragment_size: int = 150,
+        highlight_number_of_fragments: int = 3,
     ) -> Dict[str, Any]:
         """Search repository index using natural language.
 
@@ -69,6 +71,8 @@ def search_repository(
             limit: Maximum number of results
             offset: Pagination offset
             source_fields: Fields to include in response
+            highlight_fragment_size: Size in chars of each highlight fragment (default 150)
+            highlight_number_of_fragments: Number of highlight fragments to return per document (default 3; 0 returns the entire field)
 
         Returns:
             Formatted search results
@@ -84,6 +88,8 @@ def search_repository(
             include_sort=True,
             include_pagination=True,
             include_highlight=True,
+            highlight_fragment_size=highlight_fragment_size,
+            highlight_number_of_fragments=highlight_number_of_fragments,
             apply_acl=True,
             user_principals=[principal] + groups,
             source_includes=source_fields,
@@ -101,7 +107,7 @@ def search_repository(
         )
 
         # Format results
-        return self._format_repository_results(response, json.dumps(es_request))
+        return self._format_repository_results(response, json.dumps(es_request), source_fields=source_fields)
 
     def search_audit(
         self,
@@ -223,7 +229,7 @@ def _get_filter_for_index(self, index: str) -> SearchRequestFilter:
         return self.filters.get(index, self.filters["nuxeo"])
 
     def _format_repository_results(
-        self, es_response: Dict[str, Any], translated_query: str
+        self, es_response: Dict[str, Any], translated_query: str, source_fields: Optional[List[str]] = None
     ) -> Dict[str, Any]:
         """Format Elasticsearch results for repository search.
 
@@ -253,6 +259,12 @@ def _format_repository_results(
                 "creator": source.get("dc:creator", ""),
             }
 
+            # Add any explicitly requested extra source fields
+            if source_fields:
+                for key in source_fields:
+                    if key not in result and key in source:
+                        result[key] = source[key]
+
             # Add highlights if available
             if "highlight" in hit:
                 highlights = []

diff --git a/src/nuxeo_mcp/es_query_builder.py b/src/nuxeo_mcp/es_query_builder.py
@@ -80,7 +80,7 @@ def fulltext_query(
     ) -> Dict[str, Any]:
         """Build a fulltext search query."""
         if fields is None:
-            fields = ["ecm:fulltext", "ecm:fulltext.title^2"]
+            fields = ["all_field"]
 
         return {
             "simple_query_string": {

diff --git a/src/nuxeo_mcp/nl_parser.py b/src/nuxeo_mcp/nl_parser.py
@@ -671,6 +671,8 @@ def parse_to_elasticsearch(
         include_sort: bool = False,
         include_pagination: bool = False,
         include_highlight: bool = False,
+        highlight_fragment_size: int = 150,
+        highlight_number_of_fragments: int = 3,
         apply_acl: bool = False,
         user_principals: Optional[List[str]] = None,
         user_principal: Optional[str] = None,
@@ -722,7 +724,17 @@ def parse_to_elasticsearch(
         # Add highlighting if requested
         if include_highlight:
             request["highlight"] = {
-                "fields": {"dc:title": {}, "dc:description": {}, "ecm:fulltext": {}}
+                # require_field_match=false allows highlighting ecm:binarytext
+                # even though the query targets all_field (a copy_to aggregate)
+                "require_field_match": False,
+                "fields": {
+                    "dc:title": {},
+                    "dc:description": {},
+                    "ecm:binarytext": {
+                        "fragment_size": highlight_fragment_size,
+                        "number_of_fragments": highlight_number_of_fragments,
+                    },
+                }
             }
 
         return request

diff --git a/src/nuxeo_mcp/tools.py b/src/nuxeo_mcp/tools.py
@@ -40,6 +40,9 @@ def one_of_path_or_uid(cls, v):
 
 from typing import Union
 
+# Timeout in seconds for Elasticsearch connectivity probes
+ES_PROBE_TIMEOUT = 10
+
 
 def register_tools(mcp, nuxeo, auth_middleware=None, skip_server_selection: bool = False) -> None:
     """
@@ -1044,7 +1047,7 @@ def natural_search(
             return result
 
     @mcp.tool()
-    async def search_repository(query: str, limit: int = 20, offset: int = 0) -> str:
+    async def search_repository(query: str, limit: int = 20, offset: int = 0, source_fields: Optional[List[str]] = None, highlight_fragment_size: int = 150, highlight_number_of_fragments: int = 3) -> str:
         """
         [REQUIRES ELASTICSEARCH] Search the Nuxeo repository using Elasticsearch passthrough.
 
@@ -1058,6 +1061,9 @@ async def search_repository(query: str, limit: int = 20, offset: int = 0) -> str
             query: Natural language search query (e.g., "PDFs created last week by John")
             limit: Maximum number of results to return (default: 20, max: 100)
             offset: Pagination offset for results (default: 0)
+            source_fields: Extra fields to include in results (e.g. ["ecm:binarytext"])
+            highlight_fragment_size: Size in chars of each highlight fragment (default 150, max ~9000000)
+            highlight_number_of_fragments: Number of highlight fragments to return per doc (default 3, 0=entire field)
 
         Returns:
             JSON string containing search results with document metadata and highlights
@@ -1082,9 +1088,13 @@ async def search_repository(query: str, limit: int = 20, offset: int = 0) -> str
             if limit > 100:
                 limit = 100
 
+            # Enforce lower bounds on highlight params (no upper limit is applied)
+            highlight_fragment_size = max(1, highlight_fragment_size)
+            highlight_number_of_fragments = max(0, highlight_number_of_fragments)
+
             # Initialize passthrough with Nuxeo URL and auth
             # Get the Nuxeo URL and auth from the global nuxeo client
-            nuxeo_url = nuxeo.client.host
+            nuxeo_url = os.environ.get("NUXEO_URL", nuxeo.client.host)
             auth = nuxeo.client.auth
 
             passthrough = ElasticsearchPassthrough(nuxeo_url=nuxeo_url, auth=auth)
@@ -1098,7 +1108,7 @@ async def search_repository(query: str, limit: int = 20, offset: int = 0) -> str
                     test_url, 
                     json=test_query,
                     auth=auth,
-                    timeout=2
+                    timeout=ES_PROBE_TIMEOUT
                 )
                 response.raise_for_status()
             except (requests.RequestException, requests.ConnectionError) as e:
@@ -1117,6 +1127,9 @@ async def search_repository(query: str, limit: int = 20, offset: int = 0) -> str
                 groups=groups,
                 limit=limit,
                 offset=offset,
+                source_fields=source_fields,
+                highlight_fragment_size=highlight_fragment_size,
+                highlight_number_of_fragments=highlight_number_of_fragments,
             )
 
             # Format response
@@ -1179,21 +1192,22 @@ async def search_audit(query: str, limit: int = 20, offset: int = 0) -> str:
 
             # Initialize passthrough with Nuxeo URL and auth
             # Get the Nuxeo URL and auth from the global nuxeo client
-            nuxeo_url = nuxeo.client.host
+            nuxeo_url = os.environ.get("NUXEO_URL", nuxeo.client.host)
             auth = nuxeo.client.auth
 
             passthrough = ElasticsearchPassthrough(nuxeo_url=nuxeo_url, auth=auth)
-
+
+
             # Check if Elasticsearch is accessible through Nuxeo passthrough
             try:
                 # Test with a simple match_all query on audit index
                 test_url = f"{passthrough.base_url}/audit/_search"
                 test_query = {"query": {"match_all": {}}, "size": 0}
                 response = requests.post(
-                    test_url, 
+                    test_url,
                     json=test_query,
                     auth=auth,
-                    timeout=2
+                    timeout=ES_PROBE_TIMEOUT
                 )
                 response.raise_for_status()
             except (requests.RequestException, requests.ConnectionError) as e:

diff --git a/tests/test_es_query_builder.py b/tests/test_es_query_builder.py
@@ -261,7 +261,7 @@ def test_fulltext_query(self):
         expected = {
             "simple_query_string": {
                 "query": "project management document",
-                "fields": ["ecm:fulltext", "ecm:fulltext.title^2"],
+                "fields": ["all_field"],
                 "default_operator": "AND"
             }
         }

diff --git a/tests/test_nl_parser_es.py b/tests/test_nl_parser_es.py
@@ -139,11 +139,31 @@ def test_parse_with_highlighting(self):
         """Test parsing query with highlighting enabled."""
         query = "search for annual reports"
         result = self.parser.parse_to_elasticsearch(query, include_highlight=True)
-        
+
         assert result is not None
         assert "query" in result
         assert "highlight" in result
-        assert "fields" in result["highlight"]
+        highlight = result["highlight"]
+        assert "fields" in highlight
+        # ecm:binarytext is used (not the non-stored all_field)
+        assert "ecm:binarytext" in highlight["fields"]
+        # require_field_match must be False so ecm:binarytext is highlighted
+        # even though the query targets all_field
+        assert highlight.get("require_field_match") is False
+
+    def test_parse_with_highlight_fragment_params(self):
+        """Test that highlight_fragment_size and highlight_number_of_fragments are forwarded."""
+        query = "search for annual reports"
+        result = self.parser.parse_to_elasticsearch(
+            query,
+            include_highlight=True,
+            highlight_fragment_size=2000,
+            highlight_number_of_fragments=5,
+        )
+
+        bt_config = result["highlight"]["fields"]["ecm:binarytext"]
+        assert bt_config["fragment_size"] == 2000
+        assert bt_config["number_of_fragments"] == 5
 
     def test_detect_elasticsearch_intent(self):
         """Test detecting Elasticsearch-specific intent."""