Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -48,5 +48,5 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD curl -f http://localhost:${MCP_PORT}/health || exit 1

# Copy the entrypoint, and use it
COPY --chown=$NUXEO_USER:0 --chmod=+x Dockerfile-entrypoint.sh entrypoint.sh
COPY --chown=$NUXEO_USER:0 --chmod=0755 Dockerfile-entrypoint.sh entrypoint.sh
ENTRYPOINT ["/app/entrypoint.sh"]
16 changes: 16 additions & 0 deletions USAGE.md
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,22 @@ for doc in result['results']:
print(f"Path: {doc['path']}")
if 'highlights' in doc:
print(f"Highlights: {doc['highlights']}")

# Use larger highlight fragments to get meaningful content context
# without downloading the full ecm:binarytext source field (up to 1 MiB)
result = use_tool("nuxeo", "search_repository", {
"query": "priming sugar carbonation temperature",
"limit": 5,
"highlight_fragment_size": 2000, # chars per fragment (default 150, max ~9000000)
"highlight_number_of_fragments": 3, # fragments per doc (default 3, 0=entire field)
})

# Request extra _source fields when raw content is needed
result = use_tool("nuxeo", "search_repository", {
"query": "water chemistry pH",
"limit": 3,
"source_fields": ["ecm:binarytext"], # include full extracted text in results
})
```

### Elasticsearch Audit Search (NEW - Admin Only)
Expand Down
6 changes: 6 additions & 0 deletions nuxeo_mcp_config.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@

docker-compose -f docker-compose-demo.yml up

> **Note:** When running the MCP server as a Docker container alongside Nuxeo, set
> `NUXEO_URL` to the Nuxeo service hostname (e.g. `http://nuxeo:8080/nuxeo`) rather
> than `localhost`. Inside the MCP container, `localhost` resolves to the container
> itself, so Nuxeo is only reachable through its Docker network service name. This is
> required for `search_repository` and `search_audit` (Elasticsearch passthrough) to
> function correctly.

# Nuxeo MCP Server Configuration Examples

This document provides examples of how to configure the Nuxeo MCP server in your Cline MCP settings file.
Expand Down
16 changes: 14 additions & 2 deletions src/nuxeo_mcp/es_passthrough.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ def search_repository(
limit: int = 20,
offset: int = 0,
source_fields: Optional[List[str]] = None,
highlight_fragment_size: int = 150,
highlight_number_of_fragments: int = 3,
) -> Dict[str, Any]:
"""Search repository index using natural language.

Expand All @@ -69,6 +71,8 @@ def search_repository(
limit: Maximum number of results
offset: Pagination offset
source_fields: Fields to include in response
highlight_fragment_size: Size in chars of each highlight fragment (default 150)
highlight_number_of_fragments: Number of highlight fragments to return (default 3)

Returns:
Formatted search results
Expand All @@ -84,6 +88,8 @@ def search_repository(
include_sort=True,
include_pagination=True,
include_highlight=True,
highlight_fragment_size=highlight_fragment_size,
highlight_number_of_fragments=highlight_number_of_fragments,
apply_acl=True,
user_principals=[principal] + groups,
source_includes=source_fields,
Expand All @@ -101,7 +107,7 @@ def search_repository(
)

# Format results
return self._format_repository_results(response, json.dumps(es_request))
return self._format_repository_results(response, json.dumps(es_request), source_fields=source_fields)

def search_audit(
self,
Expand Down Expand Up @@ -223,7 +229,7 @@ def _get_filter_for_index(self, index: str) -> SearchRequestFilter:
return self.filters.get(index, self.filters["nuxeo"])

def _format_repository_results(
self, es_response: Dict[str, Any], translated_query: str
self, es_response: Dict[str, Any], translated_query: str, source_fields: Optional[List[str]] = None
) -> Dict[str, Any]:
"""Format Elasticsearch results for repository search.

Expand Down Expand Up @@ -253,6 +259,12 @@ def _format_repository_results(
"creator": source.get("dc:creator", ""),
}

# Add any explicitly requested extra source fields
if source_fields:
for key in source_fields:
if key not in result and key in source:
result[key] = source[key]

# Add highlights if available
if "highlight" in hit:
highlights = []
Expand Down
2 changes: 1 addition & 1 deletion src/nuxeo_mcp/es_query_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def fulltext_query(
) -> Dict[str, Any]:
"""Build a fulltext search query."""
if fields is None:
fields = ["ecm:fulltext", "ecm:fulltext.title^2"]
fields = ["all_field"]

return {
"simple_query_string": {
Expand Down
14 changes: 13 additions & 1 deletion src/nuxeo_mcp/nl_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -671,6 +671,8 @@ def parse_to_elasticsearch(
include_sort: bool = False,
include_pagination: bool = False,
include_highlight: bool = False,
highlight_fragment_size: int = 150,
highlight_number_of_fragments: int = 3,
Comment thread
bdelbosc marked this conversation as resolved.
apply_acl: bool = False,
user_principals: Optional[List[str]] = None,
user_principal: Optional[str] = None,
Expand Down Expand Up @@ -722,7 +724,17 @@ def parse_to_elasticsearch(
# Add highlighting if requested
if include_highlight:
request["highlight"] = {
"fields": {"dc:title": {}, "dc:description": {}, "ecm:fulltext": {}}
# require_field_match=false allows highlighting ecm:binarytext
# even though the query targets all_field (a copy_to aggregate)
"require_field_match": False,
"fields": {
"dc:title": {},
"dc:description": {},
"ecm:binarytext": {
"fragment_size": highlight_fragment_size,
"number_of_fragments": highlight_number_of_fragments,
},
}
Comment thread
bdelbosc marked this conversation as resolved.
}

return request
Expand Down
28 changes: 21 additions & 7 deletions src/nuxeo_mcp/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ def one_of_path_or_uid(cls, v):

from typing import Union

# Timeout in seconds for Elasticsearch connectivity probes
ES_PROBE_TIMEOUT = 10


def register_tools(mcp, nuxeo, auth_middleware=None, skip_server_selection: bool = False) -> None:
"""
Expand Down Expand Up @@ -1044,7 +1047,7 @@ def natural_search(
return result

@mcp.tool()
async def search_repository(query: str, limit: int = 20, offset: int = 0) -> str:
async def search_repository(query: str, limit: int = 20, offset: int = 0, source_fields: Optional[List[str]] = None, highlight_fragment_size: int = 150, highlight_number_of_fragments: int = 3) -> str:
Comment thread
bdelbosc marked this conversation as resolved.
"""
[REQUIRES ELASTICSEARCH] Search the Nuxeo repository using Elasticsearch passthrough.

Comment thread
bdelbosc marked this conversation as resolved.
Expand All @@ -1058,6 +1061,9 @@ async def search_repository(query: str, limit: int = 20, offset: int = 0) -> str
query: Natural language search query (e.g., "PDFs created last week by John")
limit: Maximum number of results to return (default: 20, max: 100)
offset: Pagination offset for results (default: 0)
source_fields: Extra fields to include in results (e.g. ["ecm:binarytext"])
highlight_fragment_size: Size in chars of each highlight fragment (default 150, max ~9000000)
highlight_number_of_fragments: Number of highlight fragments to return per doc (default 3, 0=entire field)

Returns:
JSON string containing search results with document metadata and highlights
Expand All @@ -1082,9 +1088,13 @@ async def search_repository(query: str, limit: int = 20, offset: int = 0) -> str
if limit > 100:
limit = 100

# Enforce lower bounds on highlight params (fragment size >= 1, fragment count >= 0)
highlight_fragment_size = max(1, highlight_fragment_size)
highlight_number_of_fragments = max(0, highlight_number_of_fragments)

# Initialize passthrough with Nuxeo URL and auth
# Prefer the NUXEO_URL environment variable for the URL, falling back to the
# global nuxeo client's host; auth always comes from the global nuxeo client
nuxeo_url = nuxeo.client.host
nuxeo_url = os.environ.get("NUXEO_URL", nuxeo.client.host)
auth = nuxeo.client.auth

passthrough = ElasticsearchPassthrough(nuxeo_url=nuxeo_url, auth=auth)
Expand All @@ -1098,7 +1108,7 @@ async def search_repository(query: str, limit: int = 20, offset: int = 0) -> str
test_url,
json=test_query,
auth=auth,
timeout=2
timeout=ES_PROBE_TIMEOUT
)
response.raise_for_status()
except (requests.RequestException, requests.ConnectionError) as e:
Expand All @@ -1117,6 +1127,9 @@ async def search_repository(query: str, limit: int = 20, offset: int = 0) -> str
groups=groups,
limit=limit,
offset=offset,
source_fields=source_fields,
highlight_fragment_size=highlight_fragment_size,
highlight_number_of_fragments=highlight_number_of_fragments,
)

# Format response
Expand Down Expand Up @@ -1179,21 +1192,22 @@ async def search_audit(query: str, limit: int = 20, offset: int = 0) -> str:

# Initialize passthrough with Nuxeo URL and auth
# Prefer the NUXEO_URL environment variable for the URL, falling back to the
# global nuxeo client's host; auth always comes from the global nuxeo client
nuxeo_url = nuxeo.client.host
nuxeo_url = os.environ.get("NUXEO_URL", nuxeo.client.host)
auth = nuxeo.client.auth

Comment thread
bdelbosc marked this conversation as resolved.
passthrough = ElasticsearchPassthrough(nuxeo_url=nuxeo_url, auth=auth)



# Check if Elasticsearch is accessible through Nuxeo passthrough
try:
# Test with a simple match_all query on audit index
test_url = f"{passthrough.base_url}/audit/_search"
test_query = {"query": {"match_all": {}}, "size": 0}
response = requests.post(
test_url,
test_url,
json=test_query,
auth=auth,
timeout=2
timeout=ES_PROBE_TIMEOUT
)
response.raise_for_status()
except (requests.RequestException, requests.ConnectionError) as e:
Expand Down
2 changes: 1 addition & 1 deletion tests/test_es_query_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,7 @@ def test_fulltext_query(self):
expected = {
"simple_query_string": {
"query": "project management document",
"fields": ["ecm:fulltext", "ecm:fulltext.title^2"],
"fields": ["all_field"],
"default_operator": "AND"
}
}
Expand Down
24 changes: 22 additions & 2 deletions tests/test_nl_parser_es.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,11 +139,31 @@ def test_parse_with_highlighting(self):
"""Test parsing query with highlighting enabled."""
query = "search for annual reports"
result = self.parser.parse_to_elasticsearch(query, include_highlight=True)

assert result is not None
assert "query" in result
assert "highlight" in result
assert "fields" in result["highlight"]
highlight = result["highlight"]
assert "fields" in highlight
# ecm:binarytext is used (not the non-stored all_field)
assert "ecm:binarytext" in highlight["fields"]
# require_field_match must be False so ecm:binarytext is highlighted
# even though the query targets all_field
assert highlight.get("require_field_match") is False

def test_parse_with_highlight_fragment_params(self):
    """Verify custom highlight sizing reaches the ecm:binarytext highlight config.

    Builds an Elasticsearch request with highlighting enabled and explicit
    fragment size / fragment count values, then checks that both values
    appear unchanged under the ``ecm:binarytext`` highlight field.
    """
    es_request = self.parser.parse_to_elasticsearch(
        "search for annual reports",
        include_highlight=True,
        highlight_fragment_size=2000,
        highlight_number_of_fragments=5,
    )

    # Only ecm:binarytext carries the tunable fragment options.
    binarytext_options = es_request["highlight"]["fields"]["ecm:binarytext"]
    assert binarytext_options["fragment_size"] == 2000
    assert binarytext_options["number_of_fragments"] == 5

def test_detect_elasticsearch_intent(self):
"""Test detecting Elasticsearch-specific intent."""
Expand Down
Loading