Skip to content

Commit f4b129f

Browse files
fix(mcp): resolve MCP service breakage from missing deps, asyncio errors, and lazy imports
- Move cachetools to core dependencies (was hidden in vectordb extra, needed by BaseRepository which every service imports)
- Make HuggingFaceEmbeddings import lazy in rag/embeddings.py so RAG service works with OpenAI API embeddings without requiring sentence-transformers/PyTorch
- Add run_async_safely() utility to prevent "asyncio.run() cannot be called from a running event loop" errors in rag_service and discovery/scheduler
- Fix _PromptsChangeHandler to inherit from FileSystemEventHandler (provides required dispatch() method for watchdog)
- Fix CLI lazy loading to check sys.argv for mcp/db/setup commands before argparse, and make MCP handler pipeline arg optional
- Fix docker-compose.dev.yml: use docker/mcp/Dockerfile with explicit command override and PYTHONPATH for source mounts
- Remove unnecessary vectordb extra from all service groups (cachetools is now a core dep)
- Remove redundant pydocstyle pre-commit hook (ruff already handles docstring linting and the standalone hook conflicted with the codebase's style)
- Fix pre-existing ruff B028 and W505 lint issues in touched files
- Bump version to 0.3.0-alpha.3
- Add comprehensive regression tests for MCP config, async utils, and imports
1 parent 636d031 commit f4b129f

File tree

14 files changed

+690
-88
lines changed

14 files changed

+690
-88
lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -33,14 +33,7 @@ repos:
3333
args: ['-c', 'pyproject.toml']
3434
additional_dependencies: ['bandit[toml]']
3535

36-
# Docstring checks
37-
- repo: https://github.com/pycqa/pydocstyle
38-
rev: 6.3.0
39-
hooks:
40-
- id: pydocstyle
41-
args: [--convention=google]
42-
exclude: ^tests/
43-
36+
# Docstring checks handled by ruff (pydocstyle rules via [tool.ruff.lint.pydocstyle])
4437
# Import sorting handled by ruff (select includes "I" rules)
4538

4639
# Run unit tests before push (catches failures before CI)

docker-compose.dev.yml

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -255,11 +255,12 @@ services:
255255
profiles: ["microservices"]
256256
build:
257257
context: .
258-
dockerfile: Dockerfile
259-
# target: development # Not using multi-stage build
258+
dockerfile: docker/mcp/Dockerfile
259+
target: development
260260
args:
261-
SERVICE_EXTRAS: "service-mcp" # Includes api,pdf,mcp,discovery,langchain,vectordb,embeddings,memory
261+
SERVICE_EXTRAS: "service-mcp"
262262
container_name: thoth-dev-mcp
263+
command: ["python", "-m", "thoth", "mcp", "http", "--host", "0.0.0.0", "--port", "8001", "--log-level", "DEBUG"]
263264
user: "1000:1000" # Run as host user for proper file permissions
264265
env_file:
265266
- .env
@@ -284,6 +285,9 @@ services:
284285
# Development settings (REQUIRED - dev-specific configuration)
285286
- THOTH_LOG_LEVEL=DEBUG
286287

288+
# Prioritize mounted source over installed package for hot-reload
289+
- PYTHONPATH=/app/src:${PYTHONPATH:-}
290+
287291
# Hot-reload configuration
288292
# Automatically reload when vault/thoth/_thoth/settings.json changes
289293
# No container restart needed!

pyproject.toml

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ name = "thoth"
77
description = "Thoth AI Research Agent - An autonomous system for academic research paper processing and knowledge management"
88
readme = "README.md"
99
requires-python = ">=3.12, <3.13"
10-
version = '0.3.0-alpha.2'
10+
version = '0.3.0-alpha.3'
1111
authors = [{name="Nick Hallmark", email="NHALLMARK3@BLOOMBERG.NET"}]
1212

1313
dependencies = [
@@ -23,6 +23,7 @@ dependencies = [
2323
"fuzzywuzzy>=0.18.0",
2424
"python-levenshtein>=0.27.3",
2525
"matplotlib>=3.10.3",
26+
"cachetools>=5.3.0", # Required by BaseRepository (used by all services)
2627
]
2728

2829
[project.optional-dependencies]
@@ -101,9 +102,8 @@ langchain = [
101102
]
102103

103104
# Vector database utilities
104-
vectordb = [
105-
"cachetools>=5.3.0", # Required by BaseRepository
106-
]
105+
# NOTE: cachetools was moved to core dependencies since BaseRepository needs it
106+
vectordb = []
107107

108108
# Embeddings (VERY HEAVY: ~2GB with PyTorch)
109109
embeddings = [
@@ -125,35 +125,33 @@ memory = [
125125
# ==============================================================================
126126

127127
# thoth-api: REST API server (~1.5GB)
128-
# Core API server - does NOT need vectordb/embeddings by default
129-
# Add them if you want RAG features with local embeddings
128+
# Core API server - cachetools is now a core dep so vectordb not needed here
130129
service-api = [
131130
"thoth[api,discovery,pdf,langchain]",
132131
]
133132

134133
# thoth-dashboard: Dashboard sync (~1.5GB)
135-
# Needs vectordb for cachetools (BaseRepository), but NOT embeddings
136134
service-dashboard = [
137-
"thoth[api,discovery,pdf,langchain,vectordb]",
135+
"thoth[api,discovery,pdf,langchain]",
138136
]
139137

140138
# thoth-discovery: Discovery scheduler (~1.5GB)
141-
# Needs langchain for AnthropicClient + vectordb for cachetools + pdf for pypdf (pipelines)
139+
# Needs langchain for AnthropicClient + pdf for pypdf (pipelines)
142140
service-discovery = [
143-
"thoth[api,discovery,pdf,langchain,vectordb]",
141+
"thoth[api,discovery,pdf,langchain]",
144142
]
145143

146144
# thoth-mcp: MCP server with 60+ research tools (~2GB)
145+
# thoth-mcp: MCP protocol interface (~1.5GB)
147146
# Provides MCP protocol interface to Thoth research tools
148-
# Includes discovery and document processing capabilities
149147
# embeddings and memory (letta) are OPTIONAL - users can add if needed
150148
service-mcp = [
151149
"thoth[api,pdf,mcp,discovery,langchain]",
152150
]
153151

154152
# thoth-monitor: PDF processing (~2GB)
155153
# Watches for PDFs and processes them
156-
# embeddings and vectordb are OPTIONAL - add if you want local embeddings
154+
# embeddings are OPTIONAL - add if you want local embeddings
157155
service-monitor = [
158156
"thoth[api,pdf,discovery,mcp,langchain]",
159157
]

src/thoth/cli/main.py

Lines changed: 32 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -35,28 +35,35 @@ def _configure_safe_environment() -> None:
3535
def main() -> None:
3636
"""Main entry point for the Thoth CLI."""
3737
logger.info('===== main(): Function called, parsing arguments =====')
38-
parser = argparse.ArgumentParser(
39-
description='Thoth - Academic PDF processing system'
40-
)
41-
subparsers = parser.add_subparsers(
42-
dest='command', help='Command to run', required=True
43-
)
4438

45-
# Always register setup command
46-
setup_cli.configure_subparser(subparsers)
39+
# Quick-check the command from sys.argv before argparse to enable lazy
40+
# loading. This avoids importing heavy dependencies (ThothPipeline,
41+
# initialize_thoth, etc.) for lightweight commands like setup, db, and mcp.
42+
import sys
4743

48-
# Parse args early to check if we need other modules
49-
args = parser.parse_args()
44+
raw_command = sys.argv[1] if len(sys.argv) > 1 else None
45+
46+
if raw_command in ('setup', 'db', 'mcp'):
47+
parser = argparse.ArgumentParser(
48+
description='Thoth - Academic PDF processing system'
49+
)
50+
subparsers = parser.add_subparsers(
51+
dest='command', help='Command to run', required=True
52+
)
53+
54+
# Only register the subparser we actually need
55+
setup_cli.configure_subparser(subparsers)
5056

51-
# Skip heavy imports for setup and db commands
52-
if args.command in ['setup', 'db']:
53-
# For db command, only import database module
54-
if args.command == 'db':
57+
if raw_command == 'db':
5558
from . import database
5659

5760
database.configure_subparser(subparsers)
58-
# Re-parse after adding db subparser
59-
args = parser.parse_args()
61+
elif raw_command == 'mcp':
62+
from . import mcp
63+
64+
mcp.configure_subparser(subparsers)
65+
66+
args = parser.parse_args()
6067

6168
if hasattr(args, 'func'):
6269
if inspect.iscoroutinefunction(args.func):
@@ -65,7 +72,15 @@ def main() -> None:
6572
args.func(args)
6673
return
6774

68-
# Import all other CLI modules (only when not running setup/db)
75+
# Full CLI mode — register all subparsers and initialize pipeline
76+
parser = argparse.ArgumentParser(
77+
description='Thoth - Academic PDF processing system'
78+
)
79+
subparsers = parser.add_subparsers(
80+
dest='command', help='Command to run', required=True
81+
)
82+
83+
# Import all CLI modules (full pipeline mode)
6984
from . import (
7085
database,
7186
discovery,

src/thoth/cli/mcp.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,15 @@
55
with different transport configurations.
66
"""
77

8+
from __future__ import annotations
9+
810
import sys
11+
from typing import TYPE_CHECKING
912

1013
from loguru import logger
1114

12-
from thoth.pipeline import ThothPipeline
15+
if TYPE_CHECKING:
16+
from thoth.pipeline import ThothPipeline
1317

1418

1519
def configure_subparser(subparsers):
@@ -103,7 +107,7 @@ def configure_subparser(subparsers):
103107
legacy_parser.set_defaults(func=run_http_server)
104108

105109

106-
def run_stdio_server(args, _pipeline: ThothPipeline):
110+
def run_stdio_server(args, _pipeline: ThothPipeline | None = None):
107111
"""Run MCP server with stdio transport only."""
108112
from thoth.mcp.launcher import run_stdio_server
109113

@@ -115,7 +119,7 @@ def run_stdio_server(args, _pipeline: ThothPipeline):
115119
run_stdio_server()
116120

117121

118-
def run_http_server(args, _pipeline: ThothPipeline):
122+
def run_http_server(args, _pipeline: ThothPipeline | None = None):
119123
"""Run MCP server with HTTP transport only."""
120124
import sys
121125

@@ -129,7 +133,7 @@ def run_http_server(args, _pipeline: ThothPipeline):
129133
run_http_server(host=args.host, port=args.port)
130134

131135

132-
def run_full_server(args, _pipeline: ThothPipeline):
136+
def run_full_server(args, _pipeline: ThothPipeline | None = None):
133137
"""Run MCP server with all transports."""
134138
import sys
135139

src/thoth/discovery/scheduler.py

Lines changed: 9 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -434,8 +434,7 @@ def _load_schedule_state(self) -> dict[str, Any]:
434434

435435
def _load_from_postgres(self) -> dict[str, Any]:
436436
"""Load discovery schedule from PostgreSQL."""
437-
import asyncpg # noqa: I001
438-
import asyncio
437+
import asyncpg
439438

440439
db_url = (
441440
getattr(self.config.secrets, 'database_url', None)
@@ -467,16 +466,10 @@ async def load():
467466
finally:
468467
await conn.close()
469468

470-
# Use asyncio.run() to create a new event loop - safe from any thread
471-
try:
472-
return asyncio.run(load())
473-
except RuntimeError:
474-
# If there's already an event loop running, create a new one in a thread
475-
import concurrent.futures
469+
# Use run_async_safely to handle both sync and async calling contexts.
470+
from thoth.utilities.async_utils import run_async_safely
476471

477-
with concurrent.futures.ThreadPoolExecutor() as executor:
478-
future = executor.submit(lambda: asyncio.run(load()))
479-
return future.result()
472+
return run_async_safely(load())
480473

481474
def _save_schedule_state(self) -> None:
482475
"""
@@ -489,8 +482,7 @@ def _save_schedule_state(self) -> None:
489482

490483
def _save_to_postgres(self) -> None:
491484
"""Save discovery schedule to PostgreSQL."""
492-
import asyncpg # noqa: I001
493-
import asyncio
485+
import asyncpg
494486

495487
db_url = (
496488
getattr(self.config.secrets, 'database_url', None)
@@ -535,16 +527,10 @@ async def save():
535527
finally:
536528
await conn.close()
537529

538-
# Use asyncio.run() to create a new event loop - safe from any thread
539-
try:
540-
asyncio.run(save())
541-
except RuntimeError:
542-
# If there's already an event loop running, create a new one in a thread
543-
import concurrent.futures
544-
545-
with concurrent.futures.ThreadPoolExecutor() as executor:
546-
future = executor.submit(lambda: asyncio.run(save()))
547-
future.result()
530+
# Use run_async_safely to handle both sync and async calling contexts.
531+
from thoth.utilities.async_utils import run_async_safely
532+
533+
run_async_safely(save())
548534

549535
def sync_with_discovery_manager(self) -> None:
550536
"""

src/thoth/rag/embeddings.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,16 @@
88
import os
99
from typing import Any
1010

11-
from langchain_huggingface import HuggingFaceEmbeddings
1211
from langchain_openai import OpenAIEmbeddings
1312
from loguru import logger
1413

1514
from thoth.config import config
1615

16+
# HuggingFaceEmbeddings is imported lazily to avoid requiring the heavy
17+
# sentence-transformers / PyTorch dependencies when using API-based embeddings
18+
# (OpenAI). Install with: uv sync --extra embeddings
19+
HuggingFaceEmbeddings = None # type: ignore[assignment,misc]
20+
1721

1822
class EmbeddingManager:
1923
"""
@@ -133,6 +137,23 @@ def _init_openai_embeddings(self) -> None:
133137

134138
def _init_local_embeddings(self) -> None:
135139
"""Initialize local sentence-transformers embeddings."""
140+
global HuggingFaceEmbeddings
141+
if HuggingFaceEmbeddings is None:
142+
try:
143+
from langchain_huggingface import (
144+
HuggingFaceEmbeddings as _HFE,
145+
)
146+
147+
HuggingFaceEmbeddings = _HFE
148+
except ImportError:
149+
raise ImportError(
150+
'Local embeddings require langchain-huggingface and '
151+
'sentence-transformers. Install with: uv sync --extra embeddings\n'
152+
'Alternatively, configure an OpenAI embedding model '
153+
'(e.g., "openai/text-embedding-3-small") in settings to '
154+
'use API-based embeddings without heavy dependencies.'
155+
) from None
156+
136157
try:
137158
# Create local embeddings using HuggingFace/sentence-transformers
138159
# with safer configuration to prevent segfaults

0 commit comments

Comments
 (0)