From 4b80bd59aa2062cdbc1e2f1584994b9c15a24943 Mon Sep 17 00:00:00 2001 From: txxxxz Date: Tue, 14 Apr 2026 20:58:17 +0800 Subject: [PATCH 1/4] Add structure note workspace and generation flow --- deeptutor/api/main.py | 16 +- deeptutor/api/routers/settings.py | 9 +- deeptutor/api/routers/structure_note.py | 404 ++++ deeptutor/api/routers/system.py | 5 +- deeptutor/services/__init__.py | 3 + deeptutor/services/config/loader.py | 32 +- deeptutor/services/path_service.py | 47 +- deeptutor/services/setup/init.py | 14 +- deeptutor/services/structure_note/__init__.py | 40 + .../services/structure_note/difficulty.py | 64 + .../services/structure_note/generator.py | 571 +++++ .../services/structure_note/image_pipeline.py | 132 ++ deeptutor/services/structure_note/manager.py | 592 ++++++ .../structure_note/markdown_postprocessor.py | 252 +++ deeptutor/services/structure_note/models.py | 184 ++ .../services/structure_note/normalizer.py | 49 + .../services/structure_note/page_index.py | 108 + deeptutor/services/structure_note/planner.py | 125 ++ deeptutor/services/structure_note/renderer.py | 178 ++ deeptutor/services/structure_note/storage.py | 185 ++ .../services/structure_note/tree_builder.py | 205 ++ docs/features/overview.md | 162 ++ docs/guide/data-preparation.md | 185 ++ docs/guide/local-start.md | 190 ++ docs/guide/pre-config.md | 201 ++ docs/guide/troubleshooting.md | 170 ++ docs/index.md | 77 + docs/roadmap.md | 41 + docs/testdoc/structure-note-prd.md | 278 +++ docs/testdoc/structure-note-technical-plan.md | 539 +++++ docs/zh/features/overview.md | 57 + docs/zh/guide/data-preparation.md | 185 ++ docs/zh/guide/local-conda-cursor.md | 56 + docs/zh/guide/local-start.md | 190 ++ docs/zh/guide/pre-config.md | 201 ++ docs/zh/guide/troubleshooting.md | 48 + docs/zh/index.md | 77 + pyproject.toml | 3 + requirements/cli.txt | 1 + requirements/server.txt | 1 + tests/api/test_structure_note_router.py | 325 +++ tests/services/test_path_service.py | 94 +- 
tests/services/test_runtime_storage_guard.py | 11 +- tests/services/test_structure_note_service.py | 373 ++++ web/app/(workspace)/structure-note/page.tsx | 1876 +++++++++++++++++ web/components/sidebar/SidebarShell.tsx | 151 +- web/components/ui/Button.tsx | 51 +- web/lib/latex.ts | 287 ++- web/lib/structure-note-api.ts | 275 +++ web/locales/en/app.json | 115 +- web/locales/zh/app.json | 117 +- web/package-lock.json | 1 + web/scripts/route_budgets.mjs | 135 +- web/tests/e2e/structure-note.audit.ts | 25 + 54 files changed, 9344 insertions(+), 369 deletions(-) create mode 100644 deeptutor/api/routers/structure_note.py create mode 100644 deeptutor/services/structure_note/__init__.py create mode 100644 deeptutor/services/structure_note/difficulty.py create mode 100644 deeptutor/services/structure_note/generator.py create mode 100644 deeptutor/services/structure_note/image_pipeline.py create mode 100644 deeptutor/services/structure_note/manager.py create mode 100644 deeptutor/services/structure_note/markdown_postprocessor.py create mode 100644 deeptutor/services/structure_note/models.py create mode 100644 deeptutor/services/structure_note/normalizer.py create mode 100644 deeptutor/services/structure_note/page_index.py create mode 100644 deeptutor/services/structure_note/planner.py create mode 100644 deeptutor/services/structure_note/renderer.py create mode 100644 deeptutor/services/structure_note/storage.py create mode 100644 deeptutor/services/structure_note/tree_builder.py create mode 100644 docs/features/overview.md create mode 100644 docs/guide/data-preparation.md create mode 100644 docs/guide/local-start.md create mode 100644 docs/guide/pre-config.md create mode 100644 docs/guide/troubleshooting.md create mode 100644 docs/index.md create mode 100644 docs/roadmap.md create mode 100644 docs/testdoc/structure-note-prd.md create mode 100644 docs/testdoc/structure-note-technical-plan.md create mode 100644 docs/zh/features/overview.md create mode 100644 
docs/zh/guide/data-preparation.md create mode 100644 docs/zh/guide/local-conda-cursor.md create mode 100644 docs/zh/guide/local-start.md create mode 100644 docs/zh/guide/pre-config.md create mode 100644 docs/zh/guide/troubleshooting.md create mode 100644 docs/zh/index.md create mode 100644 tests/api/test_structure_note_router.py create mode 100644 tests/services/test_structure_note_service.py create mode 100644 web/app/(workspace)/structure-note/page.tsx create mode 100644 web/lib/structure-note-api.ts create mode 100644 web/tests/e2e/structure-note.audit.ts diff --git a/deeptutor/api/main.py b/deeptutor/api/main.py index ceee8107d..7b4aec318 100644 --- a/deeptutor/api/main.py +++ b/deeptutor/api/main.py @@ -1,9 +1,7 @@ -import logging from contextlib import asynccontextmanager -from pathlib import Path +import logging -from fastapi import FastAPI -from fastapi import HTTPException +from fastapi import FastAPI, HTTPException from fastapi.middleware.cors import CORSMiddleware from fastapi.staticfiles import StaticFiles @@ -107,6 +105,7 @@ async def lifespan(app: FastAPI): try: from deeptutor.services.tutorbot import get_tutorbot_manager + await get_tutorbot_manager().auto_start_bots() except Exception as e: logger.warning(f"Failed to auto-start TutorBots: {e}") @@ -119,6 +118,7 @@ async def lifespan(app: FastAPI): # Stop TutorBots try: from deeptutor.services.tutorbot import get_tutorbot_manager + await get_tutorbot_manager().stop_all() logger.info("TutorBots stopped") except Exception as e: @@ -207,14 +207,15 @@ async def selective_access_log(request, call_next): notebook, plugins_api, question, + question_notebook, sessions, settings, solve, + structure_note, system, tutorbot, unified_ws, vision_solver, - question_notebook, ) # Include routers @@ -226,9 +227,12 @@ async def selective_access_log(request, call_next): app.include_router(co_writer.router, prefix="/api/v1/co_writer", tags=["co_writer"]) app.include_router(notebook.router, prefix="/api/v1/notebook", 
tags=["notebook"]) app.include_router(guide.router, prefix="/api/v1/guide", tags=["guide"]) +app.include_router(structure_note.router, prefix="/api/v1/structure-note", tags=["structure-note"]) app.include_router(memory.router, prefix="/api/v1/memory", tags=["memory"]) app.include_router(sessions.router, prefix="/api/v1/sessions", tags=["sessions"]) -app.include_router(question_notebook.router, prefix="/api/v1/question-notebook", tags=["question-notebook"]) +app.include_router( + question_notebook.router, prefix="/api/v1/question-notebook", tags=["question-notebook"] +) app.include_router(settings.router, prefix="/api/v1/settings", tags=["settings"]) app.include_router(system.router, prefix="/api/v1/system", tags=["system"]) app.include_router(plugins_api.router, prefix="/api/v1/plugins", tags=["plugins"]) diff --git a/deeptutor/api/routers/settings.py b/deeptutor/api/routers/settings.py index 23a26ff7c..fb6bdb6a7 100644 --- a/deeptutor/api/routers/settings.py +++ b/deeptutor/api/routers/settings.py @@ -29,7 +29,14 @@ DEFAULT_SIDEBAR_NAV_ORDER = { "start": ["/", "/history", "/knowledge", "/notebook"], - "learnResearch": ["/question", "/solver", "/guide", "/research", "/co_writer"], + "learnResearch": [ + "/question", + "/solver", + "/guide", + "/structure-note", + "/research", + "/co_writer", + ], } DEFAULT_UI_SETTINGS = { diff --git a/deeptutor/api/routers/structure_note.py b/deeptutor/api/routers/structure_note.py new file mode 100644 index 000000000..7c5677392 --- /dev/null +++ b/deeptutor/api/routers/structure_note.py @@ -0,0 +1,404 @@ +""" +Structure Note API Router +========================= + +Independent workspace for turning PDF/PPT/PPTX course materials into structured notes. 
+""" + +from __future__ import annotations + +from datetime import datetime +import json +from pathlib import Path, PurePosixPath +import shutil +from uuid import uuid4 + +from fastapi import APIRouter, BackgroundTasks, File, Form, HTTPException, UploadFile +from fastapi.responses import StreamingResponse + +from deeptutor.api.utils.task_id_manager import TaskIDManager +from deeptutor.api.utils.task_log_stream import get_task_stream_manager +from deeptutor.logging import get_logger +from deeptutor.services.config import PROJECT_ROOT, load_config_with_main +from deeptutor.services.structure_note import ( + DifficultyLevel, + ExplanationStyleLevel, + JobStatus, + NoteLanguage, + StructureNoteManager, +) +from deeptutor.utils.document_validator import DocumentValidator + +router = APIRouter() +_structure_note_manager: StructureNoteManager | None = None +_kb_base_dir = PROJECT_ROOT / "data" / "knowledge_bases" +_accepted_source_extensions = {".pdf", ".ppt", ".pptx"} + +try: + config = load_config_with_main("main.yaml", PROJECT_ROOT) +except FileNotFoundError: + config = {} +log_dir = config.get("paths", {}).get("user_log_dir") or config.get("logging", {}).get("log_dir") +logger = get_logger("StructureNote", level="INFO", log_dir=log_dir) + + +def get_structure_note_manager() -> StructureNoteManager: + global _structure_note_manager + if _structure_note_manager is None: + _structure_note_manager = StructureNoteManager() + return _structure_note_manager + + +def _build_unique_task_id(task_type: str, task_key_prefix: str) -> str: + task_manager = TaskIDManager.get_instance() + task_key = f"{task_key_prefix}_{datetime.now().isoformat()}_{uuid4().hex[:8]}" + return task_manager.generate_task_id(task_type, task_key) + + +def _emit_log(task_id: str, message: str) -> None: + manager = get_task_stream_manager() + manager.ensure_task(task_id) + manager.emit_log(task_id, message) + logger.info(f"[{task_id}] {message}") + + +def _save_upload(file: UploadFile, target_dir: Path) -> 
tuple[Path, str, int]: + safe_name = DocumentValidator.validate_upload_safety( + file.filename or "upload", + None, + allowed_extensions={".pdf", ".ppt", ".pptx"}, + ) + target_dir.mkdir(parents=True, exist_ok=True) + target_path = target_dir / safe_name + written_bytes = 0 + with open(target_path, "wb") as handle: + for chunk in iter(lambda: file.file.read(8192), b""): + written_bytes += len(chunk) + if written_bytes > DocumentValidator.MAX_FILE_SIZE: + raise HTTPException(status_code=400, detail="Uploaded file exceeds the size limit.") + handle.write(chunk) + + DocumentValidator.validate_upload_safety( + safe_name, + written_bytes, + allowed_extensions={".pdf", ".ppt", ".pptx"}, + ) + return target_path, safe_name, written_bytes + + +def _validate_kb_file_id(file_id: str) -> PurePosixPath: + relative_path = PurePosixPath(file_id) + if ( + relative_path.is_absolute() + or not relative_path.parts + or any(part in {"", ".", ".."} for part in relative_path.parts) + ): + raise HTTPException(status_code=400, detail="Invalid Knowledge Base file id.") + return relative_path + + +def _is_safe_kb_name(kb_name: str) -> bool: + relative_name = PurePosixPath(kb_name) + return ( + not relative_name.is_absolute() + and len(relative_name.parts) == 1 + and all(part not in {"", ".", ".."} for part in relative_name.parts) + ) + + +def _validate_kb_name(kb_name: str) -> str: + if not _is_safe_kb_name(kb_name): + raise HTTPException(status_code=400, detail="Invalid Knowledge Base name.") + return PurePosixPath(kb_name).name + + +def _list_kb_names_readonly() -> list[str]: + kb_names: set[str] = set() + config_path = _kb_base_dir / "kb_config.json" + if config_path.exists(): + try: + payload = json.loads(config_path.read_text(encoding="utf-8")) + knowledge_bases = payload.get("knowledge_bases", {}) + if isinstance(knowledge_bases, dict): + kb_names.update( + str(name) for name in knowledge_bases.keys() if _is_safe_kb_name(str(name)) + ) + except Exception as exc: + 
logger.warning(f"Failed to read Knowledge Base config for Structure Note: {exc}") + + if _kb_base_dir.exists(): + for item in _kb_base_dir.iterdir(): + if not item.is_dir() or item.name.startswith(("__", ".")): + continue + if ( + (item / "raw").exists() + or (item / "llamaindex_storage").exists() + or (item / "rag_storage").exists() + ): + kb_names.add(item.name) + + return sorted(kb_names) + + +def _kb_raw_dir(kb_name: str) -> Path: + safe_kb_name = _validate_kb_name(kb_name) + if safe_kb_name not in _list_kb_names_readonly(): + raise HTTPException(status_code=404, detail="Knowledge Base not found.") + + base_dir = _kb_base_dir.resolve() + kb_dir = (base_dir / safe_kb_name).resolve() + try: + kb_dir.relative_to(base_dir) + except ValueError: + raise HTTPException(status_code=400, detail="Invalid Knowledge Base name.") + return kb_dir / "raw" + + +def _resolve_kb_source_file(kb_name: str, file_id: str) -> Path: + relative_path = _validate_kb_file_id(file_id) + raw_dir = _kb_raw_dir(kb_name).resolve() + source_path = (raw_dir / Path(*relative_path.parts)).resolve() + try: + source_path.relative_to(raw_dir) + except ValueError: + raise HTTPException(status_code=400, detail="Invalid Knowledge Base file id.") + + if not source_path.exists() or not source_path.is_file(): + raise HTTPException(status_code=404, detail="Knowledge Base file not found.") + if source_path.suffix.lower() not in _accepted_source_extensions: + raise HTTPException( + status_code=400, detail="Structure Note accepts PDF, PPT, or PPTX only." 
+ ) + return source_path + + +def _list_kb_source_files() -> list[dict]: + groups: list[dict] = [] + for kb_name in _list_kb_names_readonly(): + raw_dir = _kb_raw_dir(kb_name) + files: list[dict] = [] + if raw_dir.exists(): + for source_path in sorted(raw_dir.rglob("*"), key=lambda item: item.as_posix()): + if not source_path.is_file(): + continue + if source_path.suffix.lower() not in _accepted_source_extensions: + continue + stat = source_path.stat() + file_id = source_path.relative_to(raw_dir).as_posix() + files.append( + { + "file_id": file_id, + "file_name": source_path.name, + "display_path": file_id, + "size_bytes": stat.st_size, + "updated_at": datetime.fromtimestamp(stat.st_mtime).isoformat(), + } + ) + + groups.append({"kb_name": kb_name, "files": files}) + return groups + + +async def _run_structure_note_job(job_id: str, task_id: str) -> None: + stream_manager = get_task_stream_manager() + stream_manager.ensure_task(task_id) + manager = get_structure_note_manager() + artifact = await manager.run_job(job_id, task_id, _emit_log) + if artifact.status == JobStatus.READY: + stream_manager.emit_complete(task_id, "Structure Note completed") + else: + stream_manager.emit_failed(task_id, artifact.error or "Structure Note failed") + + +@router.post("/jobs") +async def create_job( + background_tasks: BackgroundTasks, + file: UploadFile = File(...), + difficulty_level: DifficultyLevel = Form(DifficultyLevel.MEDIUM), + note_language: NoteLanguage = Form(NoteLanguage.ZH), + style_level: ExplanationStyleLevel = Form(ExplanationStyleLevel.MEDIUM), + project_name: str | None = Form(None), +): + manager = get_structure_note_manager() + task_id = _build_unique_task_id("structure_note", file.filename or "upload") + get_task_stream_manager().ensure_task(task_id) + + source_format = Path(file.filename or "").suffix.lower().lstrip(".") + if source_format not in {"pdf", "ppt", "pptx"}: + raise HTTPException( + status_code=400, detail="Structure Note accepts PDF, PPT, or PPTX 
only." + ) + + job_id = f"structure_note_{datetime.utcnow().strftime('%Y%m%d_%H%M%S')}_{uuid4().hex[:8]}" + job_dirs = manager.storage.ensure_job_dirs(job_id) + source_path, safe_name, _ = _save_upload(file, job_dirs["source"]) + + target_project_name = ( + project_name.strip() if project_name and project_name.strip() else "Local Uploads" + ) + try: + artifact = manager.create_job( + file_name=safe_name, + source_format=source_format, + difficulty_level=difficulty_level, + note_language=note_language, + style_level=style_level, + source_path=source_path, + task_id=task_id, + job_id=job_id, + project_name=target_project_name, + note_title=Path(safe_name).stem, + source_kind="upload", + source_ref={"file_name": safe_name}, + ) + except ValueError as exc: + raise HTTPException(status_code=400, detail=str(exc)) + _emit_log(task_id, f"Created Structure Note job for `{safe_name}`.") + background_tasks.add_task(_run_structure_note_job, artifact.job_id, task_id) + return manager.serialize_job(artifact) + + +@router.get("/jobs") +async def list_jobs(): + manager = get_structure_note_manager() + return {"jobs": [manager.serialize_job(job) for job in manager.list_jobs()]} + + +@router.get("/projects") +async def list_projects(): + manager = get_structure_note_manager() + return {"projects": [project.model_dump(mode="json") for project in manager.list_projects()]} + + +@router.post("/projects") +async def create_project(name: str = Form(...)): + manager = get_structure_note_manager() + try: + project = manager.create_project(name) + except ValueError as exc: + raise HTTPException(status_code=400, detail=str(exc)) + return project.model_dump(mode="json") + + +@router.post("/projects/{project_name}/rename") +async def rename_project(project_name: str, new_name: str = Form(...)): + manager = get_structure_note_manager() + try: + project = manager.rename_project(project_name, new_name) + except FileNotFoundError: + raise HTTPException(status_code=404, detail="Project not found.") 
+ except ValueError as exc: + raise HTTPException(status_code=400, detail=str(exc)) + return project.model_dump(mode="json") + + +@router.delete("/projects/{project_name}") +async def delete_project(project_name: str): + manager = get_structure_note_manager() + try: + deleted_job_ids = manager.delete_project(project_name) + except FileNotFoundError: + raise HTTPException(status_code=404, detail="Project not found.") + except ValueError as exc: + raise HTTPException(status_code=400, detail=str(exc)) + return {"deleted_job_ids": deleted_job_ids} + + +@router.get("/kb/files") +async def list_knowledge_base_source_files(): + return {"knowledge_bases": _list_kb_source_files()} + + +@router.post("/jobs/from-kb") +async def create_job_from_knowledge_base( + background_tasks: BackgroundTasks, + kb_name: str = Form(...), + file_id: str = Form(...), + difficulty_level: DifficultyLevel = Form(DifficultyLevel.MEDIUM), + note_language: NoteLanguage = Form(NoteLanguage.ZH), + style_level: ExplanationStyleLevel = Form(ExplanationStyleLevel.MEDIUM), + project_name: str | None = Form(None), +): + source_file = _resolve_kb_source_file(kb_name, file_id) + safe_name = DocumentValidator.validate_upload_safety( + source_file.name, + source_file.stat().st_size, + allowed_extensions=_accepted_source_extensions, + ) + source_format = source_file.suffix.lower().lstrip(".") + + manager = get_structure_note_manager() + task_id = _build_unique_task_id("structure_note", f"{kb_name}_{safe_name}") + get_task_stream_manager().ensure_task(task_id) + + job_id = f"structure_note_{datetime.utcnow().strftime('%Y%m%d_%H%M%S')}_{uuid4().hex[:8]}" + job_dirs = manager.storage.ensure_job_dirs(job_id) + snapshot_path = job_dirs["source"] / safe_name + shutil.copy2(source_file, snapshot_path) + + target_project_name = project_name.strip() if project_name and project_name.strip() else kb_name + try: + artifact = manager.create_job( + file_name=safe_name, + source_format=source_format, + 
difficulty_level=difficulty_level, + note_language=note_language, + style_level=style_level, + source_path=snapshot_path, + task_id=task_id, + job_id=job_id, + project_name=target_project_name, + note_title=Path(file_id).stem, + source_kind="knowledge_base", + source_ref={ + "kb_name": kb_name, + "file_id": file_id, + "file_name": source_file.name, + }, + ) + except ValueError as exc: + raise HTTPException(status_code=400, detail=str(exc)) + _emit_log( + task_id, f"Created Structure Note job for `{safe_name}` from Knowledge Base `{kb_name}`." + ) + background_tasks.add_task(_run_structure_note_job, artifact.job_id, task_id) + return manager.serialize_job(artifact) + + +@router.get("/jobs/{job_id}") +async def get_job(job_id: str): + manager = get_structure_note_manager() + try: + return manager.serialize_job(manager.get_job(job_id)) + except FileNotFoundError: + raise HTTPException(status_code=404, detail="Structure Note job not found") + + +@router.post("/jobs/{job_id}/retry") +async def retry_job(job_id: str, background_tasks: BackgroundTasks): + manager = get_structure_note_manager() + try: + artifact = manager.get_job(job_id) + except FileNotFoundError: + raise HTTPException(status_code=404, detail="Structure Note job not found") + + if artifact.status != JobStatus.FAILED: + raise HTTPException(status_code=409, detail="Only failed jobs can be retried.") + + task_id = _build_unique_task_id("structure_note_retry", job_id) + get_task_stream_manager().ensure_task(task_id) + artifact = manager.update_status(artifact, JobStatus.QUEUED, error=None, task_id=task_id) + _emit_log(task_id, f"Retrying Structure Note job `{artifact.file_name}`.") + background_tasks.add_task(_run_structure_note_job, artifact.job_id, task_id) + return manager.serialize_job(artifact) + + +@router.get("/tasks/{task_id}/stream") +async def stream_task(task_id: str): + manager = get_task_stream_manager() + manager.ensure_task(task_id) + return StreamingResponse( + manager.stream(task_id), + 
media_type="text/event-stream", + headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"}, + ) diff --git a/deeptutor/api/routers/system.py b/deeptutor/api/routers/system.py index bfe061b05..0463a78fa 100644 --- a/deeptutor/api/routers/system.py +++ b/deeptutor/api/routers/system.py @@ -52,6 +52,7 @@ async def get_runtime_topology(): ], "isolated_subsystems": [ {"router": "guide", "mode": "independent_subsystem"}, + {"router": "structure_note", "mode": "independent_subsystem"}, {"router": "co_writer", "mode": "independent_subsystem"}, {"router": "plugins_api", "mode": "playground_transport"}, ], @@ -292,7 +293,9 @@ async def test_search_connection(): ) except ValueError as e: - return TestResponse(success=False, message=f"Search configuration error: {e!s}", error=str(e)) + return TestResponse( + success=False, message=f"Search configuration error: {e!s}", error=str(e) + ) except Exception as e: response_time = (time.time() - start_time) * 1000 return TestResponse( diff --git a/deeptutor/services/__init__.py b/deeptutor/services/__init__.py index 167b90c3e..75e938001 100644 --- a/deeptutor/services/__init__.py +++ b/deeptutor/services/__init__.py @@ -50,6 +50,7 @@ "rag", "prompt", "search", + "structure_note", "setup", "session", "config", @@ -71,6 +72,8 @@ def __getattr__(name: str): return importlib.import_module(f"{__name__}.search") if name == "setup": return importlib.import_module(f"{__name__}.setup") + if name == "structure_note": + return importlib.import_module(f"{__name__}.structure_note") if name == "session": return importlib.import_module(f"{__name__}.session") if name == "config": diff --git a/deeptutor/services/config/loader.py b/deeptutor/services/config/loader.py index 30e46f996..08bf5cab6 100644 --- a/deeptutor/services/config/loader.py +++ b/deeptutor/services/config/loader.py @@ -23,12 +23,19 @@ # .parent.parent.parent.parent = DeepTutor/ (project root) PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent.parent 
+_DEFAULT_MAIN_CONFIG: dict[str, Any] = { + "system": {"language": "en"}, + "logging": {"level": "WARNING", "save_to_file": True, "console_output": True}, + "tools": {"run_code": {}}, +} + def get_runtime_settings_dir(project_root: Path | None = None) -> Path: """Return the canonical runtime settings directory under ``data/user/settings``.""" root = project_root or PROJECT_ROOT return root / "data" / "user" / "settings" + def _deep_merge(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]: """ Deep merge two dictionaries, values in override will override values in base @@ -74,6 +81,7 @@ def _inject_runtime_paths(config: dict[str, Any]) -> dict[str, Any]: "user_log_dir": str(path_service.get_logs_dir()), "performance_log_dir": str(path_service.get_logs_dir() / "performance"), "guide_output_dir": str(path_service.get_guide_dir()), + "structure_note_output_dir": str(path_service.get_structure_note_dir()), "question_output_dir": str(path_service.get_chat_feature_dir("deep_question")), "research_output_dir": str(path_service.get_research_dir()), "research_reports_dir": str(path_service.get_research_reports_dir()), @@ -87,6 +95,14 @@ async def _load_yaml_file_async(file_path: Path) -> dict[str, Any]: return await asyncio.to_thread(_load_yaml_file, file_path) +def _load_main_config(project_root: Path) -> dict[str, Any]: + settings_dir = get_runtime_settings_dir(project_root) + main_path = settings_dir / "main.yaml" + if main_path.exists(): + return _load_yaml_file(main_path) + return _DEFAULT_MAIN_CONFIG.copy() + + def resolve_config_path( config_file: str, project_root: Path | None = None, @@ -108,8 +124,7 @@ def resolve_config_path( if config_path.exists(): return config_path, False raise FileNotFoundError( - f"Configuration file not found: {config_file} " - f"(expected under {settings_dir})" + f"Configuration file not found: {config_file} (expected under {settings_dir})" ) @@ -127,8 +142,12 @@ def load_config_with_main(config_file: str, project_root: Path 
| None = None) -> if project_root is None: project_root = PROJECT_ROOT + base_config = _load_main_config(project_root) + if config_file == "main.yaml": + return _inject_runtime_paths(base_config) + config_path, _ = resolve_config_path(config_file, project_root) - return _inject_runtime_paths(_load_yaml_file(config_path)) + return _inject_runtime_paths(_deep_merge(base_config, _load_yaml_file(config_path))) async def load_config_with_main_async( @@ -149,8 +168,13 @@ async def load_config_with_main_async( if project_root is None: project_root = PROJECT_ROOT + base_config = _load_main_config(project_root) + if config_file == "main.yaml": + return _inject_runtime_paths(base_config) + config_path, _ = resolve_config_path(config_file, project_root) - return _inject_runtime_paths(await _load_yaml_file_async(config_path)) + module_config = await _load_yaml_file_async(config_path) + return _inject_runtime_paths(_deep_merge(base_config, module_config)) def get_path_from_config(config: dict[str, Any], path_key: str, default: str = None) -> str: diff --git a/deeptutor/services/path_service.py b/deeptutor/services/path_service.py index c3f7cd2b4..15f99a8b0 100644 --- a/deeptutor/services/path_service.py +++ b/deeptutor/services/path_service.py @@ -13,6 +13,7 @@ ├── notebook/ ├── co-writer/ ├── guide/ + ├── structure_note/ └── chat/ ├── chat/ ├── deep_solve/ @@ -32,6 +33,7 @@ "research", "co-writer", "guide", + "structure_note", "run_code_workspace", "logs", "math_animator", @@ -51,6 +53,7 @@ "notebook", "co-writer", "guide", + "structure_note", "chat", ] @@ -68,6 +71,7 @@ class PathService: "math_animator": ("chat", "math_animator"), "co-writer": ("co-writer", None), "guide": ("guide", None), + "structure_note": ("structure_note", None), "run_code_workspace": ("chat", "_detached_code_execution"), } _PRIVATE_SUFFIXES = {".json", ".sqlite", ".db", ".md", ".yaml", ".yml", ".py", ".log"} @@ -128,17 +132,32 @@ def is_public_output_path(self, path: str | Path) -> bool: if not 
candidate.is_file(): return False - if candidate.suffix.lower() in self._PRIVATE_SUFFIXES: + parts = relative.parts + suffix = candidate.suffix.lower() + if len(parts) >= 5 and parts[:2] == ("workspace", "structure_note"): + if parts[3] == "final" and suffix in {".pdf", ".md"}: + return True + if parts[3] == "images" and suffix in {".png", ".jpg", ".jpeg", ".webp", ".gif"}: + return True + + if suffix in self._PRIVATE_SUFFIXES: return False - parts = relative.parts if parts[:3] == ("workspace", "co-writer", "audio"): return True - if len(parts) >= 5 and parts[:3] == ("workspace", "chat", "deep_solve") and "artifacts" in parts[4:]: + if ( + len(parts) >= 5 + and parts[:3] == ("workspace", "chat", "deep_solve") + and "artifacts" in parts[4:] + ): return True - if len(parts) >= 5 and parts[:3] == ("workspace", "chat", "math_animator") and "artifacts" in parts[4:]: + if ( + len(parts) >= 5 + and parts[:3] == ("workspace", "chat", "math_animator") + and "artifacts" in parts[4:] + ): return True if len(parts) >= 5 and parts[:2] == ("workspace", "chat") and "code_runs" in parts[3:]: @@ -183,9 +202,16 @@ def get_session_workspace(self, feature: str, session_id: str) -> Path: return session_root / session_id def _resolve_feature_root(self, feature: str) -> Path: - if feature in {"chat", "deep_solve", "deep_question", "deep_research", "math_animator", "_detached_code_execution"}: + if feature in { + "chat", + "deep_solve", + "deep_question", + "deep_research", + "math_animator", + "_detached_code_execution", + }: return self.get_chat_feature_dir(feature) # type: ignore[arg-type] - if feature in {"memory", "notebook", "co-writer", "guide"}: + if feature in {"memory", "notebook", "co-writer", "guide", "structure_note"}: return self.get_workspace_feature_dir(feature) # type: ignore[arg-type] raise ValueError(f"Unknown workspace feature: {feature}") @@ -224,6 +250,7 @@ def get_memory_dir(self) -> Path: target = new_dir / f.name if not target.exists(): import shutil + 
shutil.copy2(f, target) return new_dir @@ -272,6 +299,12 @@ def get_guide_dir(self) -> Path: def get_guide_session_file(self, session_id: str) -> Path: return self.get_guide_dir() / f"session_{session_id}.json" + def get_structure_note_dir(self) -> Path: + return self.get_workspace_feature_dir("structure_note") + + def get_structure_note_job_dir(self, job_id: str) -> Path: + return self.get_structure_note_dir() / job_id + def get_run_code_workspace_dir(self) -> Path: return self.get_chat_feature_dir("_detached_code_execution") @@ -314,7 +347,7 @@ def ensure_all_directories(self) -> None: self.ensure_memory_dir() self.ensure_notebook_dir() self.get_logs_dir().mkdir(parents=True, exist_ok=True) - for feature in ("co-writer", "guide"): + for feature in ("co-writer", "guide", "structure_note"): self.get_workspace_feature_dir(feature).mkdir(parents=True, exist_ok=True) for feature in ( "chat", diff --git a/deeptutor/services/setup/init.py b/deeptutor/services/setup/init.py index 7feecc8ce..0d21470b9 100644 --- a/deeptutor/services/setup/init.py +++ b/deeptutor/services/setup/init.py @@ -22,7 +22,14 @@ "sidebar_description": "✨ Data Intelligence Lab @ HKU", "sidebar_nav_order": { "start": ["/", "/history", "/knowledge", "/notebook"], - "learnResearch": ["/question", "/solver", "/guide", "/research", "/co_writer"], + "learnResearch": [ + "/question", + "/solver", + "/guide", + "/structure-note", + "/research", + "/co_writer", + ], }, } @@ -120,7 +127,7 @@ def init_user_directories(project_root: Path | None = None) -> None: This function uses lazy initialization - directories are created on-demand when files are saved, rather than pre-creating all directories at startup. - + Only essential configuration files (like settings/interface.json) are created at startup if they don't exist. 
@@ -137,6 +144,7 @@ def init_user_directories(project_root: Path | None = None) -> None: ├── memory/ ├── co-writer/ ├── guide/ + ├── structure_note/ └── chat/ ├── chat/ ├── deep_solve/ @@ -160,7 +168,7 @@ def init_user_directories(project_root: Path | None = None) -> None: def _ensure_essential_settings(path_service) -> None: """ Ensure essential settings files exist. - + This is the minimal initialization needed at startup. All other directories are created on-demand when files are saved. """ diff --git a/deeptutor/services/structure_note/__init__.py b/deeptutor/services/structure_note/__init__.py new file mode 100644 index 000000000..2b2e7d614 --- /dev/null +++ b/deeptutor/services/structure_note/__init__.py @@ -0,0 +1,40 @@ +from .difficulty import DifficultyPreset, get_difficulty_preset +from .manager import StructureNoteManager +from .models import ( + CitationEntry, + DifficultyLevel, + DocumentPlan, + ExplanationStyleLevel, + GenerationChunk, + ImagePlaceholder, + JobStatus, + NoteLanguage, + PageIndexPage, + SectionEvidence, + SectionPlan, + SectionTreeNode, + StructureNoteArtifact, + StructureNoteProject, +) +from .storage import StructureNoteStorage + +__all__ = [ + "CitationEntry", + "DifficultyLevel", + "DifficultyPreset", + "DocumentPlan", + "ExplanationStyleLevel", + "GenerationChunk", + "ImagePlaceholder", + "JobStatus", + "NoteLanguage", + "PageIndexPage", + "SectionEvidence", + "SectionPlan", + "SectionTreeNode", + "StructureNoteArtifact", + "StructureNoteManager", + "StructureNoteProject", + "StructureNoteStorage", + "get_difficulty_preset", +] diff --git a/deeptutor/services/structure_note/difficulty.py b/deeptutor/services/structure_note/difficulty.py new file mode 100644 index 000000000..ad7cc3b8e --- /dev/null +++ b/deeptutor/services/structure_note/difficulty.py @@ -0,0 +1,64 @@ +from __future__ import annotations + +from dataclasses import dataclass + +from .models import DifficultyLevel + + +@dataclass(frozen=True) +class DifficultyPreset: 
+ level: DifficultyLevel + page_window: int + depth_instruction: str + compression_instruction: str + placeholder_purpose: str + + +PRESETS: dict[DifficultyLevel, DifficultyPreset] = { + DifficultyLevel.SIMPLE: DifficultyPreset( + level=DifficultyLevel.SIMPLE, + page_window=10, + depth_instruction=( + "Simple controls how much to cover: keep only the core thread, key concepts, " + "essential conclusions, and any indispensable bridge needed to understand them. " + "Short does not mean shallow." + ), + compression_instruction=( + "Compress by deleting repeated background, template transitions, low-information summaries, " + "and meta commentary. Preserve precise definitions, key mechanisms, critical formulas or " + "arguments, and the shortest logical bridge between ideas." + ), + placeholder_purpose="key_figure", + ), + DifficultyLevel.MEDIUM: DifficultyPreset( + level=DifficultyLevel.MEDIUM, + page_window=10, + depth_instruction=( + "Medium controls how much to cover: include the main knowledge points and the core logic chain " + "needed for a normal classroom handout." + ), + compression_instruction=( + "Compress by merging duplicated examples and background while retaining the main concepts, " + "mechanisms, evidence, and topic-to-topic reasoning." + ), + placeholder_purpose="supporting_figure", + ), + DifficultyLevel.DETAILED: DifficultyPreset( + level=DifficultyLevel.DETAILED, + page_window=6, + depth_instruction=( + "Detailed controls how much to cover: preserve a fuller knowledge structure, including " + "intermediate steps, boundary cases, supporting examples, and derivation or argument details " + "when they are present in the evidence." + ), + compression_instruction=( + "Compress only low-value repetition and boilerplate. Keep the complete conceptual chain, " + "important qualifications, examples, mechanisms, and source-supported derivation details." 
+ ), + placeholder_purpose="detailed_figure", + ), +} + + +def get_difficulty_preset(level: DifficultyLevel) -> DifficultyPreset: + return PRESETS[level] diff --git a/deeptutor/services/structure_note/generator.py b/deeptutor/services/structure_note/generator.py new file mode 100644 index 000000000..a43d125e6 --- /dev/null +++ b/deeptutor/services/structure_note/generator.py @@ -0,0 +1,571 @@ +from __future__ import annotations + +from collections.abc import Iterable +import re + +from deeptutor.services.llm import complete as llm_complete + +from .difficulty import DifficultyPreset +from .markdown_postprocessor import normalize_structure_note_markdown +from .models import ( + DifficultyLevel, + DocumentPlan, + ExplanationStyleLevel, + GenerationChunk, + NoteLanguage, + PageIndexPage, + SectionEvidence, + SectionPlan, + SectionTreeNode, +) + + +def _pages_by_number(pages: Iterable[PageIndexPage]) -> dict[int, PageIndexPage]: + return {page.page_number: page for page in pages} + + +def build_generation_chunks( + pages: list[PageIndexPage], + sections: list[SectionTreeNode], + preset: DifficultyPreset, + document_plan: DocumentPlan | None = None, +) -> list[GenerationChunk]: + chunks: list[GenerationChunk] = [] + chunk_index = 1 + + if document_plan and document_plan.outline: + section_lookup = {section.section_id: section for section in document_plan.outline} + tree_lookup = {section.section_id: section for section in sections} + ordered_plans = [ + section_lookup[section_id] + for section_id in document_plan.section_order + if section_id in section_lookup + ] + for plan in ordered_plans: + page_numbers = plan.page_numbers + evidence = plan.evidence + tree_node = tree_lookup.get(plan.section_id) + if tree_node and tree_node.child_ids: + child_starts = [ + tree_lookup[child_id].page_start + for child_id in tree_node.child_ids + if child_id in tree_lookup + ] + if child_starts: + overview_end = min(plan.page_end, min(child_starts) - 1) + page_numbers = 
list(range(plan.page_start, overview_end + 1)) + if not page_numbers: + page_numbers = [plan.page_start] + evidence = [ + item for item in plan.evidence if item.page_number in set(page_numbers) + ] + if not page_numbers: + continue + chunks.append( + _chunk_from_plan(plan, chunk_index, page_numbers=page_numbers, evidence=evidence) + ) + chunk_index += 1 + elif sections: + ordered_sections = sorted( + sections, key=lambda item: (item.page_start, item.level, item.section_id) + ) + for section in ordered_sections: + page_numbers = list(range(section.page_start, section.page_end + 1)) + if not page_numbers: + continue + chunks.append( + GenerationChunk( + chunk_id=f"chunk-{chunk_index:03d}", + section_id=section.section_id, + section_title=section.title, + section_path=section.path or [section.title], + section_summary=section.summary, + heading_level=max(2, min(section.level, 5)), + page_start=page_numbers[0], + page_end=page_numbers[-1], + page_numbers=page_numbers, + ) + ) + chunk_index += 1 + else: + for start in range(1, len(pages) + 1, preset.page_window): + window = list(range(start, min(start + preset.page_window, len(pages) + 1))) + title = f"Pages {window[0]}-{window[-1]}" if len(window) > 1 else f"Page {window[0]}" + chunks.append( + GenerationChunk( + chunk_id=f"chunk-{chunk_index:03d}", + section_title=title, + section_path=[title], + page_start=window[0], + page_end=window[-1], + page_numbers=window, + ) + ) + chunk_index += 1 + + return chunks + + +def _chunk_from_plan( + plan: SectionPlan, + chunk_index: int, + *, + page_numbers: list[int] | None = None, + evidence: list[SectionEvidence] | None = None, +) -> GenerationChunk: + resolved_pages = page_numbers or plan.page_numbers + return GenerationChunk( + chunk_id=f"chunk-{chunk_index:03d}", + section_id=plan.section_id, + section_title=plan.title, + section_path=plan.section_path or [plan.title], + section_summary=plan.summary, + heading_level=max(2, min(plan.level, 5)), + page_start=resolved_pages[0], 
+ page_end=resolved_pages[-1], + page_numbers=resolved_pages, + evidence=evidence if evidence is not None else plan.evidence, + dependencies=plan.dependencies, + ) + + +def _build_page_context( + page_lookup: dict[int, PageIndexPage], + page_numbers: list[int], + evidence: list[SectionEvidence], +) -> str: + parts: list[str] = [] + evidence_lookup = {item.page_number: item for item in evidence} + for page_number in page_numbers: + page = page_lookup.get(page_number) + if not page: + continue + evidence_item = evidence_lookup.get(page_number) + excerpt = evidence_item.excerpt if evidence_item else page.text.strip() + if len(excerpt) > 1800: + excerpt = f"{excerpt[:1800]}\n..." + title_candidates = ( + evidence_item.title_candidates + if evidence_item + else [candidate.text for candidate in page.title_candidates[:5]] + ) + image_candidate_ids = ( + evidence_item.image_candidate_ids + if evidence_item + else [candidate.candidate_id for candidate in page.image_candidates[:8]] + ) + metadata = [] + if title_candidates: + metadata.append(f"title candidates: {title_candidates}") + if image_candidate_ids: + metadata.append(f"image candidates: {image_candidate_ids}") + suffix = f"\n({'; '.join(metadata)})" if metadata else "" + parts.append(f"[Page {page_number}]{suffix}\n{excerpt}") + return "\n\n".join(parts) + + +def _fallback_markdown( + chunk: GenerationChunk, page_lookup: dict[int, PageIndexPage], language: str +) -> str: + heading = "#" * max(2, min(chunk.heading_level, 5)) + title = " / ".join(chunk.section_path) + source_range = ( + f"Pages {chunk.page_start}-{chunk.page_end}" + if chunk.page_start != chunk.page_end + else f"Page {chunk.page_start}" + ) + if language == "zh": + intro = f"本节根据第 {chunk.page_start}-{chunk.page_end} 页内容整理。" + summary_label = "本节小结" + source_label = "来源线索" + empty = "该页范围未提取到可用文本。" + else: + intro = f"This section synthesizes the material from {source_range.lower()}." 
+ summary_label = "Section Summary" + source_label = "Source Notes" + empty = "No extractable text was found for this section range." + source_notes: list[str] = [] + for page_number in chunk.page_numbers: + page = page_lookup.get(page_number) + if not page or not page.text.strip(): + continue + excerpt = page.text.strip().replace("\n", " ") + if len(excerpt) > 360: + excerpt = f"{excerpt[:360]}..." + source_notes.append(f"- Page {page_number}: {excerpt}") + if not source_notes: + source_notes.append(f"- {empty}") + summary = chunk.section_summary or empty + return ( + f"{heading} {title}\n\n" + f"{intro}\n\n" + f"### {source_label}\n\n" + + "\n".join(source_notes) + + f"\n\n> **{summary_label}:** {summary}\n" + ) + + +def _strip_markdown_fence(content: str) -> str: + match = re.fullmatch( + r"\s*```(?:markdown)?\s*(.*?)\s*```\s*", content, flags=re.DOTALL | re.IGNORECASE + ) + return match.group(1).strip() if match else content.strip() + + +def _document_outline(document_plan: DocumentPlan | None) -> str: + if not document_plan: + return "" + lines: list[str] = [] + for section in document_plan.outline: + indent = " " * max(0, section.level - 2) + page_range = ( + f"pages {section.page_start}-{section.page_end}" + if section.page_start != section.page_end + else f"page {section.page_start}" + ) + lines.append(f"{indent}- {section.title} ({page_range}): {section.summary}") + return "\n".join(lines) + + +def _language_name(language: str) -> str: + if language == NoteLanguage.ZH.value: + return "Chinese" + if language == NoteLanguage.EN.value: + return "English" + return language + + +def _style_instruction(style_level: ExplanationStyleLevel, language: str) -> str: + if language == NoteLanguage.ZH.value: + if style_level == ExplanationStyleLevel.LOW: + return ( + "Low 只控制怎么讲:科普式讲解,强调直观、易懂、低门槛,减少术语和公式负担。" + "不要减少本节按 depth 要覆盖的核心内容。" + ) + if style_level == ExplanationStyleLevel.HIGH: + return ( + "High 只控制怎么讲:学术讲义风格,定义更精确,边界更清楚,逻辑链更严格," + 
"强调机制、论证链和术语精度。理工/数学/计算机可保留关键公式、推导、矩阵、定理或算法机制;" + "生物/医学保留机制链路、因果过程和术语定义;社科/人文保留概念辨析、理论框架和论证结构。" + "High 的本质是 rigor,不是强行造公式。" + ) + return ( + "Medium 只控制怎么讲:标准课堂讲义风格,兼顾清晰度、完整性和一定理论性," + "不要改变 depth 决定的覆盖范围。" + ) + + if style_level == ExplanationStyleLevel.LOW: + return ( + "Low controls how to explain: popular-science style, intuitive, approachable, and low-friction, " + "with less terminology and formula burden. Do not reduce the content coverage selected by depth." + ) + if style_level == ExplanationStyleLevel.HIGH: + return ( + "High controls how to explain: academic lecture-note style with precise definitions, clear concept " + "boundaries, strict logic, mechanisms, argument chains, and terminology precision. For STEM, math, " + "CS, or engineering, preserve key formulas, derivations, matrices, theorems, or algorithm mechanisms " + "when supported. For biology or medicine, preserve mechanism chains, causal processes, and definitions. " + "For social sciences or humanities, preserve conceptual distinctions, theoretical frameworks, and " + "argument structure. High means rigor; do not invent formulas." + ) + return ( + "Medium controls how to explain: standard classroom handout style, balancing clarity, completeness, " + "and moderate theoretical density without changing the depth coverage." + ) + + +def _depth_label(level: DifficultyLevel) -> str: + return { + DifficultyLevel.SIMPLE: "Simple", + DifficultyLevel.MEDIUM: "Medium", + DifficultyLevel.DETAILED: "Detailed", + }[level] + + +def _style_label(style_level: ExplanationStyleLevel) -> str: + return { + ExplanationStyleLevel.LOW: "Low", + ExplanationStyleLevel.MEDIUM: "Medium", + ExplanationStyleLevel.HIGH: "High", + }[style_level] + + +def _transition_excerpt(markdown: str, limit: int = 800) -> str: + excerpt = markdown.strip() + if len(excerpt) <= limit: + return excerpt + return f"{excerpt[:limit].rstrip()}..." 
+ + +def _clean_transition_markdown(content: str) -> str: + cleaned = normalize_structure_note_markdown(_strip_markdown_fence(content)) + lines = [line.strip() for line in cleaned.splitlines() if line.strip()] + normalized_lines: list[str] = [] + for line in lines: + line = re.sub(r"^(?:>\s*)+", "", line).strip() + line = re.sub(r"^(?:[-*+]|\d+[.)])\s+", "", line).strip() + if line: + normalized_lines.append(line) + return " ".join(normalized_lines).strip() + + +def _transition_style_instruction(style_level: ExplanationStyleLevel) -> str: + if style_level == ExplanationStyleLevel.LOW: + return "LOW: intuitive, simple explanation" + if style_level == ExplanationStyleLevel.HIGH: + return "HIGH: emphasize logical necessity, limitation, or theoretical gap" + return "MEDIUM: standard lecture explanation (default)" + + +def _minimal_transition_fallback(previous: GenerationChunk, current: GenerationChunk) -> str: + return f"{previous.section_title} leads naturally to {current.section_title}." + + +async def generate_transition_markdown( + previous: GenerationChunk, + current: GenerationChunk, + *, + language: str, + style_level: ExplanationStyleLevel, + document_plan: DocumentPlan | None = None, +) -> str: + outline = _document_outline(document_plan) + prompt = ( + "Write a short transition paragraph for a lecture-style note.\n\n" + "Goal:\n" + "Connect the previous section to the current section in a natural, knowledge-driven way.\n\n" + "Requirements:\n" + "- 1-3 sentences ONLY\n" + "- Do NOT use template phrases like:\n" + ' "上一部分...", "接下来...", "本节将..."\n' + "- Do NOT mention slides or pages\n" + "- Explain the logical bridge:\n" + " 1. what the previous section established\n" + " 2. what is still missing or limited\n" + " 3. 
why the current section naturally follows\n" + "- Write like a human lecture note, not a system connector\n" + "- No bullet points\n" + "- Output plain Markdown paragraph only\n\n" + "Style:\n" + "- LOW: intuitive, simple explanation\n" + "- MEDIUM: standard lecture explanation (default)\n" + "- HIGH: emphasize logical necessity, limitation, or theoretical gap\n" + f"Selected style: {_transition_style_instruction(style_level)}\n\n" + f"Language:\n{_language_name(language)}\n\n" + f"Document Outline:\n{outline or '(not provided)'}\n\n" + "Previous Section:\n" + f"Title: {previous.section_title}\n" + f"Summary: {previous.section_summary}\n" + f"Excerpt: {_transition_excerpt(previous.markdown)}\n\n" + "Current Section:\n" + f"Title: {current.section_title}\n" + f"Summary: {current.section_summary}\n" + f"Excerpt: {_transition_excerpt(current.markdown)}\n" + ) + try: + response = await llm_complete( + prompt=prompt, + system_prompt=( + "You write concise connective paragraphs for online lecture notes. " + "Use the given section excerpts to explain the knowledge flow without using a fixed template." 
+ ), + temperature=0.45, + ) + cleaned = _clean_transition_markdown(response) + return cleaned or _minimal_transition_fallback(previous, current) + except Exception: + return _minimal_transition_fallback(previous, current) + + +def _combination_instruction( + depth_level: DifficultyLevel, + style_level: ExplanationStyleLevel, + language: str, +) -> str: + if language == NoteLanguage.ZH.value: + matrix = { + (DifficultyLevel.SIMPLE, ExplanationStyleLevel.LOW): ( + "Simple + Low:短篇幅、低门槛、科普化。只讲最核心内容,用直觉化表达帮助快速入门。" + ), + (DifficultyLevel.SIMPLE, ExplanationStyleLevel.MEDIUM): ( + "Simple + Medium:短篇幅、标准课堂风格。只保留核心知识骨架,但表述清晰、正常、不泛化。" + ), + (DifficultyLevel.SIMPLE, ExplanationStyleLevel.HIGH): ( + "Simple + High:短篇幅、高密度、学术型核心讲义。只讲最重要内容,但保留严谨定义、关键机制、" + "关键公式或关键论证的作用说明。这是 short but dense,不是 short and shallow。" + ), + (DifficultyLevel.MEDIUM, ExplanationStyleLevel.LOW): ( + "Medium + Low:中等篇幅、科普风格。覆盖主要知识点,但仍以直观解释为主,减少抽象负担。" + ), + (DifficultyLevel.MEDIUM, ExplanationStyleLevel.MEDIUM): ( + "Medium + Medium:中等篇幅、标准课堂讲义。作为默认模式,兼顾覆盖面、逻辑和可读性。" + ), + (DifficultyLevel.MEDIUM, ExplanationStyleLevel.HIGH): ( + "Medium + High:中等篇幅、学术课堂风格。覆盖主要知识点,同时保留较强理论解释和关键数学、机制或论证说明。" + ), + (DifficultyLevel.DETAILED, ExplanationStyleLevel.LOW): ( + "Detailed + Low:长篇幅、低门槛。讲得更全、更慢、更细,但仍以易懂为优先,不强行学术化。" + ), + (DifficultyLevel.DETAILED, ExplanationStyleLevel.MEDIUM): ( + "Detailed + Medium:长篇幅、完整课堂讲义。比较全面,适合复习和系统整理。" + ), + (DifficultyLevel.DETAILED, ExplanationStyleLevel.HIGH): ( + "Detailed + High:长篇幅、学术讲义/课程笔记风格。完整覆盖,强理论解释,并保留材料支持的必要推导、公式或严谨论证。" + ), + } + return matrix[(depth_level, style_level)] + + matrix = { + (DifficultyLevel.SIMPLE, ExplanationStyleLevel.LOW): ( + "Simple + Low: short, low-barrier, popular-science explanation. Cover only the core ideas " + "and use intuitive language for fast entry-level understanding." + ), + (DifficultyLevel.SIMPLE, ExplanationStyleLevel.MEDIUM): ( + "Simple + Medium: short standard classroom note. 
Keep the core knowledge skeleton, but explain it clearly " + "without flattening it into a vague summary." + ), + (DifficultyLevel.SIMPLE, ExplanationStyleLevel.HIGH): ( + "Simple + High: short, dense, academic core note. Cover only the most important material while preserving " + "precise definitions, key mechanisms, and the role of any essential formula or argument. This is short " + "but dense, not short and shallow." + ), + (DifficultyLevel.MEDIUM, ExplanationStyleLevel.LOW): ( + "Medium + Low: medium length, popular-science style. Cover the main knowledge points with intuitive " + "explanations and a lighter abstraction burden." + ), + (DifficultyLevel.MEDIUM, ExplanationStyleLevel.MEDIUM): ( + "Medium + Medium: medium length standard classroom note. This is the default balance of coverage, " + "logic, and readability." + ), + (DifficultyLevel.MEDIUM, ExplanationStyleLevel.HIGH): ( + "Medium + High: medium length academic classroom note. Cover the main knowledge points while preserving " + "stronger theoretical explanation and key mathematical, mechanistic, or argumentative structure." + ), + (DifficultyLevel.DETAILED, ExplanationStyleLevel.LOW): ( + "Detailed + Low: long, low-barrier explanation. Cover more material slowly and carefully while keeping " + "accessibility ahead of academic density." + ), + (DifficultyLevel.DETAILED, ExplanationStyleLevel.MEDIUM): ( + "Detailed + Medium: long, complete classroom note for review and systematic organization." + ), + (DifficultyLevel.DETAILED, ExplanationStyleLevel.HIGH): ( + "Detailed + High: long academic lecture note. Complete coverage with strong theoretical explanation and " + "source-supported derivations, formulas, or rigorous arguments when appropriate." 
+ ), + } + return matrix[(depth_level, style_level)] + + +def _prompt_contract( + preset: DifficultyPreset, + style_level: ExplanationStyleLevel, + language: str, +) -> str: + if language == NoteLanguage.ZH.value: + return ( + "Parameter contract:\n" + f"- Explanation Depth = {_depth_label(preset.level)},只控制“讲多少”:{preset.depth_instruction}\n" + f"- Lecture Style Level = {_style_label(style_level)},只控制“怎么讲”:{_style_instruction(style_level, language)}\n" + f"- 组合语义:{_combination_instruction(preset.level, style_level, language)}\n" + "Compression policy:\n" + f"- {preset.compression_instruction}\n" + "- 优先删除:模板过渡、重复背景、空泛总结、“这一部分讲什么”的元描述、低信息量套话。\n" + "- 优先保留:核心概念、关键机制、关键公式或关键论证、关键推导桥梁句、方法之间为什么衔接的逻辑。\n" + ) + + return ( + "Parameter contract:\n" + f"- Explanation Depth = {_depth_label(preset.level)} controls only how much to cover: {preset.depth_instruction}\n" + f"- Lecture Style Level = {_style_label(style_level)} controls only how to explain: {_style_instruction(style_level, language)}\n" + f"- Combination meaning: {_combination_instruction(preset.level, style_level, language)}\n" + "Compression policy:\n" + f"- {preset.compression_instruction}\n" + "- Delete first: template transitions, repeated background, vague summaries, meta descriptions of what the section covers, and low-information filler.\n" + "- Preserve first: core concepts, key mechanisms, essential formulas or arguments, derivation bridge sentences, and the logic explaining why methods or ideas connect.\n" + ) + + +async def generate_chunk_markdown( + chunk: GenerationChunk, + pages: list[PageIndexPage], + preset: DifficultyPreset, + language: str = "en", + style_level: ExplanationStyleLevel = ExplanationStyleLevel.MEDIUM, + document_plan: DocumentPlan | None = None, +) -> str: + page_lookup = _pages_by_number(pages) + context = _build_page_context(page_lookup, chunk.page_numbers, chunk.evidence) + if not context.strip(): + return _fallback_markdown(chunk, page_lookup, language) + + heading = "#" * 
max(2, min(chunk.heading_level, 5)) + prompt = ( + f"Write one Markdown-first study-note section in {_language_name(language)}.\n" + f"The final note content itself must be in {_language_name(language)}; this is independent of UI language.\n" + "Return Markdown only. Write a real online lecture note section, not a page digest, slide script, " + "summary expansion, or fixed template.\n" + f"Section path: {' > '.join(chunk.section_path)}\n" + f"Page range: {chunk.page_start}-{chunk.page_end}\n" + f"Section summary: {chunk.section_summary}\n" + f"Required heading: {heading} {' / '.join(chunk.section_path)}\n" + f"{_prompt_contract(preset, style_level, language)}\n" + f"Document outline:\n{_document_outline(document_plan)}\n\n" + "Requirements:\n" + f"- Start exactly with the required Markdown heading.\n" + "- Organize by the knowledge thread of this section: what problem/concept is being explained, what mechanism or argument makes it work, and why the ideas connect.\n" + "- Do not average-compress every page. Select evidence according to the depth/style contract above.\n" + '- Do not write page-by-page explanations, presentation speech, or template phrases such as "this section introduces".\n' + "- Use natural explanatory paragraphs as the main body. Lists, tables, and callouts are supporting material only.\n" + "- Use ### subheadings only when they reflect real conceptual turns, not a fixed Definition/Example/Summary template.\n" + "- When a formula, theorem, algorithm, mechanism, causal chain, or argument is essential, explain the problem it solves, why it is introduced, and what role it plays in the method.\n" + "- For formulas or algorithmic updates, do not merely quote the expression. 
When the evidence supports it, derive it step by step from the preceding definition, objective, constraint, or mechanism; otherwise explain the missing derivation assumption clearly.\n" + "- For algorithms, identify the input/state/objective/update, why each update is shaped that way, and how the update reduces the original problem.\n" + "- For mechanisms, explain the chain from condition to process to consequence, including the point where the mechanism changes the outcome.\n" + "- For High style, increase rigor through precise definitions, boundaries, mechanisms, and argument quality; do not force formulas for non-mathematical material.\n" + "- Use numbered lists for algorithms, procedures, or causal sequences." + "- End with a short section summary callout that states the knowledge takeaway, not a generic recap.\n" + "- Markdown math contract: inline math must use `$...$`; display math and multi-step derivations must use `$$...$$` on their own lines.\n" + "- Never use unsupported wrappers such as `\\(...\\)` or `\\[...\\]`, and never mix wrappers like `$$\\(...\\)$$`.\n" + "- For function names, API names, code-like expressions, or pseudocode in prose, use backticks such as `f(x)` or `softmax(x)`, not math delimiters.\n" + "- Keep grounding traceable by mentioning source page ranges only when it clarifies evidence.\n\n" + "- Use bold sparingly for key concepts, mechanism names, theorem names, algorithm names, and likely confusion points. Do not overuse bold. Prefer 1–3 bold phrases per paragraph and never bold full sentences." + "- When appropriate, briefly point out one common misunderstanding, confusion, or misuse of the concept, and clarify it directly." + f"Section-grounded evidence:\n{context}" + ) + + try: + response = await llm_complete( + prompt=prompt, + system_prompt=( + "You turn a PageIndex-style section plan and page evidence into coherent online lecture notes. " + "You never produce slide narration or page-by-page commentary." 
+ ), + temperature=0.35, + ) + cleaned = normalize_structure_note_markdown(_strip_markdown_fence(response)) + return cleaned if cleaned else _fallback_markdown(chunk, page_lookup, language) + except Exception: + return _fallback_markdown(chunk, page_lookup, language) + + +def inject_image_placeholders( + chunks: list[GenerationChunk], + pages: list[PageIndexPage], + purpose: str, +) -> list[GenerationChunk]: + page_lookup = _pages_by_number(pages) + for chunk in chunks: + image_pages = [ + page_number + for page_number in chunk.page_numbers + if page_lookup.get(page_number) and page_lookup[page_number].image_candidates + ] + if not image_pages: + continue + page_hint = image_pages[0] + placeholder_id = f"{chunk.chunk_id}-image-1" + token = f"[[IMAGE_PLACEHOLDER:{placeholder_id}:{page_hint}:{purpose}]]" + if token not in chunk.markdown: + chunk.markdown = f"{chunk.markdown.rstrip()}\n\n{token}\n" + chunk.placeholder_ids.append(placeholder_id) + return chunks diff --git a/deeptutor/services/structure_note/image_pipeline.py b/deeptutor/services/structure_note/image_pipeline.py new file mode 100644 index 000000000..6c2532087 --- /dev/null +++ b/deeptutor/services/structure_note/image_pipeline.py @@ -0,0 +1,132 @@ +from __future__ import annotations + +from pathlib import Path +import re + +from .models import CitationEntry, GenerationChunk, ImagePlaceholder, PageIndexPage + +_PLACEHOLDER_RE = re.compile( + r"\[\[IMAGE_PLACEHOLDER:(?P[^:\]]+):(?P\d+):(?P[^\]]+)\]\]" +) + + +def _pages_map(pages: list[PageIndexPage]) -> dict[int, PageIndexPage]: + return {page.page_number: page for page in pages} + + +def _figure_caption(chunk: GenerationChunk, page_number: int | None, language: str) -> str: + topic = (chunk.section_summary or chunk.section_title).strip() + if len(topic) > 120: + topic = f"{topic[:120].rstrip()}..." 
+ if language == "zh": + source = f"第 {page_number} 页" if page_number else "对应页" + return ( + f"图示来源:{source}。该图对应本节“{chunk.section_title}”的核心内容:" + f"{topic or '结构、过程或关键例子'}" + ) + source = f"page {page_number}" if page_number else "the source page" + return ( + f"Figure from {source}. It supports the explanation of {chunk.section_title} by showing " + f"{topic or 'the structure, process, or key example'} discussed in the section." + ) + + +def _render_page_crop( + pdf_path: Path, page_number: int, output_path: Path, clip: list[float] | None = None +) -> None: + import fitz + + document = fitz.open(pdf_path) + try: + page = document[page_number - 1] + rect = fitz.Rect(clip) if clip else page.rect + pix = page.get_pixmap(matrix=fitz.Matrix(2, 2), clip=rect, alpha=False) + pix.save(output_path) + finally: + document.close() + + +def process_images( + chunks: list[GenerationChunk], + pages: list[PageIndexPage], + pdf_path: Path, + images_dir: Path, + source_file: str, + language: str = "en", +) -> tuple[list[GenerationChunk], list[ImagePlaceholder], list[CitationEntry]]: + images_dir.mkdir(parents=True, exist_ok=True) + page_lookup = _pages_map(pages) + placeholders: list[ImagePlaceholder] = [] + citations: list[CitationEntry] = [] + + for chunk in chunks: + if not chunk.placeholder_ids: + continue + + def replace(match: re.Match[str]) -> str: + placeholder_id = match.group("placeholder_id") + page_hint = int(match.group("page_hint")) + purpose = match.group("purpose") + candidates = [] + for page_number in chunk.page_numbers: + page = page_lookup.get(page_number) + if not page: + continue + candidates.extend(page.image_candidates) + + placeholder = ImagePlaceholder( + placeholder_id=placeholder_id, + chunk_id=chunk.chunk_id, + page_hint=page_hint, + purpose=purpose, + ) + + image_name = f"{placeholder_id}.png" + image_path = images_dir / image_name + markdown_image_path = f"images/{image_name}" + + try: + if len(candidates) == 1: + candidate = candidates[0] + 
_render_page_crop(pdf_path, candidate.page_number, image_path, candidate.bbox) + placeholder.status = "filled" + placeholder.image_path = markdown_image_path + placeholder.resolved_page = candidate.page_number + placeholder.resolved_region = candidate.bbox + else: + fallback_page = page_hint if page_hint in page_lookup else chunk.page_start + _render_page_crop(pdf_path, fallback_page, image_path, None) + placeholder.status = "fallback_page" + placeholder.image_path = markdown_image_path + placeholder.resolved_page = fallback_page + placeholder.resolved_region = None + + placeholders.append(placeholder) + citations.append( + CitationEntry( + citation_id=f"cite-{placeholder_id}", + section_path=chunk.section_path, + page_start=chunk.page_start, + page_end=chunk.page_end, + source_file=source_file, + source_kind="image", + image_page=placeholder.resolved_page, + image_region=placeholder.resolved_region, + excerpt=purpose, + ) + ) + caption = _figure_caption(chunk, placeholder.resolved_page, language) + return f"![{purpose}]({markdown_image_path})\n\n*{caption}*" + except Exception as exc: + placeholder.status = "fallback_text" + placeholder.error = str(exc) + placeholders.append(placeholder) + return ( + "> Figure reference unavailable for this page range." 
+ if language != "zh" + else "> 当前页范围的图片引用暂不可用。" + ) + + chunk.markdown = _PLACEHOLDER_RE.sub(replace, chunk.markdown) + + return chunks, placeholders, citations diff --git a/deeptutor/services/structure_note/manager.py b/deeptutor/services/structure_note/manager.py new file mode 100644 index 000000000..b4cd9bc8c --- /dev/null +++ b/deeptutor/services/structure_note/manager.py @@ -0,0 +1,592 @@ +from __future__ import annotations + +from datetime import datetime +from pathlib import Path +from typing import Any, Literal +import uuid + +from deeptutor.logging import get_logger + +from .difficulty import get_difficulty_preset +from .generator import ( + build_generation_chunks, + generate_chunk_markdown, + generate_transition_markdown, + inject_image_placeholders, +) +from .image_pipeline import process_images +from .markdown_postprocessor import normalize_structure_note_markdown +from .models import ( + CitationEntry, + DifficultyLevel, + DocumentPlan, + ExplanationStyleLevel, + GenerationChunk, + JobStatus, + NoteLanguage, + PageIndexPage, + SectionTreeNode, + StructureNoteArtifact, + StructureNoteProject, +) +from .normalizer import normalize_to_pdf +from .page_index import build_page_index +from .planner import build_document_plan +from .renderer import render_pdf +from .storage import StructureNoteStorage +from .tree_builder import build_section_tree + + +class StructureNoteManager: + def __init__(self, storage: StructureNoteStorage | None = None): + self.storage = storage or StructureNoteStorage() + self.logger = get_logger("StructureNote") + + def _normalize_project_name(self, project_name: str | None) -> str: + normalized = (project_name or "").strip() + if not normalized: + raise ValueError("Project name is required.") + if "/" in normalized or "\\" in normalized: + raise ValueError("Project name cannot contain path separators.") + return normalized + + def _artifact_project_name(self, artifact: StructureNoteArtifact) -> str: + return artifact.project_name 
or artifact.source_ref.get("kb_name") or "Local Uploads" + + def list_projects(self) -> list[StructureNoteProject]: + projects_by_name: dict[str, StructureNoteProject] = { + self._normalize_project_name(project.name): project + for project in self.storage.read_projects() + } + for artifact in self.list_jobs(): + project_name = self._artifact_project_name(artifact) + existing = projects_by_name.get(project_name) + if existing is None: + projects_by_name[project_name] = StructureNoteProject( + name=project_name, + created_at=artifact.created_at, + updated_at=artifact.updated_at, + ) + elif artifact.updated_at > existing.updated_at: + existing.updated_at = artifact.updated_at + return sorted(projects_by_name.values(), key=lambda item: item.name.lower()) + + def create_project(self, project_name: str) -> StructureNoteProject: + name = self._normalize_project_name(project_name) + projects = {project.name: project for project in self.list_projects()} + if name in projects: + raise ValueError(f"Project already exists: {name}") + timestamp = self.storage.new_timestamp() + project = StructureNoteProject(name=name, created_at=timestamp, updated_at=timestamp) + projects[name] = project + self.storage.write_projects(sorted(projects.values(), key=lambda item: item.name.lower())) + return project + + def ensure_project(self, project_name: str) -> StructureNoteProject: + name = self._normalize_project_name(project_name) + for project in self.list_projects(): + if project.name == name: + return project + return self.create_project(name) + + def rename_project(self, old_name: str, new_name: str) -> StructureNoteProject: + old_project_name = self._normalize_project_name(old_name) + new_project_name = self._normalize_project_name(new_name) + if old_project_name == new_project_name: + self.ensure_project(new_project_name) + return next( + project for project in self.list_projects() if project.name == new_project_name + ) + + projects = {project.name: project for project in 
self.list_projects()} + if old_project_name not in projects: + raise FileNotFoundError(f"Project not found: {old_project_name}") + if new_project_name in projects: + raise ValueError(f"Project already exists: {new_project_name}") + + renamed = projects.pop(old_project_name) + renamed.name = new_project_name + renamed.updated_at = self.storage.new_timestamp() + projects[new_project_name] = renamed + + for artifact in self.list_jobs(): + if self._artifact_project_name(artifact) != old_project_name: + continue + artifact.project_name = new_project_name + self.storage.touch_updated_at(artifact) + self.storage.write_artifact(artifact) + + self.storage.write_projects(sorted(projects.values(), key=lambda item: item.name.lower())) + return renamed + + def delete_project(self, project_name: str) -> list[str]: + name = self._normalize_project_name(project_name) + projects = {project.name: project for project in self.list_projects()} + if name not in projects: + raise FileNotFoundError(f"Project not found: {name}") + + deleted_job_ids: list[str] = [] + for artifact in self.list_jobs(): + if self._artifact_project_name(artifact) != name: + continue + deleted_job_ids.append(artifact.job_id) + self.storage.delete_job_dir(artifact.job_id) + + projects.pop(name, None) + self.storage.write_projects(sorted(projects.values(), key=lambda item: item.name.lower())) + return deleted_job_ids + + def create_job( + self, + file_name: str, + source_format: str, + difficulty_level: DifficultyLevel, + note_language: NoteLanguage, + style_level: ExplanationStyleLevel, + source_path: Path, + task_id: str, + job_id: str | None = None, + project_name: str | None = None, + note_title: str | None = None, + source_kind: Literal["upload", "knowledge_base"] = "upload", + source_ref: dict[str, str] | None = None, + ) -> StructureNoteArtifact: + job_id = job_id or ( + f"structure_note_{datetime.utcnow().strftime('%Y%m%d_%H%M%S')}_{uuid.uuid4().hex[:8]}" + ) + self.storage.ensure_job_dirs(job_id) + 
inferred_title = Path(file_name).stem or file_name + if project_name: + project_name = self.ensure_project(project_name).name + artifact = StructureNoteArtifact( + job_id=job_id, + file_name=file_name, + source_format=source_format, + difficulty_level=difficulty_level, + note_language=note_language, + style_level=style_level, + project_name=project_name, + note_title=note_title or inferred_title, + source_kind=source_kind, + source_ref=source_ref or {}, + status=JobStatus.QUEUED, + source_path=str(source_path), + task_id=task_id, + retry_state=None, + error=None, + created_at=self.storage.new_timestamp(), + updated_at=self.storage.new_timestamp(), + ) + return self.storage.write_artifact(artifact) + + def list_jobs(self) -> list[StructureNoteArtifact]: + return self.storage.list_artifacts() + + def get_job(self, job_id: str) -> StructureNoteArtifact: + return self.storage.read_artifact(job_id) + + def update_status( + self, + artifact: StructureNoteArtifact, + status: JobStatus, + *, + error: str | None = None, + retry_state: str | None = None, + task_id: str | None = None, + ) -> StructureNoteArtifact: + artifact.status = status + artifact.error = error + artifact.retry_state = retry_state + if task_id is not None: + artifact.task_id = task_id + self.storage.touch_updated_at(artifact) + return self.storage.write_artifact(artifact) + + def serialize_job(self, artifact: StructureNoteArtifact) -> dict[str, Any]: + citations: list[dict[str, Any]] = [] + if artifact.citation_manifest_path and Path(artifact.citation_manifest_path).exists(): + citations = self.storage.read_json(Path(artifact.citation_manifest_path)) + section_tree = self._load_section_tree_payload(artifact.section_tree_path) + return { + "job_id": artifact.job_id, + "file_name": artifact.file_name, + "status": artifact.status.value, + "source_format": artifact.source_format, + "difficulty_level": artifact.difficulty_level.value, + "note_language": artifact.note_language.value, + "style_level": 
artifact.style_level.value, + "project_name": artifact.project_name, + "note_title": artifact.note_title, + "source_kind": artifact.source_kind, + "source_ref": artifact.source_ref, + "final_pdf_url": self.storage.output_url_for(artifact.final_pdf_path), + "rendered_markdown_url": self.storage.output_url_for(artifact.rendered_markdown_path), + "asset_base_url": self.storage.output_url_for( + self.storage.get_job_dir(artifact.job_id) + ), + "sections": section_tree, + "citations": citations, + "retry_available": artifact.status == JobStatus.FAILED, + "error": artifact.error, + "task_id": artifact.task_id, + "created_at": artifact.created_at, + "updated_at": artifact.updated_at, + } + + async def run_job(self, job_id: str, task_id: str, emit_log) -> StructureNoteArtifact: + artifact = self.get_job(job_id) + artifact = self.update_status(artifact, JobStatus.QUEUED, error=None, task_id=task_id) + job_dirs = self.storage.ensure_job_dirs(job_id) + language = artifact.note_language.value + preset = get_difficulty_preset(artifact.difficulty_level) + style_level = artifact.style_level + + try: + emit_log(task_id, "Preparing Structure Note job.") + + normalized_pdf_path = job_dirs["normalized"] / "normalized.pdf" + artifact = self.update_status( + artifact, JobStatus.NORMALIZING, retry_state=JobStatus.NORMALIZING.value + ) + if artifact.normalized_pdf_path and Path(artifact.normalized_pdf_path).exists(): + normalized_pdf_path = Path(artifact.normalized_pdf_path) + emit_log(task_id, "Reusing normalized PDF.") + else: + emit_log(task_id, "Normalizing source file to PDF.") + normalized_pdf_path = normalize_to_pdf( + Path(artifact.source_path), job_dirs["normalized"] + ) + artifact.normalized_pdf_path = str(normalized_pdf_path) + self.storage.write_artifact(artifact) + + page_index_path = job_dirs["index"] / "page_index.json" + artifact = self.update_status( + artifact, JobStatus.INDEXING, retry_state=JobStatus.INDEXING.value + ) + if artifact.page_index_path and 
Path(artifact.page_index_path).exists(): + emit_log(task_id, "Reusing existing page index.") + page_index = [ + PageIndexPage.model_validate(item) + for item in self.storage.read_json(Path(artifact.page_index_path)) + ] + else: + emit_log(task_id, "Building page-level index.") + page_index = build_page_index(normalized_pdf_path) + artifact.page_index_path = str( + self.storage.write_json( + page_index_path, + [page.model_dump(mode="json") for page in page_index], + ) + ) + self.storage.write_artifact(artifact) + + section_tree_path = job_dirs["index"] / "section_tree.json" + artifact = self.update_status( + artifact, JobStatus.PLANNING, retry_state=JobStatus.PLANNING.value + ) + if artifact.section_tree_path and Path(artifact.section_tree_path).exists(): + emit_log(task_id, "Reusing section tree.") + section_tree = [ + SectionTreeNode.model_validate(item) + for item in self.storage.read_json(Path(artifact.section_tree_path)) + ] + else: + emit_log(task_id, "Deriving section tree.") + section_tree = await build_section_tree( + page_index, preset.page_window, language=language + ) + artifact.section_tree_path = str( + self.storage.write_json( + section_tree_path, + [node.model_dump(mode="json") for node in section_tree], + ) + ) + self.storage.write_artifact(artifact) + + document_plan_path = job_dirs["index"] / "document_plan.json" + if artifact.document_plan_path and Path(artifact.document_plan_path).exists(): + emit_log(task_id, "Reusing document-level plan.") + document_plan = DocumentPlan.model_validate( + self.storage.read_json(Path(artifact.document_plan_path)) + ) + else: + emit_log(task_id, "Building document-level section plan.") + document_plan = build_document_plan( + page_index, + section_tree, + document_title=artifact.file_name, + language=language, + ) + artifact.document_plan_path = str( + self.storage.write_json( + document_plan_path, + document_plan.model_dump(mode="json"), + ) + ) + self.storage.write_artifact(artifact) + + chunks_path = 
job_dirs["chunks"] / "generation_chunks.json" + artifact = self.update_status( + artifact, JobStatus.GENERATING, retry_state=JobStatus.GENERATING.value + ) + if artifact.generation_chunks_path and Path(artifact.generation_chunks_path).exists(): + emit_log(task_id, "Reusing generated chunks.") + chunks = [ + GenerationChunk.model_validate(item) + for item in self.storage.read_json(Path(artifact.generation_chunks_path)) + ] + else: + emit_log(task_id, "Generating section-level Markdown notes.") + chunks = build_generation_chunks( + page_index, + section_tree, + preset, + document_plan=document_plan, + ) + for chunk in chunks: + chunk.markdown = await generate_chunk_markdown( + chunk, + page_index, + preset, + language=language, + style_level=style_level, + document_plan=document_plan, + ) + chunks = inject_image_placeholders(chunks, page_index, preset.placeholder_purpose) + artifact.generation_chunks_path = str( + self.storage.write_json( + chunks_path, + [chunk.model_dump(mode="json") for chunk in chunks], + ) + ) + self.storage.write_artifact(artifact) + + image_state_path = job_dirs["chunks"] / "image_fill_state.json" + artifact = self.update_status( + artifact, + JobStatus.PROCESSING_IMAGES, + retry_state=JobStatus.PROCESSING_IMAGES.value, + ) + if artifact.rendered_markdown_path and Path(artifact.rendered_markdown_path).exists(): + emit_log(task_id, "Reusing rendered markdown after image processing.") + rendered_path = Path(artifact.rendered_markdown_path) + markdown_text = normalize_structure_note_markdown( + rendered_path.read_text(encoding="utf-8") + ) + rendered_path.write_text(markdown_text, encoding="utf-8") + image_citations = self._load_image_citations(artifact.image_fill_state_path) + else: + emit_log(task_id, "Resolving figure placeholders.") + chunks, placeholders, image_citations = process_images( + chunks, + page_index, + normalized_pdf_path, + job_dirs["images"], + artifact.file_name, + language=language, + ) + self.storage.write_json( + 
chunks_path, + [chunk.model_dump(mode="json") for chunk in chunks], + ) + artifact.image_fill_state_path = str( + self.storage.write_json( + image_state_path, + { + "placeholders": [item.model_dump(mode="json") for item in placeholders], + "image_citations": [ + item.model_dump(mode="json") for item in image_citations + ], + }, + ) + ) + transition_map = await self._build_transition_map( + chunks, + language=language, + style_level=style_level, + document_plan=document_plan, + ) + markdown_text = normalize_structure_note_markdown( + self._compose_markdown( + artifact, + chunks, + language, + transition_map=transition_map, + ) + ) + artifact.rendered_markdown_path = str( + self.storage.write_text(job_dirs["final"] / "rendered.md", markdown_text) + ) + self.storage.write_artifact(artifact) + + artifact = self.update_status( + artifact, JobStatus.RENDERING, retry_state=JobStatus.RENDERING.value + ) + citations = self._build_text_citations(chunks, artifact.file_name) + citations.extend(image_citations) + emit_log(task_id, "Rendering final PDF.") + final_pdf_path, citation_path = render_pdf( + markdown_text, + title=artifact.file_name, + citation_entries=citations, + job_dir=job_dirs["job"], + final_dir=job_dirs["final"], + ) + artifact.final_pdf_path = str(final_pdf_path) + artifact.citation_manifest_path = str(citation_path) + artifact = self.update_status( + artifact, JobStatus.READY, retry_state=JobStatus.READY.value + ) + self.storage.apply_retention_policy(artifact) + emit_log(task_id, "Structure Note is ready.") + return artifact + except Exception as exc: + self.logger.error(f"Structure Note job failed: {exc}", exc_info=True) + emit_log(task_id, f"Structure Note failed: {exc}") + return self.update_status( + artifact, + JobStatus.FAILED, + error=str(exc), + retry_state=artifact.status.value, + ) + + def _compose_markdown( + self, + artifact: StructureNoteArtifact, + chunks: list[GenerationChunk], + language: str, + *, + transition_map: dict[str, str] | None = 
None, + ) -> str: + heading = self._compose_markdown_heading(artifact, chunks, language) + sections: list[str] = [] + previous_major: GenerationChunk | None = None + for chunk in chunks: + if not chunk.markdown.strip(): + continue + if chunk.heading_level <= 2: + if previous_major is not None: + transition = (transition_map or {}).get(chunk.section_id or chunk.chunk_id, "") + if transition.strip(): + sections.append(transition.strip()) + previous_major = chunk + sections.append( + f'\n\n{chunk.markdown.strip()}' + ) + return f"{heading}\n\n" + "\n\n".join(sections) + + async def _build_transition_map( + self, + chunks: list[GenerationChunk], + *, + language: str, + style_level: ExplanationStyleLevel, + document_plan: DocumentPlan | None, + ) -> dict[str, str]: + transition_map: dict[str, str] = {} + previous_major: GenerationChunk | None = None + for chunk in chunks: + if not chunk.markdown.strip() or chunk.heading_level > 2: + continue + if previous_major is not None: + key = chunk.section_id or chunk.chunk_id + transition_map[key] = await generate_transition_markdown( + previous_major, + chunk, + language=language, + style_level=style_level, + document_plan=document_plan, + ) + previous_major = chunk + return transition_map + + def _compose_markdown_heading( + self, + artifact: StructureNoteArtifact, + chunks: list[GenerationChunk], + language: str, + ) -> str: + if language == "zh": + lines = [ + f"# {artifact.file_name}", + "", + f"> 结构化讲义。难度:`{artifact.difficulty_level.value}`。", + "", + "## 讲义目录", + ] + for chunk in chunks: + page_range = ( + f"第 {chunk.page_start}-{chunk.page_end} 页" + if chunk.page_start != chunk.page_end + else f"第 {chunk.page_start} 页" + ) + indent = " " * max(0, chunk.heading_level - 2) + lines.append(f"{indent}- {chunk.section_title}({page_range})") + lines.append("") + return "\n".join(lines) + + lines = [ + f"# {artifact.file_name}", + "", + f"> Structured lecture note. 
Difficulty: `{artifact.difficulty_level.value}`.", + "", + "## Lecture Outline", + ] + for chunk in chunks: + page_range = ( + f"pages {chunk.page_start}-{chunk.page_end}" + if chunk.page_start != chunk.page_end + else f"page {chunk.page_start}" + ) + indent = " " * max(0, chunk.heading_level - 2) + lines.append(f"{indent}- {chunk.section_title} ({page_range})") + lines.append("") + return "\n".join(lines) + + def _build_text_citations( + self, chunks: list[GenerationChunk], source_file: str + ) -> list[CitationEntry]: + citations: list[CitationEntry] = [] + for chunk in chunks: + evidence_excerpt = " ".join( + item.excerpt for item in chunk.evidence if item.excerpt + ).strip() + excerpt = evidence_excerpt or chunk.markdown.replace("\n", " ").strip() + if len(excerpt) > 240: + excerpt = f"{excerpt[:240]}..." + citations.append( + CitationEntry( + citation_id=f"cite-{chunk.chunk_id}", + section_path=chunk.section_path, + page_start=chunk.page_start, + page_end=chunk.page_end, + source_file=source_file, + source_kind="text", + excerpt=excerpt or None, + ) + ) + return citations + + def _load_section_tree_payload(self, section_tree_path: str | None) -> list[dict[str, Any]]: + if not section_tree_path: + return [] + path = Path(section_tree_path) + if not path.exists(): + return [] + payload = self.storage.read_json(path) + if not isinstance(payload, list): + return [] + return [item for item in payload if isinstance(item, dict)] + + def _load_image_citations(self, image_fill_state_path: str | None) -> list[CitationEntry]: + if not image_fill_state_path: + return [] + state_path = Path(image_fill_state_path) + if not state_path.exists(): + return [] + payload = self.storage.read_json(state_path) + return [ + CitationEntry.model_validate(item) + for item in payload.get("image_citations", []) + if isinstance(item, dict) + ] diff --git a/deeptutor/services/structure_note/markdown_postprocessor.py b/deeptutor/services/structure_note/markdown_postprocessor.py new file mode 
100644
index 000000000..f39b92a4d
--- /dev/null
+++ b/deeptutor/services/structure_note/markdown_postprocessor.py
@@ -0,0 +1,252 @@
+from __future__ import annotations
+
+from collections.abc import Callable
+from dataclasses import dataclass
+import re
+
+_FENCE_RE = re.compile(r"(```[\s\S]*?```|~~~[\s\S]*?~~~)")
+_INLINE_CODE_RE = re.compile(r"`[^`\n]+`")
+_BRACKET_BLOCK_RE = re.compile(r"\\\[([\s\S]*?)\\\]")
+_PAREN_INLINE_RE = re.compile(r"\\\(([\s\S]*?)\\\)")
+_SINGLE_DOLLAR_RE = re.compile(r"(?<!\$)\$(?!\$)([^$\n]+?)(?<!\$)\$(?!\$)")
+_DOUBLE_DOLLAR_RE = re.compile(r"\$\$([\s\S]+?)\$\$")
+_LATEX_ENV_RE = re.compile(r"\\begin\{([A-Za-z]+\*?)\}([\s\S]*?)\\end\{\1\}")
+_MATH_SIGNAL_RE = re.compile(
+    r"(\\[A-Za-z]+|[=<>^_{}]|[+\-*/]\s*[A-Za-z0-9]|[A-Za-z0-9]\s*[+\-*/]|"
+    r"[≤≥≈≠→←×÷±∈∉∪∩∞∑∫√])"
+)
+_CODE_LIKE_CALL_RE = re.compile(r"^[A-Za-z_][\w.]*\([A-Za-z0-9_.,\s:'\"-]*\)$")
+
+
+@dataclass(frozen=True)
+class MarkdownValidationResult:
+    warnings: list[str]
+
+    @property
+    def ok(self) -> bool:
+        return not self.warnings
+
+
+def normalize_structure_note_markdown(markdown_text: str) -> str:
+    """Normalize Structure Note Markdown for the current online/PDF renderers.
+
+    The Structure Note render path uses remark-math/KaTeX online and a PDF
+    fallback renderer. Both paths expect inline math as ``$...$`` and display
+    math as ``$$...$$``. This pass converts unsupported wrappers, repairs common
+    mixed wrappers, and keeps code-like calls as inline code.
+ """ + + if not markdown_text: + return "" + + parts = _split_fenced_code(markdown_text) + normalized = [ + part if is_fence else _normalize_non_fenced_markdown(part) for is_fence, part in parts + ] + return re.sub(r"\n{3,}", "\n\n", "".join(normalized)).strip() + "\n" + + +def validate_renderer_compatible_markdown(markdown_text: str) -> MarkdownValidationResult: + warnings: list[str] = [] + for is_fence, part in _split_fenced_code(markdown_text): + if is_fence: + continue + protected, _restore = _protect_inline_code(part) + if re.search(r"\\[\(\)\[\]]", protected): + warnings.append("Unsupported LaTeX wrapper remains after normalization.") + if _has_inline_double_dollar(protected): + warnings.append("Inline double-dollar math remains after normalization.") + warnings.extend(_dangling_math_delimiter_warnings(protected)) + for kind, expression in _iter_math_expressions(protected): + warnings.extend(_validate_math_expression(expression, kind)) + return MarkdownValidationResult(warnings=warnings) + + +def _split_fenced_code(markdown_text: str) -> list[tuple[bool, str]]: + parts: list[tuple[bool, str]] = [] + last = 0 + for match in _FENCE_RE.finditer(markdown_text): + if match.start() > last: + parts.append((False, markdown_text[last : match.start()])) + parts.append((True, match.group(0))) + last = match.end() + if last < len(markdown_text): + parts.append((False, markdown_text[last:])) + return parts or [(False, markdown_text)] + + +def _protect_inline_code(text: str) -> tuple[str, Callable[[str], str]]: + protected: list[str] = [] + + def stash(match: re.Match[str]) -> str: + protected.append(match.group(0)) + return f"\u0000CODE{len(protected) - 1}\u0000" + + def restore(value: str) -> str: + for index, original in enumerate(protected): + value = value.replace(f"\u0000CODE{index}\u0000", original) + return value + + return _INLINE_CODE_RE.sub(stash, text), restore + + +def _normalize_non_fenced_markdown(text: str) -> str: + protected, restore = 
_protect_inline_code(text) + normalized = _LATEX_ENV_RE.sub(lambda match: _display_math(_latex_env_body(match)), protected) + normalized = _BRACKET_BLOCK_RE.sub(lambda match: _display_math(match.group(1)), normalized) + normalized = _PAREN_INLINE_RE.sub( + lambda match: _inline_math_replacement(match.group(1)), normalized + ) + normalized = _DOUBLE_DOLLAR_RE.sub(lambda match: _display_math(match.group(1)), normalized) + normalized = _SINGLE_DOLLAR_RE.sub( + lambda match: _single_dollar_replacement(match.group(1)), normalized + ) + return restore(normalized) + + +def _latex_env_body(match: re.Match[str]) -> str: + env_name = match.group(1).rstrip("*") + body = match.group(2).strip() + if env_name in {"align", "gather", "multline"}: + return f"\\begin{{aligned}}\n{body}\n\\end{{aligned}}" + return body + + +def _single_dollar_replacement(expression: str) -> str: + expression = _clean_math_expression(expression) + if _is_code_like_expression(expression): + return f"`{expression}`" + if _looks_like_math_expression(expression): + return _inline_math(expression) + return f"${expression}$" + + +def _inline_math_replacement(expression: str) -> str: + expression = _clean_math_expression(expression) + if _is_code_like_expression(expression): + return f"`{expression}`" + return _inline_math(expression) + + +def _inline_math(expression: str) -> str: + body = _clean_math_expression(expression) + return f"${body}$" if body else "" + + +def _display_math(expression: str) -> str: + body = _clean_math_expression(expression) + return f"\n\n$$\n{body}\n$$\n\n" if body else "" + + +def _clean_math_expression(expression: str) -> str: + body = expression.strip() + changed = True + while changed: + changed = False + for opener, closer in (("\\(", "\\)"), ("\\[", "\\]"), ("$$", "$$"), ("$", "$")): + if ( + body.startswith(opener) + and body.endswith(closer) + and len(body) > len(opener) + len(closer) + ): + body = body[len(opener) : len(body) - len(closer)].strip() + changed = True + body 
= re.sub(r"\n{3,}", "\n\n", body) + return body + + +def _is_code_like_expression(expression: str) -> bool: + if "\\" in expression or "\n" in expression: + return False + if any(symbol in expression for symbol in ("=", "^", "_", "<", ">", "+", "*", "/", "|")): + return False + return bool(_CODE_LIKE_CALL_RE.fullmatch(expression.strip())) + + +def _looks_like_math_expression(expression: str) -> bool: + expr = expression.strip() + if not expr: + return False + if _MATH_SIGNAL_RE.search(expr): + return True + return bool(re.fullmatch(r"[A-Za-z](?:_[A-Za-z0-9]+)?|[A-Za-z]\d*", expr)) + + +def _has_inline_double_dollar(text: str) -> bool: + for match in _DOUBLE_DOLLAR_RE.finditer(text): + before = text[: match.start()].rsplit("\n", 1)[-1].strip() + after = text[match.end() :].split("\n", 1)[0].strip() + if before or after: + return True + return False + + +def _iter_math_expressions(text: str) -> list[tuple[str, str]]: + expressions: list[tuple[str, str]] = [] + display_spans: list[tuple[int, int]] = [] + for match in _DOUBLE_DOLLAR_RE.finditer(text): + expressions.append(("display", match.group(1).strip())) + display_spans.append((match.start(), match.end())) + + def is_inside_display(match: re.Match[str]) -> bool: + return any(start <= match.start() and match.end() <= end for start, end in display_spans) + + for match in _SINGLE_DOLLAR_RE.finditer(text): + if not is_inside_display(match): + expressions.append(("inline", match.group(1).strip())) + return expressions + + +def _validate_math_expression(expression: str, kind: str) -> list[str]: + warnings: list[str] = [] + body = expression.strip() + if not body: + warnings.append(f"Empty {kind} math expression.") + return warnings + if "\\(" in body or "\\)" in body or "\\[" in body or "\\]" in body: + warnings.append(f"Unsupported LaTeX wrapper remains inside {kind} math.") + if "$" in body: + warnings.append(f"Nested dollar delimiter remains inside {kind} math.") + if not _balanced_braces(body, "{", "}"): + 
+        warnings.append(f"Unbalanced braces in {kind} math: {body[:80]}")
+    if not _balanced_braces(body, "(", ")"):
+        warnings.append(f"Unbalanced parentheses in {kind} math: {body[:80]}")
+    if not _balanced_braces(body, "[", "]"):
+        warnings.append(f"Unbalanced brackets in {kind} math: {body[:80]}")
+    return warnings
+
+
+def _balanced_braces(value: str, opener: str, closer: str) -> bool:
+    depth = 0
+    escaped = False
+    for char in value:
+        if escaped:
+            escaped = False
+            continue
+        if char == "\\":
+            escaped = True
+            continue
+        if char == opener:
+            depth += 1
+        elif char == closer:
+            depth -= 1
+            if depth < 0:
+                return False
+    return depth == 0
+
+
+def _dangling_math_delimiter_warnings(text: str) -> list[str]:
+    warnings: list[str] = []
+    without_display = _DOUBLE_DOLLAR_RE.sub("", text)
+    for line in without_display.splitlines():
+        scan = re.sub(r"\\\$", "", line)
+        scan = re.sub(r"\$\d+(?:[.,]\d+)?", "", scan)
+        single_dollars = [match.start() for match in re.finditer(r"(?<!\$)\$(?!\$)", scan)]
+        if len(single_dollars) % 2 == 1:
+            warnings.append(f"Dangling inline math delimiter: {line.strip()[:80]}")
+    return warnings
diff --git a/deeptutor/services/structure_note/normalizer.py b/deeptutor/services/structure_note/normalizer.py
new file mode 100644
--- /dev/null
+++ b/deeptutor/services/structure_note/normalizer.py
@@ -0,0 +1,49 @@
+from __future__ import annotations
+
+from pathlib import Path
+import shutil
+import subprocess
+
+
+class NormalizationError(RuntimeError):
+    """Raised when a Structure Note source file cannot be normalized to PDF."""
+
+
+def normalize_to_pdf(source_path: Path, output_dir: Path) -> Path:
+    output_dir.mkdir(parents=True, exist_ok=True)
+    suffix = source_path.suffix.lower()
+    target_pdf = output_dir / "normalized.pdf"
+
+    if suffix == ".pdf":
+        shutil.copy2(source_path, target_pdf)
+        return target_pdf
+
+    if suffix not in {".ppt", ".pptx"}:
+        raise NormalizationError(f"Unsupported file type for Structure Note: {suffix}")
+
+    soffice = shutil.which("soffice")
+    if not soffice:
+        raise NormalizationError(
+            "LibreOffice is required for PPT/PPTX uploads. Install `soffice` and retry."
+ ) + + command = [ + soffice, + "--headless", + "--convert-to", + "pdf", + "--outdir", + str(output_dir), + str(source_path), + ] + result = subprocess.run(command, capture_output=True, text=True, check=False) + if result.returncode != 0: + stderr = result.stderr.strip() or result.stdout.strip() or "Unknown conversion error" + raise NormalizationError(f"Failed to convert PPT/PPTX to PDF: {stderr}") + + converted_pdf = output_dir / f"{source_path.stem}.pdf" + if not converted_pdf.exists(): + raise NormalizationError("LibreOffice reported success but did not produce a PDF output.") + + converted_pdf.replace(target_pdf) + return target_pdf diff --git a/deeptutor/services/structure_note/page_index.py b/deeptutor/services/structure_note/page_index.py new file mode 100644 index 000000000..be559f920 --- /dev/null +++ b/deeptutor/services/structure_note/page_index.py @@ -0,0 +1,108 @@ +from __future__ import annotations + +from pathlib import Path +import re +from statistics import median + +from .models import ImageCandidate, PageIndexPage, TextBlock, TitleCandidate + +_HEADING_PATTERN = re.compile(r"^(\d+([.\-]\d+)*|[IVXLC]+|[A-Z])[\).:\s-]") + + +def _bbox_list(bbox: tuple[float, float, float, float] | list[float]) -> list[float]: + return [float(value) for value in bbox] + + +def build_page_index(pdf_path: Path) -> list[PageIndexPage]: + try: + import fitz + except ImportError as exc: # pragma: no cover - dependency is runtime-required + raise RuntimeError("PyMuPDF is required for Structure Note indexing.") from exc + + pages: list[PageIndexPage] = [] + document = fitz.open(pdf_path) + try: + for page_index, page in enumerate(document, start=1): + raw = page.get_text("dict") + blocks: list[TextBlock] = [] + title_candidates: list[TitleCandidate] = [] + image_candidates: list[ImageCandidate] = [] + font_sizes: list[float] = [] + max_font_size = 0.0 + + for block in raw.get("blocks", []): + block_type = int(block.get("type", 0)) + bbox = _bbox_list(block.get("bbox", [0, 
0, 0, 0])) + if block_type == 1: + width = float(bbox[2] - bbox[0]) + height = float(bbox[3] - bbox[1]) + page_area = max(page.rect.width * page.rect.height, 1.0) + image_candidates.append( + ImageCandidate( + candidate_id=f"img-{page_index}-{len(image_candidates) + 1}", + page_number=page_index, + bbox=bbox, + width=width, + height=height, + area_ratio=(width * height) / page_area, + ) + ) + continue + + lines = block.get("lines", []) + for line in lines: + spans = line.get("spans", []) + text = "".join(str(span.get("text", "")) for span in spans).strip() + if not text: + continue + span_sizes = [float(span.get("size", 0.0) or 0.0) for span in spans] + line_font_size = max(span_sizes) if span_sizes else None + if line_font_size: + font_sizes.extend(span_sizes) + max_font_size = max(max_font_size, line_font_size) + line_bbox = _bbox_list(line.get("bbox", bbox)) + blocks.append( + TextBlock( + text=text, + bbox=line_bbox, + font_size=line_font_size, + ) + ) + + size_median = median(font_sizes) if font_sizes else 0.0 + top_threshold = page.rect.height * 0.45 + for block in blocks: + font_size = block.font_size or 0.0 + heading_like = bool(_HEADING_PATTERN.match(block.text)) + large_enough = font_size >= max(size_median + 1.5, max_font_size * 0.82, 11.5) + near_top = block.bbox[1] <= top_threshold + if len(block.text) > 160: + continue + if not (heading_like or (large_enough and near_top)): + continue + title_candidates.append( + TitleCandidate( + text=block.text, + page_number=page_index, + bbox=block.bbox, + font_size=font_size or None, + score=round((font_size or 0.0) + (3 if heading_like else 0), 3), + ) + ) + + page_text = "\n".join(block.text for block in blocks).strip() + pages.append( + PageIndexPage( + page_number=page_index, + width=float(page.rect.width), + height=float(page.rect.height), + text=page_text, + text_blocks=blocks, + title_candidates=title_candidates, + image_candidates=image_candidates, + ) + ) + finally: + document.close() + + return pages 
diff --git a/deeptutor/services/structure_note/planner.py b/deeptutor/services/structure_note/planner.py new file mode 100644 index 000000000..3e7de795b --- /dev/null +++ b/deeptutor/services/structure_note/planner.py @@ -0,0 +1,125 @@ +from __future__ import annotations + +import re + +from .models import DocumentPlan, PageIndexPage, SectionEvidence, SectionPlan, SectionTreeNode + + +def _clean_text(text: str, limit: int) -> str: + cleaned = re.sub(r"\s+", " ", text).strip() + if len(cleaned) <= limit: + return cleaned + return f"{cleaned[:limit].rstrip()}..." + + +def _page_lookup(pages: list[PageIndexPage]) -> dict[int, PageIndexPage]: + return {page.page_number: page for page in pages} + + +def _section_evidence( + pages: list[PageIndexPage], + page_numbers: list[int], + *, + excerpt_limit: int = 900, +) -> list[SectionEvidence]: + lookup = _page_lookup(pages) + evidence: list[SectionEvidence] = [] + for page_number in page_numbers: + page = lookup.get(page_number) + if not page: + continue + excerpt = _clean_text(page.text, excerpt_limit) + if not excerpt and not page.image_candidates and not page.title_candidates: + continue + evidence.append( + SectionEvidence( + page_number=page_number, + excerpt=excerpt, + title_candidates=[candidate.text for candidate in page.title_candidates[:5]], + image_candidate_ids=[ + candidate.candidate_id for candidate in page.image_candidates[:8] + ], + ) + ) + return evidence + + +def _fallback_summary(evidence: list[SectionEvidence]) -> str: + for item in evidence: + if item.excerpt: + return _clean_text(item.excerpt, 220) + return "" + + +def _build_page_to_sections(sections: list[SectionPlan]) -> dict[str, list[str]]: + mapping: dict[str, list[str]] = {} + for section in sections: + for page_number in section.page_numbers: + mapping.setdefault(str(page_number), []).append(section.section_id) + return mapping + + +def _document_summary(sections: list[SectionPlan], language: str) -> str: + titles = [section.title for section in 
sections[:8]] + if not titles: + return ( + "No extractable section structure was found." + if language != "zh" + else "未提取到可用章节结构。" + ) + joined = " / ".join(titles) + if language == "zh": + return f"本讲义围绕 {joined} 等章节组织内容。" + return f"This note is organized around {joined}." + + +def build_document_plan( + pages: list[PageIndexPage], + sections: list[SectionTreeNode], + *, + document_title: str, + language: str = "en", +) -> DocumentPlan: + """Create the Structure Note planning backbone from the PageIndex-style tree. + + This intentionally stays inside Structure Note. It uses the document tree as the + retrieval surface, then attaches page-grounded evidence for section generation. + """ + + section_plans: list[SectionPlan] = [] + previous_section_id: str | None = None + + for node in sorted(sections, key=lambda item: (item.page_start, item.level, item.section_id)): + page_numbers = list(range(node.page_start, node.page_end + 1)) + evidence = _section_evidence(pages, page_numbers) + summary = node.summary.strip() or _fallback_summary(evidence) + writing_goal = ( + f"Explain {node.title} as a coherent study-note section using pages {node.page_start}-{node.page_end}." 
+ if language != "zh" + else f"基于第 {node.page_start}-{node.page_end} 页,把“{node.title}”写成连贯的学习讲义章节。" + ) + dependencies = [previous_section_id] if previous_section_id else [] + section_plans.append( + SectionPlan( + section_id=node.section_id, + title=node.title, + level=node.level, + section_path=node.path or [node.title], + page_start=node.page_start, + page_end=node.page_end, + page_numbers=page_numbers, + summary=summary, + writing_goal=writing_goal, + dependencies=dependencies, + evidence=evidence, + ) + ) + previous_section_id = node.section_id + + return DocumentPlan( + document_title=document_title, + document_summary=_document_summary(section_plans, language), + outline=section_plans, + section_order=[section.section_id for section in section_plans], + page_to_sections=_build_page_to_sections(section_plans), + ) diff --git a/deeptutor/services/structure_note/renderer.py b/deeptutor/services/structure_note/renderer.py new file mode 100644 index 000000000..0572660e9 --- /dev/null +++ b/deeptutor/services/structure_note/renderer.py @@ -0,0 +1,178 @@ +from __future__ import annotations + +from html import escape +import json +from pathlib import Path +import re + +from .markdown_postprocessor import ( + normalize_structure_note_markdown, + validate_renderer_compatible_markdown, +) +from .models import CitationEntry + +_STYLE = """ +@page { + margin: 20mm 16mm; +} +body { + font-family: "Helvetica", Arial, sans-serif; + color: #202427; + font-size: 11pt; + line-height: 1.65; +} +h1, h2, h3, h4 { + color: #111827; + page-break-after: avoid; + text-wrap: balance; +} +h1 { + font-size: 22pt; + margin-bottom: 8mm; +} +h2 { + font-size: 16pt; + margin-top: 8mm; +} +h3 { + font-size: 13pt; + margin-top: 6mm; +} +p, li { + orphans: 3; + widows: 3; +} +img { + max-width: 100%; + border-radius: 4px; + margin: 6mm 0 2mm; +} +figure, table, pre { + page-break-inside: avoid; +} +code { + background: #f3f4f6; + padding: 0.1rem 0.25rem; + border-radius: 3px; +} +.math-inline 
{ + font-family: "Courier New", monospace; + background: #f9fafb; + border-radius: 3px; + padding: 0.05rem 0.2rem; +} +blockquote { + border-left: 3px solid #d1d5db; + color: #4b5563; + padding-left: 12px; + margin-left: 0; +} +.math-block { + display: block; + margin: 4mm 0; + padding: 3mm 4mm; + background: #f9fafb; + border: 1px solid #e5e7eb; + border-radius: 4px; + overflow-wrap: anywhere; +} +.math-block pre { + margin: 0; + white-space: pre-wrap; + font-family: "Courier New", monospace; + font-size: 10pt; + line-height: 1.45; + background: transparent; +} +""" + + +class RenderError(RuntimeError): + pass + + +_MATH_BLOCK_RE = re.compile(r"(? str: + parts: list[str] = [] + last = 0 + for match in _FENCE_RE.finditer(markdown_text): + if match.start() > last: + parts.append(_render_math_in_non_fenced_text(markdown_text[last : match.start()])) + parts.append(match.group(0)) + last = match.end() + if last < len(markdown_text): + parts.append(_render_math_in_non_fenced_text(markdown_text[last:])) + return "".join(parts) + + +def _render_math_in_non_fenced_text(markdown_text: str) -> str: + def replace_display(match: re.Match[str]) -> str: + expression = match.group(1).strip() + if not expression: + return "" + return f'\n\n
{escape(expression)}
\n\n' + + def replace_inline(match: re.Match[str]) -> str: + expression = match.group(1).strip() + if not expression: + return "" + return f'{escape(expression)}' + + rendered = _MATH_BLOCK_RE.sub(replace_display, markdown_text) + return _MATH_INLINE_RE.sub(replace_inline, rendered) + + +def render_pdf( + markdown_text: str, + title: str, + citation_entries: list[CitationEntry], + job_dir: Path, + final_dir: Path, +) -> tuple[Path, Path]: + try: + from markdown import markdown + except ImportError as exc: # pragma: no cover - runtime dependency + raise RenderError( + "The `markdown` package is required for Structure Note rendering." + ) from exc + + try: + from weasyprint import HTML + except ImportError as exc: + raise RenderError( + "WeasyPrint is required for Structure Note PDF export. Install `weasyprint` and retry." + ) from exc + + markdown_text = normalize_structure_note_markdown(markdown_text) + validation = validate_renderer_compatible_markdown(markdown_text) + if not validation.ok: + detail = "; ".join(validation.warnings) + raise RenderError(f"Structure Note Markdown contains unsupported math syntax: {detail}") + + final_dir.mkdir(parents=True, exist_ok=True) + html_ready_markdown = _render_math_for_pdf(markdown_text) + html_body = markdown(html_ready_markdown, extensions=["extra", "fenced_code", "tables", "toc"]) + html = ( + "" + f"{title}" + "" + f"{html_body}" + ) + + pdf_path = final_dir / "final.pdf" + HTML(string=html, base_url=str(job_dir)).write_pdf(str(pdf_path)) + + citation_path = final_dir / "citation_manifest.json" + with open(citation_path, "w", encoding="utf-8") as handle: + json.dump( + [entry.model_dump(mode="json") for entry in citation_entries], + handle, + ensure_ascii=False, + indent=2, + ) + + return pdf_path, citation_path diff --git a/deeptutor/services/structure_note/storage.py b/deeptutor/services/structure_note/storage.py new file mode 100644 index 000000000..a5d49f0af --- /dev/null +++ 
b/deeptutor/services/structure_note/storage.py @@ -0,0 +1,185 @@ +from __future__ import annotations + +from datetime import datetime +import json +import os +from pathlib import Path +import shutil +from typing import Any + +from deeptutor.services.path_service import PathService, get_path_service + +from .models import StructureNoteArtifact, StructureNoteProject + + +def _utc_now() -> str: + return datetime.utcnow().isoformat() + + +class StructureNoteStorage: + def __init__(self, path_service: PathService | None = None): + self.path_service = path_service or get_path_service() + + def get_root_dir(self) -> Path: + return self.path_service.get_structure_note_dir() + + def get_job_dir(self, job_id: str) -> Path: + return self.path_service.get_structure_note_job_dir(job_id) + + def ensure_job_dirs(self, job_id: str) -> dict[str, Path]: + job_dir = self.get_job_dir(job_id) + dirs = { + "job": job_dir, + "source": job_dir / "source", + "normalized": job_dir / "normalized", + "index": job_dir / "index", + "chunks": job_dir / "chunks", + "images": job_dir / "images", + "final": job_dir / "final", + } + for directory in dirs.values(): + directory.mkdir(parents=True, exist_ok=True) + return dirs + + def artifact_path(self, job_id: str) -> Path: + return self.get_job_dir(job_id) / "artifact.json" + + def projects_path(self) -> Path: + return self.get_root_dir() / "projects.json" + + def write_artifact(self, artifact: StructureNoteArtifact) -> StructureNoteArtifact: + payload = artifact.model_dump(mode="json") + path = self.artifact_path(artifact.job_id) + path.parent.mkdir(parents=True, exist_ok=True) + with open(path, "w", encoding="utf-8") as handle: + json.dump(payload, handle, indent=2, ensure_ascii=False) + return artifact + + def read_artifact(self, job_id: str) -> StructureNoteArtifact: + with open(self.artifact_path(job_id), encoding="utf-8") as handle: + return StructureNoteArtifact.model_validate(json.load(handle)) + + def artifact_exists(self, job_id: str) -> 
bool: + return self.artifact_path(job_id).exists() + + def delete_job_dir(self, job_id: str) -> None: + shutil.rmtree(self.get_job_dir(job_id), ignore_errors=True) + + def list_artifacts(self) -> list[StructureNoteArtifact]: + artifacts: list[StructureNoteArtifact] = [] + root = self.get_root_dir() + if not root.exists(): + return artifacts + for child in root.iterdir(): + if not child.is_dir(): + continue + artifact_path = child / "artifact.json" + if not artifact_path.exists(): + continue + try: + with open(artifact_path, encoding="utf-8") as handle: + artifacts.append(StructureNoteArtifact.model_validate(json.load(handle))) + except Exception: + continue + artifacts.sort(key=lambda item: item.updated_at, reverse=True) + return artifacts + + def read_projects(self) -> list[StructureNoteProject]: + path = self.projects_path() + if not path.exists(): + return [] + try: + with open(path, encoding="utf-8") as handle: + payload = json.load(handle) + except Exception: + return [] + if not isinstance(payload, list): + return [] + projects: list[StructureNoteProject] = [] + for item in payload: + if not isinstance(item, dict): + continue + try: + projects.append(StructureNoteProject.model_validate(item)) + except Exception: + continue + return projects + + def write_projects(self, projects: list[StructureNoteProject]) -> list[StructureNoteProject]: + path = self.projects_path() + path.parent.mkdir(parents=True, exist_ok=True) + with open(path, "w", encoding="utf-8") as handle: + json.dump( + [project.model_dump(mode="json") for project in projects], + handle, + indent=2, + ensure_ascii=False, + ) + return projects + + def write_json(self, path: Path, payload: Any) -> Path: + path.parent.mkdir(parents=True, exist_ok=True) + with open(path, "w", encoding="utf-8") as handle: + json.dump(payload, handle, indent=2, ensure_ascii=False) + return path + + def read_json(self, path: Path) -> Any: + with open(path, encoding="utf-8") as handle: + return json.load(handle) + + def 
write_text(self, path: Path, content: str) -> Path: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(content, encoding="utf-8") + return path + + def output_url_for(self, path: str | Path | None) -> str | None: + if not path: + return None + candidate = Path(path).resolve() + root = self.path_service.get_user_root().resolve() + try: + relative = candidate.relative_to(root) + except ValueError: + return None + return f"/api/outputs/{relative.as_posix()}" + + def get_retention_mode(self) -> str: + default_mode = "full" if os.getenv("PYTEST_CURRENT_TEST") else "minimal" + mode = os.getenv("STRUCTURE_NOTE_RETENTION_MODE", default_mode).strip().lower() + return mode if mode in {"full", "minimal"} else default_mode + + def apply_retention_policy(self, artifact: StructureNoteArtifact) -> None: + if self.get_retention_mode() != "minimal": + return + + preserved: set[Path] = {self.artifact_path(artifact.job_id).resolve()} + for candidate in ( + artifact.final_pdf_path, + artifact.citation_manifest_path, + artifact.rendered_markdown_path, + artifact.page_index_path, + artifact.section_tree_path, + artifact.document_plan_path, + artifact.generation_chunks_path, + artifact.image_fill_state_path, + ): + if candidate: + preserved.add(Path(candidate).resolve()) + + for directory_name in ("normalized", "index", "chunks"): + directory = self.get_job_dir(artifact.job_id) / directory_name + if not directory.exists(): + continue + for child in directory.rglob("*"): + if child.is_dir(): + continue + if child.resolve() in preserved: + continue + child.unlink(missing_ok=True) + + def touch_updated_at(self, artifact: StructureNoteArtifact) -> StructureNoteArtifact: + artifact.updated_at = _utc_now() + return artifact + + def new_timestamp(self) -> str: + return _utc_now() diff --git a/deeptutor/services/structure_note/tree_builder.py b/deeptutor/services/structure_note/tree_builder.py new file mode 100644 index 000000000..bcf0121bd --- /dev/null +++ 
b/deeptutor/services/structure_note/tree_builder.py @@ -0,0 +1,205 @@ +from __future__ import annotations + +from collections import defaultdict +import re + +from deeptutor.services.llm import complete as llm_complete +from deeptutor.utils.json_parser import parse_json_response + +from .models import PageIndexPage, SectionTreeNode + + +def _collect_heading_candidates(pages: list[PageIndexPage]) -> list[dict[str, object]]: + grouped: dict[tuple[int, str], dict[str, object]] = {} + for page in pages: + for candidate in page.title_candidates: + key = (candidate.page_number, candidate.text.strip()) + current = grouped.get(key) + payload = { + "page_number": candidate.page_number, + "text": candidate.text.strip(), + "font_size": candidate.font_size, + "score": candidate.score, + } + if current is None or float(payload["score"] or 0.0) > float( + current.get("score") or 0.0 + ): + grouped[key] = payload + candidates = list(grouped.values()) + candidates.sort(key=lambda item: (int(item["page_number"]), -float(item.get("score") or 0.0))) + return candidates[:60] + + +def _clean_excerpt(text: str, limit: int = 520) -> str: + cleaned = re.sub(r"\s+", " ", text).strip() + if len(cleaned) <= limit: + return cleaned + return f"{cleaned[:limit].rstrip()}..." 
+ + +def _page_overviews(pages: list[PageIndexPage], limit: int = 36) -> list[dict[str, object]]: + overviews: list[dict[str, object]] = [] + for page in pages[:limit]: + overviews.append( + { + "page_number": page.page_number, + "heading_candidates": [candidate.text for candidate in page.title_candidates[:5]], + "excerpt": _clean_excerpt(page.text, 480), + "image_count": len(page.image_candidates), + } + ) + return overviews + + +def _summary_from_pages(pages: list[PageIndexPage], page_start: int, page_end: int) -> str: + snippets: list[str] = [] + for page in pages: + if page_start <= page.page_number <= page_end and page.text.strip(): + snippets.append(_clean_excerpt(page.text, 180)) + if len(snippets) >= 2: + break + return " ".join(snippets) + + +def _fallback_sections(pages: list[PageIndexPage], page_window: int) -> list[SectionTreeNode]: + nodes: list[SectionTreeNode] = [] + for index, start in enumerate(range(1, len(pages) + 1, page_window), start=1): + end = min(start + page_window - 1, len(pages)) + title = f"Pages {start}-{end}" if start != end else f"Page {start}" + nodes.append( + SectionTreeNode( + section_id=f"fallback-{index:03d}", + title=title, + level=2, + page_start=start, + page_end=end, + summary=_summary_from_pages(pages, start, end), + path=[title], + ) + ) + return nodes + + +def _coerce_sections( + raw_sections: list[dict[str, object]], total_pages: int +) -> list[dict[str, object]]: + sections: list[dict[str, object]] = [] + seen: set[tuple[int, str]] = set() + for raw in raw_sections: + title = str(raw.get("title", "")).strip() + if not title: + continue + try: + page_start = int(raw.get("page_start", 0)) + except Exception: + continue + if page_start < 1 or page_start > total_pages: + continue + try: + level = int(raw.get("level", 2)) + except Exception: + level = 2 + level = max(2, min(level, 5)) + summary = str(raw.get("summary", "")).strip() + key = (page_start, title) + if key in seen: + continue + seen.add(key) + sections.append( + 
{"title": title, "page_start": page_start, "level": level, "summary": summary} + ) + sections.sort( + key=lambda item: (int(item["page_start"]), int(item["level"]), str(item["title"])) + ) + return sections + + +async def build_section_tree( + pages: list[PageIndexPage], + page_window: int, + language: str = "en", +) -> list[SectionTreeNode]: + total_pages = len(pages) + candidates = _collect_heading_candidates(pages) + if len(candidates) < 2: + return _fallback_sections(pages, page_window) + + prompt = ( + "You are building a PageIndex-style document tree for Structure Note only.\n" + "PageIndex uses a table-of-contents-like hierarchy first, then retrieves by reasoning over that tree. " + "Your task is to infer a stable document-level outline, not page-by-page notes.\n" + "Return JSON only with shape " + '{"sections": [{"title": str, "level": 2-5, "page_start": int, "summary": str}]}\n' + "Rules:\n" + "- Keep the sections in reading order.\n" + "- Use only headings that are strongly supported by the candidates.\n" + "- Use levels 2-5 only.\n" + "- Do not invent page numbers outside the document.\n" + "- Include 3-20 sections depending on the material.\n" + "- Merge repeated slide headers into one section when they represent the same topic.\n" + "- Summaries should describe the section topic in one short sentence.\n" + f"- Output language: {'Chinese' if language == 'zh' else 'English'}.\n\n" + f"Total pages: {total_pages}\n" + f"Heading candidates: {candidates}\n" + f"Page overviews: {_page_overviews(pages)}" + ) + + try: + raw_response = await llm_complete( + prompt=prompt, + system_prompt="You convert page heading candidates into structured JSON.", + temperature=0.1, + ) + parsed = parse_json_response(raw_response, fallback={}) + raw_sections = parsed.get("sections") if isinstance(parsed, dict) else None + if not isinstance(raw_sections, list): + raise ValueError("Missing sections array") + sections = _coerce_sections(raw_sections, total_pages) + except 
Exception: + sections = [] + + if not sections: + return _fallback_sections(pages, page_window) + + nodes: list[SectionTreeNode] = [] + stack: list[SectionTreeNode] = [] + child_map: dict[str, list[str]] = defaultdict(list) + + for index, section in enumerate(sections): + title = str(section["title"]) + page_start = int(section["page_start"]) + level = int(section["level"]) + + while stack and stack[-1].level >= level: + stack.pop() + + parent_id = stack[-1].section_id if stack else None + path = [*stack[-1].path, title] if stack else [title] + node = SectionTreeNode( + section_id=f"section-{index + 1:03d}", + title=title, + level=level, + page_start=page_start, + page_end=page_start, + summary=str(section.get("summary") or ""), + parent_id=parent_id, + path=path, + ) + if parent_id: + child_map[parent_id].append(node.section_id) + nodes.append(node) + stack.append(node) + + for index, node in enumerate(nodes): + next_boundary = total_pages + 1 + for later in nodes[index + 1 :]: + if later.level <= node.level: + next_boundary = later.page_start + break + node.page_end = max(node.page_start, min(total_pages, next_boundary - 1)) + if not node.summary: + node.summary = _summary_from_pages(pages, node.page_start, node.page_end) + + for node in nodes: + node.child_ids = child_map.get(node.section_id, []) + return nodes diff --git a/docs/features/overview.md b/docs/features/overview.md new file mode 100644 index 000000000..d257ccb89 --- /dev/null +++ b/docs/features/overview.md @@ -0,0 +1,162 @@ +# 🏛️ DeepTutor's Framework + +DeepTutor Full-Stack Workflow + +## 💬 User Interface Layer +• **Intuitive Interaction**: Simple bidirectional query-response flow for intuitive interaction. +• **Structured Output**: Structured response generation that organizes complex information into actionable outputs. +• **Dark/Light Mode**: System-wide theme support with automatic system preference detection. 
+• **Collapsible Sidebar**: Compact navigation with icon-only mode for focused learning. + +## 🤖 Intelligent Agent Modules +• **Problem Solving & Assessment**: Step-by-step problem solving and custom assessment generation. +• **Research & Learning**: Deep Research for topic exploration and Guided Learning with visualization. +• **Idea Generation**: Automated and interactive concept development with multi-source insights. + +## 🔧 Tool Integration Layer +• **Information Retrieval**: RAG hybrid retrieval, real-time web search, and academic paper databases. +• **Processing & Analysis**: Python code execution, query item lookup, and PDF parsing for document analysis. +• **Multi-Provider Support**: Flexible LLM providers (OpenAI, Anthropic, Ollama, etc.) and embedding adapters (OpenAI, Jina, Cohere, Ollama, etc.). + +## 🧠 Knowledge & Memory Foundation +• **Knowledge Graph**: Entity-relation mapping for semantic connections and knowledge discovery. +• **Vector Store**: Embedding-based semantic search for intelligent content retrieval. +• **Memory System**: Session state management and citation tracking for contextual continuity. + +--- + +# Key Features of DeepTutor + +## 📚 Massive Document Knowledge Q&A + +• **Smart Knowledge Base**: Upload textbooks, research papers, technical manuals, and domain-specific documents. Build a comprehensive AI-powered knowledge repository for instant access. + +• **Multi-Agent Problem Solving**: Dual-loop reasoning architecture with RAG, web search, paper search, and code execution—delivering step-by-step solutions with precise citations. + +## 🎨 Interactive Learning Visualization + +• **Knowledge Simplification & Explanations**: Transform complex concepts, knowledge, and algorithms into easy-to-understand visual aids, detailed step-by-step breakdowns, and engaging interactive demonstrations. 
+ +• **Personalized Q&A**: Context-aware conversations that adapt to your learning progress, with interactive pages and session-based knowledge tracking. + +## 🎯 Knowledge Reinforcement with Practice Problem Generator + +• **Intelligent Exercise Creation**: Generate targeted quizzes, practice problems, and customized assessments tailored to your current knowledge level and specific learning objectives. + +• **Authentic Exam Simulation**: Upload reference exams to generate practice questions that perfectly match the original style, format, and difficulty—giving you realistic preparation for the actual test. + +## 🔍 Deep Research & Idea Generation + +• **Comprehensive Research & Literature Review**: Conduct in-depth topic exploration with systematic analysis. Identify patterns, connect related concepts across disciplines, and synthesize existing research findings. + +• **Novel Insight Discovery**: Generate structured learning materials and uncover knowledge gaps. Identify promising new research directions through intelligent cross-domain knowledge synthesis. + +--- + + + + + + + + +
+ +### 📚 Massive Document Knowledge Q&A + +Document Q&A demonstration + +Document Q&A and Step-by-Step Problem Solving + + + +### 🎨 Interactive Learning Visualization + +Interactive learning visualization demo + +Interactive AI Learning with Knowledge Visual Explanations + +
+ + + +### 🎯 Knowledge Reinforcement + + + + + + +
+ +Custom question generation demo + +**Custom Questions** +Auto-Validated Practice Questions with Instant Feedback + + + +Mimic exam style questions demo + +**Mimic Questions** +Clone Exam Style for Authentic Practice + +
+ + + +### 🔍 Deep Research & Idea Generation + + + + + + + +
+ +Deep research with web and paper search demo + +**Deep Research** +Web and Paper Search with Literature Review + + + +Automated idea generation demo + +**Automated IdeaGen** +Systematic Brainstorming and Concept Synthesis + + + +Interactive idea generation demo + +**Interactive IdeaGen** +RAG-powered Idea Generation with Multi-Source Insights + +
+ + + +### 🏗️ All-in-One Knowledge System + + + + + + +
+ +Personal knowledge base demo + +**Personal Knowledge Base** +Build and Organize Your Own Knowledge Repository + + + +Personal notebook demo + +**Personal Notebook** +Your Contextual Memory for Learning Sessions + +
diff --git a/docs/guide/data-preparation.md b/docs/guide/data-preparation.md new file mode 100644 index 000000000..7b502b06d --- /dev/null +++ b/docs/guide/data-preparation.md @@ -0,0 +1,185 @@ +# Data Preparation + +DeepTutor provides demo knowledge bases and sample questions to help you get started quickly. + +## Demo Knowledge Bases + +We provide two pre-built knowledge bases on [Google Drive](https://drive.google.com/drive/folders/1iWwfZXiTuQKQqUYb5fGDZjLCeTUP6DA6?usp=sharing): + +### 1. Research Papers Collection + +
+
+ 📄 + 5 Research Papers (20-50 pages each) +
+
+

A curated collection of cutting-edge research papers from our lab, covering the RAG and Agent fields.

+

Included Papers:

+ +

Best for: Research scenarios, broad knowledge coverage

+
+
+ +### 2. Data Science Textbook + +
+
+ 📚 + 8 Chapters, 296 Pages +
+
+

A comprehensive textbook on deep representation learning from UC Berkeley.

+

Source: Deep Representation Learning Book

+

Topics Covered:

+
    +
  • Neural Network Fundamentals
  • +
  • Representation Learning
  • +
  • Deep Learning Architectures
  • +
  • Advanced Topics
  • +
+

Best for: Learning scenarios, deep knowledge coverage

+
+
+ +## Download & Setup + +### Step 1: Download + +Visit our [Google Drive folder](https://drive.google.com/drive/folders/1iWwfZXiTuQKQqUYb5fGDZjLCeTUP6DA6?usp=sharing) and download: + +- `knowledge_bases.zip` - Pre-built knowledge bases with embeddings +- `questions.zip` - Sample questions and usage examples (optional) + +### Step 2: Extract + +Extract the downloaded files into the `data/` directory: + +``` +DeepTutor/ +├── data/ +│ └── knowledge_bases/ +│ ├── research_papers/ # Research papers KB +│ ├── data_science_book/ # Textbook KB +│ └── kb_config.json # Knowledge base config +└── user/ # User data (auto-created) +``` + +### Step 3: Verify + +After extracting, your knowledge bases will be automatically available when you start DeepTutor. + +::: warning Embedding Compatibility +Our demo knowledge bases use `text-embedding-3-large` with `dimensions = 3072`. + +If your embedding model has different dimensions, you'll need to create your own knowledge base instead. +::: + +## Creating Custom Knowledge Bases + +### Supported File Formats + +| Format | Extension | Notes | +|:-------|:----------|:------| +| PDF | `.pdf` | Supports text extraction and layout analysis | +| Text | `.txt` | Plain text files | +| Markdown | `.md` | Markdown with formatting support | + +### Via Web Interface + +1. Navigate to `http://localhost:3782/knowledge` +2. Click **"New Knowledge Base"** +3. Enter a unique name for your knowledge base +4. Upload your documents (single or batch upload) +5. 
Wait for processing to complete + +::: tip Processing Time +- Small documents (< 10 pages): ~1 minute +- Medium documents (10-100 pages): ~5-10 minutes +- Large documents (100+ pages): May take longer +::: + +### Via Command Line + +```bash +# Initialize a new knowledge base with documents +python -m src.knowledge.start_kb init --docs + +# Add documents to existing knowledge base +python -m src.knowledge.add_documents --docs +``` + +## Data Storage Structure + +All user data is stored in the `data/` directory: + +``` +data/ +├── knowledge_bases/ # Knowledge base storage +│ ├── / +│ │ ├── documents/ # Original documents +│ │ ├── chunks/ # Chunked content +│ │ ├── embeddings/ # Vector embeddings +│ │ └── graph/ # Knowledge graph data +└── user/ # User activity data + ├── solve/ # Problem solving results + ├── question/ # Generated questions + ├── research/ # Research reports + ├── notebook/ # Notebook records + └── logs/ # System logs +``` + +--- + +**Next Step:** [Local Installation →](/guide/local-start) + + diff --git a/docs/guide/local-start.md b/docs/guide/local-start.md new file mode 100644 index 000000000..e015eec86 --- /dev/null +++ b/docs/guide/local-start.md @@ -0,0 +1,190 @@ +# Local Installation + +This guide covers manual installation for development or non-Docker environments. + +## Prerequisites + +- **Python 3.10+** — [Download](https://www.python.org/downloads/) +- **Node.js 18+** — [Download](https://nodejs.org/) +- **Git** — [Download](https://git-scm.com/) + +::: tip Windows Users +If you encounter path length errors during installation, enable long path support: + +```cmd +reg add "HKLM\SYSTEM\CurrentControlSet\Control\FileSystem" /v LongPathsEnabled /t REG_DWORD /d 1 /f +``` + +Restart your terminal after running this command. 
+::: + +## Step 1: Set Up Virtual Environment + +Choose one of the following options: + +::: code-group + +```bash [Conda (Recommended)] +# Create environment +conda create -n deeptutor python=3.10 + +# Activate environment +conda activate deeptutor +``` + +```bash [venv] +# Create environment +python -m venv venv + +# Activate (Windows) +venv\Scripts\activate + +# Activate (macOS/Linux) +source venv/bin/activate +``` + +::: + +## Step 2: Install Dependencies + +### Option A: Automated Installation (Recommended) + +```bash +# Using Python script +python scripts/install_all.py + +# Or using shell script (macOS/Linux) +bash scripts/install_all.sh +``` + +### Option B: Manual Installation + +```bash +# Install Python dependencies +pip install -r requirements.txt + +# Install Node.js dependencies +npm install --prefix web +``` + +::: warning Common Issues +If you see `npm: command not found`: + +```bash +# Using Conda +conda install -c conda-forge nodejs + +# Or install from https://nodejs.org/ +``` +::: + +## Step 3: Configure Environment + +Make sure you have completed the [Pre-Configuration](/guide/pre-config) steps: + +1. ✅ Created `.env` file with your API keys +2. ✅ (Optional) Customized `config/agents.yaml` +3. ✅ (Optional) Downloaded demo knowledge bases + +## Step 4: Launch Application + +### Start Web Interface (Recommended) + +```bash +python scripts/start_web.py +``` + +This starts both the **frontend** (Next.js) and **backend** (FastAPI) servers. 
+ +### Alternative: CLI Interface Only + +```bash +python scripts/start.py +``` + +### Access URLs + +| Service | URL | Description | +|:---:|:---|:---| +| **Frontend** | http://localhost:3782 | Main web interface | +| **API Docs** | http://localhost:8001/docs | Interactive API documentation | + +## Advanced: Start Services Separately + +For development, you may want to run frontend and backend separately: + +### Backend (FastAPI) + +```bash +python src/api/run_server.py + +# Or with uvicorn directly +uvicorn src.api.main:app --host 0.0.0.0 --port 8001 --reload +``` + +### Frontend (Next.js) + +First, create `web/.env.local`: + +```bash +NEXT_PUBLIC_API_BASE=http://localhost:8001 +``` + +Then start the development server: + +```bash +cd web +npm install +npm run dev -- -p 3782 +``` + +## Stopping the Service + +Press `Ctrl+C` in the terminal to stop the service. + +::: warning Port Still in Use? +If you see "port already in use" after pressing Ctrl+C: + +**macOS/Linux:** +```bash +lsof -i :8001 +kill -9 +``` + +**Windows:** +```bash +netstat -ano | findstr :8001 +taskkill /PID /F +``` +::: + +## Troubleshooting + +### Backend fails to start + +**Checklist:** +- Confirm Python version >= 3.10: `python --version` +- Confirm all dependencies installed: `pip install -r requirements.txt` +- Check if port 8001 is in use +- Verify `.env` file configuration + +### Frontend cannot connect to backend + +**Solutions:** +1. Confirm backend is running: visit http://localhost:8001/docs +2. Check browser console for error messages +3. 
Create `web/.env.local` with: + ```bash + NEXT_PUBLIC_API_BASE=http://localhost:8001 + ``` + +### WebSocket connection fails + +**Checklist:** +- Confirm backend is running +- Check firewall settings +- Verify WebSocket URL format: `ws://localhost:8001/api/v1/...` + +--- + +**Next Step:** [Docker Deployment →](/guide/docker-start) diff --git a/docs/guide/pre-config.md b/docs/guide/pre-config.md new file mode 100644 index 000000000..4056263c6 --- /dev/null +++ b/docs/guide/pre-config.md @@ -0,0 +1,201 @@ +# Pre-Configuration + +Before starting DeepTutor, you need to complete the following setup steps. + +## 1. Clone Repository + +```bash +git clone https://github.com/HKUDS/DeepTutor.git +cd DeepTutor +``` + +## 2. Environment Variables Setup + +Create your `.env` file from the template: + +```bash +cp .env.example .env +``` + +Then edit the `.env` file with your API keys: + +```bash +# ============================================================================ +# Server Configuration +# ============================================================================ +BACKEND_PORT=8001 # Backend API port +FRONTEND_PORT=3782 # Frontend web port + +# For remote/LAN access - set to your server's IP address +# NEXT_PUBLIC_API_BASE=http://192.168.1.100:8001 + +# ============================================================================ +# LLM (Large Language Model) Configuration - Required +# ============================================================================ +LLM_BINDING=openai # Provider: openai, anthropic, azure_openai, ollama, etc. +LLM_MODEL=gpt-4o # Model name: gpt-4o, deepseek-chat, claude-3-5-sonnet, etc. 
+LLM_HOST=https://api.openai.com/v1 # API endpoint URL +LLM_API_KEY=your_api_key # Your LLM API key + +# ============================================================================ +# Embedding Model Configuration - Required for Knowledge Base +# ============================================================================ +EMBEDDING_BINDING=openai # Provider type +EMBEDDING_MODEL=text-embedding-3-large # Embedding model name +EMBEDDING_DIMENSION=3072 # Must match model dimensions +EMBEDDING_HOST=https://api.openai.com/v1 # API endpoint +EMBEDDING_API_KEY=your_api_key # Embedding API key + +# ============================================================================ +# Web Search Configuration - Optional +# ============================================================================ +SEARCH_PROVIDER=perplexity # Options: perplexity, tavily, serper, jina, exa, baidu +SEARCH_API_KEY=your_search_api_key # API key for search provider +``` + +### Environment Variables Reference + +| Variable | Required | Description | +|:---|:---:|:---| +| `LLM_MODEL` | **Yes** | Model name (e.g., `gpt-4o`, `deepseek-chat`) | +| `LLM_API_KEY` | **Yes** | Your LLM API key | +| `LLM_HOST` | **Yes** | API endpoint URL | +| `EMBEDDING_MODEL` | **Yes** | Embedding model name | +| `EMBEDDING_DIMENSION` | **Yes** | Must match model output dimensions | +| `EMBEDDING_API_KEY` | **Yes** | Embedding API key | +| `EMBEDDING_HOST` | **Yes** | Embedding API endpoint | +| `BACKEND_PORT` | No | Backend port (default: `8001`) | +| `FRONTEND_PORT` | No | Frontend port (default: `3782`) | +| `NEXT_PUBLIC_API_BASE` | No | Set for remote/LAN access | +| `SEARCH_PROVIDER` | No | Web search provider | +| `SEARCH_API_KEY` | No | Search API key | + +### Supported LLM Providers + +| Provider | `LLM_BINDING` Value | Notes | +|:---------|:--------------------|:------| +| OpenAI | `openai` | GPT-4o, GPT-4, GPT-3.5 | +| Anthropic | `anthropic` | Claude 3.5, Claude 3 | +| Azure OpenAI | `azure_openai` | Enterprise 
deployments | +| Ollama | `ollama` | Local models | +| DeepSeek | `deepseek` | DeepSeek-V3, DeepSeek-R1 | +| Groq | `groq` | Fast inference | +| OpenRouter | `openrouter` | Multi-model gateway | +| Google Gemini | `gemini` | OpenAI-compatible mode | + +### Supported Embedding Providers + +| Provider | `EMBEDDING_BINDING` Value | Notes | +|:---------|:--------------------------|:------| +| OpenAI | `openai` | text-embedding-3-large/small | +| Azure OpenAI | `azure_openai` | Enterprise deployments | +| Jina AI | `jina` | jina-embeddings-v3 | +| Cohere | `cohere` | embed-v3 series | +| Ollama | `ollama` | Local embedding models | +| LM Studio | `lm_studio` | Local inference server | +| HuggingFace | `huggingface` | OpenAI-compatible endpoints | + +## 3. Configuration Files + +DeepTutor uses two YAML configuration files for customization: + +### `config/agents.yaml` - Agent Parameters + +This file controls LLM parameters for each module: + +```yaml +# Solve Module - Problem solving agents +solve: + temperature: 0.3 + max_tokens: 8192 + +# Research Module - Deep research agents +research: + temperature: 0.5 + max_tokens: 12000 + +# Question Module - Question generation agents +question: + temperature: 0.7 + max_tokens: 4096 + +# Guide Module - Learning guidance agents +guide: + temperature: 0.5 + max_tokens: 16192 + +# IdeaGen Module - Idea generation agents +ideagen: + temperature: 0.7 + max_tokens: 4096 + +# CoWriter Module - Collaborative writing agents +co_writer: + temperature: 0.7 + max_tokens: 4096 +``` + +### `config/main.yaml` - System Settings + +This file controls paths, tools, and module-specific settings: + +```yaml +# System language +system: + language: en + +# Data paths +paths: + user_data_dir: ./data/user + knowledge_bases_dir: ./data/knowledge_bases + +# Tool configuration +tools: + rag_tool: + kb_base_dir: ./data/knowledge_bases + default_kb: ai_textbook + run_code: + workspace: ./data/user/run_code_workspace + web_search: + enabled: true + 
query_item: + enabled: true + max_results: 5 + +# Module-specific settings +research: + researching: + execution_mode: series # "series" or "parallel" + max_iterations: 5 + enable_rag_hybrid: true + enable_paper_search: true + enable_web_search: true +``` + +> **Tip:** For most users, the default configuration works well. Only modify these files if you need specific customizations. + +## 4. Knowledge Base Preparation (Optional) + +You can use our pre-built demo knowledge bases to get started quickly. + +### Download Demo Knowledge Bases + +Download from [Google Drive](https://drive.google.com/drive/folders/1iWwfZXiTuQKQqUYb5fGDZjLCeTUP6DA6?usp=sharing) and extract into the `data/` directory. + +::: info Important +The demo knowledge bases use `text-embedding-3-large` with `dimensions = 3072`. Make sure your embedding model has matching dimensions. +::: + +### Create Your Own Knowledge Base + +After launching DeepTutor: + +1. Navigate to `http://localhost:3782/knowledge` +2. Click **"New Knowledge Base"** +3. Enter a unique name +4. Upload PDF/TXT/MD files +5. Monitor progress in the terminal + +--- + +**Next Step:** [Data Preparation →](/guide/data-preparation) diff --git a/docs/guide/troubleshooting.md b/docs/guide/troubleshooting.md new file mode 100644 index 000000000..37803c3ff --- /dev/null +++ b/docs/guide/troubleshooting.md @@ -0,0 +1,170 @@ +# ❓ FAQ + +## Backend fails to start? + +**Checklist** +- Confirm Python version >= 3.10 +- Confirm all dependencies installed: `pip install -r requirements.txt` +- Check if port 8001 is in use (configurable in `config/main.yaml`) +- Check `.env` file configuration + +**Solutions** +- **Change port**: Edit `config/main.yaml` server.backend_port +- **Check logs**: Review terminal error messages + +--- + +## Port occupied after Ctrl+C? + +**Problem** + +After pressing Ctrl+C during a running task (e.g., deep research), restarting shows "port already in use" error. 
+ +**Cause** + +Ctrl+C sometimes only terminates the frontend process while the backend continues running in the background. + +**Solution** + +```bash +# macOS/Linux +kill -9 $(lsof -t -i :8001) + +# Windows +netstat -ano | findstr :8001 +taskkill /PID /F +``` + +Then restart the service with `python scripts/start_web.py`. + +--- + +## npm: command not found error? + +**Problem** + +Running `scripts/start_web.py` shows `npm: command not found` or exit status 127. + +**Checklist** +- Check if npm is installed: `npm --version` +- Check if Node.js is installed: `node --version` +- Confirm conda environment is activated (if using conda) + +**Solutions** +```bash +# Option A: Using Conda (Recommended) +conda install -c conda-forge nodejs + +# Option B: Using Official Installer +# Download from https://nodejs.org/ + +# Option C: Using nvm +nvm install 18 +nvm use 18 +``` + +**Verify Installation** +```bash +node --version # Should show v18.x.x or higher +npm --version # Should show version number +``` + +--- + +## Frontend cannot connect to backend? + +**Checklist** +- Confirm backend is running (visit `http://localhost:8001/docs`) +- Check browser console for error messages + +**Solution** + +Create `.env.local` in `web` directory: + +```bash +NEXT_PUBLIC_API_BASE=http://localhost:8001 +``` + +--- + +## WebSocket connection fails? + +**Checklist** +- Confirm backend is running +- Check firewall settings +- Confirm WebSocket URL is correct + +**Solution** +- **Check backend logs** +- **Confirm URL format**: `ws://localhost:8001/api/v1/...` + +--- + +## Where are module outputs stored? 
+ +| Module | Output Path | +|:---:|:---| +| Solve | `data/user/solve/solve_YYYYMMDD_HHMMSS/` | +| Question | `data/user/question/question_YYYYMMDD_HHMMSS/` | +| Research | `data/user/research/reports/` | +| Interactive IdeaGen | `data/user/co-writer/` | +| Notebook | `data/user/notebook/` | +| Guide | `data/user/guide/session_{session_id}.json` | +| Logs | `data/user/logs/` | + +--- + +## How to add a new knowledge base? + +**Web Interface** +1. Visit `http://localhost:3782/knowledge` +2. Click "New Knowledge Base" +3. Enter knowledge base name +4. Upload PDF/TXT/MD documents +5. System will process documents in background + +**CLI** +```bash +python -m src.knowledge.start_kb init --docs +``` + +--- + +## How to incrementally add documents to existing KB? + +**CLI (Recommended)** +```bash +python -m src.knowledge.add_documents --docs +``` + +**Benefits** +- Only processes new documents, saves time and API costs +- Automatically merges with existing knowledge graph +- Preserves all existing data + +--- + +## Numbered items extraction failed with uvloop.Loop error? + +**Problem** + +When initializing a knowledge base, you may encounter this error: +```text +ValueError: Can't patch loop of type +``` + +This occurs because Uvicorn uses `uvloop` event loop by default, which is incompatible with `nest_asyncio`. + +**Solution** + +Use one of the following methods to extract numbered items: + +```bash +# Option 1: Using the shell script (recommended) +./scripts/extract_numbered_items.sh + +# Option 2: Direct Python command +python src/knowledge/extract_numbered_items.py --kb --base-dir ./data/knowledge_bases +``` + +This will extract numbered items (Definitions, Theorems, Equations, etc.) from your knowledge base without reinitializing it. 
diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 000000000..3e892b379 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,77 @@ +--- +layout: home + +hero: + name: "DeepTutor" + text: "Your AI Learning Companion" + tagline: Transform any document into an interactive learning experience with multi-agent intelligence + image: + src: /logo.png + alt: DeepTutor + actions: + - theme: brand + text: Get Started → + link: /guide/pre-config + - theme: alt + text: GitHub + link: https://github.com/HKUDS/DeepTutor + +features: + - icon: 📚 + title: Massive Document Q&A + details: Upload textbooks, papers, and manuals. Build AI-powered knowledge repositories with RAG and knowledge graph integration. + - icon: 🧠 + title: Smart Problem Solving + details: Dual-loop reasoning architecture with multi-agent collaboration, delivering step-by-step solutions with precise citations. + - icon: 🎯 + title: Practice Generator + details: Generate custom quizzes based on your knowledge base, or mimic real exam styles for authentic practice. + - icon: 🎓 + title: Guided Learning + details: Personalized learning paths with interactive visualizations and adaptive explanations. + - icon: 🔬 + title: Deep Research + details: Systematic topic exploration with web search, paper retrieval, and literature synthesis. + - icon: 💡 + title: Idea Generation + details: AI-assisted brainstorming with knowledge extraction and multi-stage filtering. +--- + +## Why DeepTutor? 
+ +- **Deep Understanding** — Not just answers, but guided learning journeys with visual explanations +- **Multi-Modal Support** — PDF, LaTeX, images, code execution, and more +- **Knowledge Graph** — Semantic connections powered by LightRAG for better comprehension +- **All-in-One Platform** — Problem solving, question generation, research, and idea generation in one place + + diff --git a/docs/roadmap.md b/docs/roadmap.md new file mode 100644 index 000000000..c0e530452 --- /dev/null +++ b/docs/roadmap.md @@ -0,0 +1,41 @@ +# Roadmap + +Our vision for DeepTutor's future development. + +## ✅ Recently Completed (v0.4.0) + +- [x] **RAG Module Decoupling** — Modular RAG architecture with provider-agnostic interface + - Currently supports RAG-Anything (MinerU + LightRAG) + - More backends coming soon +- [x] **Multi-Provider Support** — Expanded LLM and Embedding provider options + - LLM: OpenAI, Anthropic, Azure, Ollama, Groq, OpenRouter, DeepSeek, Gemini + - Embedding: OpenAI, Jina, Cohere, Ollama, LM Studio, HuggingFace +- [x] **Dark Mode** — System-wide dark/light theme support +- [x] **Environment Configuration** — Unified `.env` based configuration + +## 🚀 Planned Features + +- [ ] **Deepcoding from Idea Generation** — Transform research ideas into working prototypes +- [ ] **Personalized Memory** — Adapt tutoring style based on user learning history +- [ ] **Additional RAG Backends** — LlamaIndex, ChromaDB, Pinecone integration +- [ ] **More Embedding Adapters** — Voyage AI, Mixedbread, local transformers + +## 💭 Under Consideration + +- Multi-language support +- Mobile-friendly interface +- Collaborative learning features +- Voice interaction support +- Faster frontend framework + +## 🤝 Community Requests + +Have a feature idea? We'd love to hear it! 
+ +- Open a [Feature Request](https://github.com/HKUDS/DeepTutor/issues/new?template=feature_request.yml) +- Join the discussion on existing proposals +- Check our [GitHub Discussions](https://github.com/HKUDS/DeepTutor/discussions) + +--- + +⭐ **Star the repo** to follow our future updates! diff --git a/docs/testdoc/structure-note-prd.md b/docs/testdoc/structure-note-prd.md new file mode 100644 index 000000000..6557dd096 --- /dev/null +++ b/docs/testdoc/structure-note-prd.md @@ -0,0 +1,278 @@ +# Structure Note 产品需求文档(PRD) + +## Title + +Structure Note: 基于 PageIndex 的课件/教材结构化讲义工作区 + +## Summary + +新增一个独立的 `Structure Note` 工作区。用户上传 `PDF` 或 `PPT/PPTX` 后,系统先统一归一化为 PDF,再基于新增的 `PageIndex` 构建页级结构树,按章节与页范围分段生成详细讲义,并通过图片占位、定位、切图和回填补全图文内容,最终输出一份带引用来源的可读 PDF。用户可在生成前选择三档讲解难度:简单、中等(默认)、复杂。 + +## Existing Requirements And Current State + +DeepTutor 当前已具备相邻能力,但缺少本功能所需的页级结构层: + +- 知识库主干已统一到 `llamaindex`,`lightrag` 只是兼容别名 +- 当前文档导入以整文抽取和向量检索为主,没有稳定的页级索引或章节树 +- `Guided Learning` 已有独立工作区、session、分页状态和后台生成机制,可作为工作区组织方式参考 +- `Notebook` 适合保存文本记录,不适合作为 PDF 最终产物主模型 +- 上传校验允许 `ppt/pptx`,但 RAG 文件路由当前并不真正支持它们进入主流程 + +## Problem Statement + +DeepTutor 目前没有一个面向课件和教材的“逐页、详细、可回溯”的结构化讲义产物。 + +这带来两个明显缺口: + +1. 学生无法获得接近逐字稿的图文讲义,用于跟课、补漏和课后复习 +2. 
教师无法将现有 PPT 快速转换为可直接讲授的 Script,仍需自行整理讲稿 + +现有知识库能力偏向检索,不足以支撑页级结构、章节树、图片回填和最终 PDF 产出。 + +## Repo Context + +- 该功能应是独立工作区,而不是 `Knowledge Hub` 的附属按钮,也不是 `Guided Learning` 的变种 +- 该功能不应建立在 LightRAG 上,而应新增 `PageIndex` 结构层 +- 当前 repo 没有现成 `PageIndex` 实现,需要新增核心服务 +- 该功能会跨后端 router、任务流、路径管理、前端工作区和 PDF 导出,更适合进入 core,并以实验性工作区首发 + +## Target Users + +- 学生:跟课、补漏、课后复习 +- 教师:PPT 转 Script,减轻备课负担 +- 研发与测试:验证引用、图片回填和恢复流程 + +## Goals + +- 提供独立的 `Structure Note` 工作区 +- 支持 `PDF + PPT/PPTX` 上传 +- 将 PPT/PPTX 先归一化为 PDF +- 基于 `PageIndex` 生成页级结构树,而非整文向量块 +- 采用章节树加分段生成策略,降低 lost-in-the-middle 风险 +- 支持简单 / 中等 / 复杂三档讲解难度,其中中等为默认 +- 输出最终可读 PDF,并附带引用来源 +- 在后端保留中间状态、图片回填和续跑能力,用于测试和恢复 + +## Non-Goals + +- 不替换现有 `Knowledge Hub` 主流程 +- 不把最终产物首发建模为 Notebook 主记录类型 +- 不在首发覆盖 DOCX、图片 OCR、音频转录等更多素材 +- 不向前端暴露占位符、象限定位、切图调试细节 +- 不要求 CLI / SDK 首发同步支持 + +## Proposed Solution + +新增 `Structure Note` 工作区,采用独立 router、manager、artifact 存储与前端页面。 + +### 主流程 + +1. 用户上传 `PDF` 或 `PPT/PPTX` +2. 若为 PPT/PPTX,先通过转换适配器归一化为 PDF +3. 对 PDF 执行 `PageIndex`,输出逐页文本、页码、标题候选、图像候选区域 +4. 构建多级章节树,优先覆盖二级到五级结构 +5. 以章节树为主线,按约 10 页窗口分段生成讲义;`复杂讲解` 可自动缩小为 5-8 页窗口 +6. 首轮文本生成时插入图片占位符,并记录对应页码范围 +7. 图像流水线识别占位符,执行“页定位 -> 象限定位 -> 切图 -> 回填” +8. 将最终内容渲染为 PDF,并生成 `citation_manifest.json` +9. 
前端展示最终 PDF、下载入口和引用来源列表 + +### Difficulty Model + +#### simple + +- 定位:科普型、入门型 +- 目标:讲清关键词、定义、核心知识和结论 +- 风格:少推理、少展开、少旁支 +- 篇幅:最短 + +#### medium + +- 定位:默认档,接近正常课堂讲解密度 +- 目标:概念、重点、基础逻辑链讲清楚 +- 风格:细致但不过度展开 +- 篇幅:中等 + +#### detailed + +- 定位:最完整档 +- 目标:尽量展开所有内容,包括推理、过程、细节和隐含连接 +- 风格:最详细 +- 篇幅:最长 +- 特殊策略:自动缩小页窗口,以换取生成稳定性 + +## Scope In + +- 独立工作区 +- PDF 与 PPT/PPTX 上传 +- PPT/PPTX -> PDF 归一化 +- `PageIndex` 服务层 +- 章节树生成 +- 按页范围分段生成 +- 三档难度控制 +- 图片占位、定位、切图、回填 +- 最终 PDF 导出 +- 引用来源展示 +- 后端中间状态持久化与续跑 + +## Scope Out + +- 与知识库检索结果的双向联动 +- Notebook 一键保存 PDF +- 用户手动编辑章节树 +- 多文档自动合并成一本总讲义 +- CLI / SDK 首发接口 +- 高级版式编辑器 + +## UX Or Interaction Notes + +- 工作区形态采用“上传 -> 配置 -> 处理中 -> 查看结果” +- 上传页提供: + - 文件选择 + - 难度选择器:简单 / 中等(默认) / 复杂 +- 结果页仅展示: + - 最终 PDF 预览或下载 + - 本次难度档位 + - 引用来源列表 + - 失败后的重试入口 +- 不向用户展示内部中间态和 agent 细节 + +## Technical Considerations + +- `PageIndex` 是新增结构服务,不是新的 RAG provider +- 用户可见主产物是 PDF;后端内部仍保留中间 JSON / Markdown / render state +- 内部 artifact 至少包含: + - `source_file` + - `normalized_pdf_path` + - `difficulty_level` + - `page_index` + - `section_tree` + - `generation_chunks` + - `image_fill_state` + - `final_pdf_path` + - `citation_manifest` +- 引用来源至少应包含: + - 章节路径 + - 页码范围 + - 原始 PDF 页 + - 图像来源页 +- `detailed` 模式应允许更长耗时和更小页窗口,以换取稳定性 + +## Impacted Areas Of The Repo + +- 新增 backend router:`deeptutor/api/routers/structure_note.py` +- 新增服务目录:`deeptutor/services/structure_note/` +- 扩展路径管理:`deeptutor/services/path_service.py` +- 复用任务流与日志广播模式:`deeptutor/api/routers/knowledge.py` +- 新增前端页面:`web/app/(workspace)/structure-note/page.tsx` +- 更新工作区导航与文档 + +## Acceptance Criteria + +- 用户可在独立工作区上传 PDF 并生成最终 PDF 讲义 +- 用户可上传 PPT/PPTX,系统会先转换为 PDF 再进入同一流程 +- 系统基于页级结构和章节树分段生成,而不是整文一次性生成 +- 用户可选择三档难度;未选择时默认为中等 +- 三档难度的结果在覆盖密度和篇幅上有明显差异 +- 最终结果可回溯到页码范围,并在前端展示引用来源 +- 图片可通过占位符 -> 定位 -> 切图 -> 回填进入最终结果 +- 生成中断后可在后端基于中间状态续跑 +- 不影响现有 `Knowledge Hub`、`Notebook`、`Guided Learning` + +## Success Metrics + +- 任务成功率 +- 平均生成时长 +- 页码引用正确率 +- 图片回填成功率 +- 三档难度的用户使用分布 +- 学生复习场景下的二次打开率 +- 教师上传后导出率 + +## 
Rollout And Compatibility + +- 以独立工作区、实验性功能首发 +- 完全 opt-in,不替换现有知识库主行为 +- 中间状态保留策略做成可配置项,测试环境默认开启,生产环境可裁剪 +- 若 PPT 转 PDF 或 `PageIndex` 失败,应给出明确错误并允许重试 + +## Risks And Mitigations + +### PageIndex 质量不稳定 + +- 风险:树生成失败或页级抽取噪声过大 +- 缓解:树失败时回退为按页段生成,保证主流程可用 + +### PPT 转 PDF 兼容性不足 + +- 风险:不同模板、字体或复杂动画导致转换异常 +- 缓解:转换器做成可替换 adapter;首发默认使用 LibreOffice + +### simple 过度压缩 + +- 风险:为追求短篇幅丢失关键上下文 +- 缓解:强制保留关键词、定义、结论和最小解释链 + +### detailed 成本和耗时过高 + +- 风险:长文档生成时间和成本显著上升 +- 缓解:缩小页窗口并启用缓存和续跑 + +### 图片定位不准 + +- 风险:四象限粗定位与真实图像区域偏差较大 +- 缓解:定位失败时允许整页截图回退 + +## Maintainer Fit + +该功能适合进入 core,但建议以实验性工作区首发。它直接服务于 DeepTutor 的“材料 -> 学习产物”主线,需要复用上传、任务流、前端工作区和路径管理;若做成外置 plugin,会让产品入口、状态管理和文件处理都变得割裂。 + +## Alternatives Considered + +- 挂在 `Knowledge Hub` 下:不选,因为它不是普通 KB 初始化副产物 +- 复用 `Guided Learning`:不选,因为其主产物是交互页面,不是最终 PDF 讲义 +- 只保存最终 PDF,不保留中间状态:不选,因为测试、恢复和图片回填都会变差 +- 基于 LightRAG 扩展:不选,因为当前主干不走这条路径,且需求核心是页级结构 + +## Docs And Test Impact + +- README 增加 `Structure Note` 工作区说明 +- docs 增加支持格式、难度档位、生成流程、引用来源说明 +- 后端测试覆盖: + - PPT/PPTX 归一化 + - `PageIndex` + - 章节树生成 + - 难度分层 + - 引用页码 + - 图片回填 + - 续跑恢复 +- 前端测试覆盖: + - 上传与难度选择 + - 处理中状态 + - 最终 PDF 展示 + - 引用来源展示 + +## Open Questions + +- 最终 PDF 是否需要内嵌引用附录,还是只在前端展示完整 citation +- 生产环境中间状态保留多久 +- 是否允许下载 `PDF + citation manifest` 打包结果 + +## Assumptions + +- 首发是 web-first +- 最终用户产物是 PDF +- 后端保留中间状态仅用于测试、恢复和内部验证 +- `medium` 为默认档位 +- `detailed` 可接受更长生成时延和更高成本 + +## Decision Log + +- 入口:独立工作区 +- 素材:PDF + PPT/PPTX +- 归一化:PPT/PPTX 先转 PDF +- 结构底座:新增 `PageIndex` +- 用户可见产物:最终 PDF +- 前端:只展示结果与引用 +- 后端:保留中间状态、图片回填和续跑 +- 难度:简单 / 中等(默认) / 复杂 diff --git a/docs/testdoc/structure-note-technical-plan.md b/docs/testdoc/structure-note-technical-plan.md new file mode 100644 index 000000000..be0af868d --- /dev/null +++ b/docs/testdoc/structure-note-technical-plan.md @@ -0,0 +1,539 @@ +# Structure Note 详细技术计划 + +## 1. 
背景与决策 + +### 1.1 目标 + +在 DeepTutor 中新增一个独立的 `Structure Note` 工作区,将用户上传的课件或教材转成可阅读、可回溯的结构化 PDF 讲义。 + +### 1.2 已锁定决策 + +- 工作区独立存在,不挂在 `Knowledge Hub` 或 `Guided Learning` 下 +- 首发真实支持 `PDF + PPT/PPTX` +- `PPT/PPTX -> PDF` 由服务端通过 `headless LibreOffice` 实现 +- 结构底座为新增的 `PageIndex`,不接入现有 `llamaindex` provider +- 最终用户产物是 PDF +- 引用只在前端结果页侧栏展示,不强行内嵌到 PDF 中 +- 中间状态保留采用环境可配策略 +- 难度固定为 `simple / medium / detailed`,默认 `medium` + +### 1.3 与现有能力的关系 + +- 复用 `knowledge` 路由中的任务流、SSE 日志和后台任务模式 +- 参考 `guide` 工作区的 session / manager 组织方式 +- 不复用 Notebook 作为主产物容器 +- 不影响现有 RAG 搜索、聊天和 Guided Learning + +## 2. 端到端数据流 + +### 2.1 主流程 + +1. 前端上传文件并提交难度参数 +2. 后端创建 `job_id`,生成 artifact 目录结构 +3. 后端执行素材归一化: + - PDF:直接进入下一阶段 + - PPT/PPTX:使用 `soffice --headless --convert-to pdf` 转为 PDF +4. 后端执行 `PageIndex`: + - 逐页抽文本 + - 渲染页图缩略信息 + - 识别标题候选 + - 记录图像候选区域 +5. 后端构建章节树: + - 规则层抽标题候选 + - LLM 将候选标准化为 2-5 级结构 + - 输出节点与页码范围映射 +6. 后端分段生成正文: + - 按章节树与页范围切块 + - 每块调用 LLM 生成 Markdown 讲义正文 + - 同步输出页码范围与图片占位符 +7. 后端执行图片流水线: + - 识别占位符 + - 生成页号映射 + - 通过定位 Agent 选择象限 + - 通过切图执行器生成图片资源 + - 回填 Markdown / render model +8. 后端渲染最终 PDF 与 `citation_manifest.json` +9. 前端结果页读取任务详情、PDF 地址与 citation 清单 + +### 2.2 阶段与状态 + +统一任务状态: + +- `queued` +- `normalizing` +- `indexing` +- `planning` +- `generating` +- `processing_images` +- `rendering` +- `ready` +- `failed` + +### 2.3 失败与续跑原则 + +- 若素材归一化失败,任务直接失败,不进入后续阶段 +- 若 `PageIndex` 失败,任务失败;后续重试从 `normalize` 后的 PDF 继续 +- 若章节树生成失败,可回退到按页段生成 +- 若图片定位或切图失败,不阻塞整份文档,可对该占位符降级为整页截图或文本标注 +- `retry` 优先复用已存在中间态,而不是重新上传文件 + +## 3. 
后端模块拆分 + +### 3.1 新增目录 + +建议新增: + +- `deeptutor/api/routers/structure_note.py` +- `deeptutor/services/structure_note/` + +### 3.2 服务子模块 + +建议按以下模块拆分: + +#### `models.py` + +定义内部类型: + +- `DifficultyLevel` +- `JobStatus` +- `StructureNoteArtifact` +- `PageIndexPage` +- `SectionTreeNode` +- `GenerationChunk` +- `CitationEntry` +- `ImagePlaceholder` + +#### `storage.py` + +负责: + +- 生成 artifact 目录 +- 读写 `artifact.json` +- 路径组装 +- 环境化保留策略清理 + +#### `normalizer.py` + +负责: + +- 判断输入格式 +- 调用 LibreOffice 完成 PPT/PPTX -> PDF 转换 +- 输出标准 PDF 路径 +- 提供依赖缺失时的明确错误消息 + +#### `page_index.py` + +负责: + +- 使用 PyMuPDF 逐页抽文本 +- 记录页码、页尺寸、文本块信息 +- 渲染页面基础图像信息 +- 提取标题候选与图像候选区域 + +#### `tree_builder.py` + +负责: + +- 规则层标题候选提取 +- 调用 LLM 将候选标准化为 2-5 级章节树 +- 建立 `section -> page range` 映射 +- 失败时回退到按页段生成 + +#### `difficulty.py` + +负责三档难度 preset: + +- 输出长度预算 +- 输出风格约束 +- 页窗口大小 +- 术语解释深度 +- 推理展开深度 + +#### `generator.py` + +负责: + +- 根据树结构切分 generation chunks +- 生成 Markdown 正文 +- 注入页码范围标签 +- 生成图片占位符 + +#### `image_pipeline.py` + +负责: + +- 占位符扫描 +- 页号映射 +- 调用定位 Agent 得到页与象限 +- 将象限转换为 PyMuPDF crop box +- 切图并写入 `images/` +- 回填到 Markdown / render model + +#### `renderer.py` + +负责: + +- Markdown -> HTML +- HTML -> PDF(WeasyPrint) +- citation manifest 输出 + +#### `manager.py` + +负责: + +- 任务编排 +- 状态流转 +- 后台续跑 +- 对 router 提供统一接口 + +### 3.3 PathService 扩展 + +在 `deeptutor/services/path_service.py` 中增加 `structure_note` 工作区路径支持,最终目录落到: + +`data/user/workspace/structure_note//` + +固定目录结构: + +- `source/` +- `normalized/` +- `index/` +- `chunks/` +- `images/` +- `final/` +- `artifact.json` + +## 4. 
API 与类型 + +### 4.1 对外接口 + +#### `POST /api/v1/structure-note/jobs` + +用途:创建任务 +请求:`multipart/form-data` + +- `file`: 上传文件 +- `difficulty_level`: `simple | medium | detailed` + +行为: + +- 验证格式 +- 创建 `job_id` +- 写入源文件 +- 启动后台任务 +- 返回任务基础信息与 task stream 标识 + +#### `GET /api/v1/structure-note/jobs` + +用途:获取工作区历史列表 +返回最少字段: + +- `job_id` +- `file_name` +- `difficulty_level` +- `status` +- `created_at` +- `updated_at` + +#### `GET /api/v1/structure-note/jobs/{job_id}` + +用途:获取任务详情 +返回最少字段: + +- `job_id` +- `status` +- `source_format` +- `difficulty_level` +- `final_pdf_path` +- `citation_manifest_summary` +- `retry_available` + +#### `POST /api/v1/structure-note/jobs/{job_id}/retry` + +用途:失败任务续跑 +行为: + +- 读取 `artifact.json` +- 检查上次成功阶段 +- 从最近可复用阶段继续执行 + +#### `GET /api/v1/structure-note/tasks/{task_id}/stream` + +用途:SSE 任务流 +复用 `knowledge` 的日志和状态推送模式 + +### 4.2 内部 artifact 结构 + +`artifact.json` 至少包含: + +- `job_id` +- `source_format` +- `difficulty_level` +- `source_path` +- `normalized_pdf_path` +- `page_index_path` +- `section_tree_path` +- `generation_chunks_path` +- `citation_manifest_path` +- `final_pdf_path` +- `status` +- `retry_state` +- `created_at` +- `updated_at` + +### 4.3 Citation 类型 + +每条 citation 至少包含: + +- `citation_id` +- `section_path` +- `page_start` +- `page_end` +- `source_file` +- `source_kind` +- `image_page` +- `image_region` + +其中: + +- `source_kind` 仅允许 `text` 或 `image` +- `image_page` / `image_region` 仅在图像引用时填写 + +## 5. 前端工作区设计 + +### 5.1 页面结构 + +新页面建议为: + +`web/app/(workspace)/structure-note/page.tsx` + +### 5.2 三个核心面板 + +#### 上传与配置 + +- 文件选择 +- 难度切换: + - 简单 + - 中等(默认) + - 复杂 +- `detailed` 旁边增加一条轻量提示:生成时间更长 + +#### 处理中 + +- 阶段文本 +- 进度条 +- 错误提示 +- 重试按钮 + +#### 结果页 + +- PDF 预览 +- 下载按钮 +- citation 侧栏 +- 历史任务入口 + +### 5.3 非目标展示 + +前端明确不展示: + +- 章节树调试信息 +- 图片占位符 +- 四象限判断 +- crop box +- 中间 Markdown +- 中间 JSON + +### 5.4 历史列表字段 + +至少显示: + +- 文件名 +- 难度 +- 状态 +- 创建时间 +- 重新打开结果 + +## 6. 
生成与渲染策略 + +### 6.1 难度预设 + +#### simple + +- 目标:关键词、定义、核心知识、结论 +- 风格:科普型 +- 输出:最短 +- 推理:尽量压缩 +- 页窗口:默认 10 页 + +#### medium + +- 目标:正常课堂讲解 +- 风格:重点解释 + 基础逻辑链 +- 输出:中等 +- 推理:保留基础过程 +- 页窗口:默认 10 页 + +#### detailed + +- 目标:覆盖细节、推理、过程与隐含逻辑 +- 风格:最详细 +- 输出:最长 +- 推理:尽量完整 +- 页窗口:自动缩到 5-8 页 + +### 6.2 章节树生成规则 + +- 优先依据字体大小、位置、编号样式和文本模式抽取标题候选 +- 交给 LLM 做结构归一化,但输出必须约束为 2-5 级节点 +- 若 LLM 输出不可用,回退到按页段分组,而不是阻塞整个流程 + +### 6.3 图片流水线 + +#### 第一步:占位符生成 + +正文生成时输出形如: + +`[[IMAGE_PLACEHOLDER:section_id:page_hint:purpose]]` + +#### 第二步:定位 Agent + +Agent 输出固定格式: + +- 第几页 +- 象限:`left_top | right_top | left_bottom | right_bottom` + +#### 第三步:切图执行 + +根据页面宽高将页面切成四象限: + +- 左上 +- 右上 +- 左下 +- 右下 + +切图执行器只做确定性 crop,不自行做语义判断。 + +#### 第四步:回填 + +回填模块将图片资源路径写回 Markdown 或 render model,再进入最终 PDF 渲染。 + +### 6.4 PDF 渲染 + +- 中间产物使用 Markdown 表达 +- 渲染时先转 HTML,再交给 WeasyPrint 输出 PDF +- 引用不嵌入正文,只保留干净版 PDF +- citation manifest 单独生成 JSON,供前端侧栏展示 + +## 7. 存储与中间态策略 + +### 7.1 存储内容 + +在完整保留模式下,应保存: + +- 原始上传文件 +- 归一化 PDF +- 页级索引 JSON +- 章节树 JSON +- generation chunks JSON +- 图片资源 +- 回填后 Markdown +- citation manifest +- 最终 PDF +- `artifact.json` + +### 7.2 环境化保留策略 + +建议新增环境配置项,例如: + +- `STRUCTURE_NOTE_RETENTION_MODE=full|minimal` + +规则: + +- 测试环境默认 `full` +- 生产环境默认 `minimal` + +`minimal` 至少保留: + +- `artifact.json` +- `final.pdf` +- `citation_manifest.json` + +### 7.3 续跑策略 + +`retry_state` 记录最近成功阶段。续跑时遵循: + +- 已完成 `normalize`:不重复转换 +- 已完成 `page_index`:不重复抽页 +- 已完成 `tree_build`:不重复建树 +- 仅后续阶段失败:从失败阶段继续 + +## 8. 
测试矩阵与实施里程碑 + +### 8.1 单元测试 + +- `PageIndex` 逐页文本提取 +- 空页处理 +- 页码顺序稳定 +- 标题候选抽取规则 +- 树标准化结果满足 2-5 级结构约束 +- 难度 preset 对窗口大小和长度预算的影响 +- 象限到 crop box 的换算 +- 保留策略清理逻辑 + +### 8.2 集成测试 + +- PDF 上传全链路成功,生成 PDF 与 citation manifest +- PPT/PPTX 上传真实走 LibreOffice 转 PDF,再进入后续链路 +- `simple / medium / detailed` 三档输出长度和内容密度有明显差异 +- 图片占位符能被回填 +- 图片失败时能降级或重试 +- 任务中断后 `retry` 会复用中间态,而不是从头重跑 + +### 8.3 前端测试 + +- 上传页能提交文件和难度 +- 处理中状态能接收 SSE 进度 +- 结果页能加载 PDF 和 citation 侧栏 +- 失败任务能触发重试 + +### 8.4 验收样本 + +- 学生教材 PDF:中等模式,结果可读、页码回溯清楚 +- 教师 PPT:复杂模式,细节展开明显更多 +- 简单模式:明显短于中等与复杂,不丢失关键词和核心知识 + +### 8.5 实施里程碑 + +#### M1:文档与骨架 + +- 在 `docs/testdoc/` 落 `PRD + 技术计划` +- 建立路由、artifact 模型、目录结构、SSE 任务流 + +#### M2:素材归一化与 PageIndex + +- 接通 PDF 上传 +- 接通 PPT/PPTX -> PDF 转换 +- 产出页级索引与章节树 + +#### M3:内容生成与三档难度 + +- 接通 chunk 生成 +- 接通难度 preset +- 产出 citation manifest + +#### M4:图片流水线与 PDF 渲染 + +- 接通占位符识别、象限定位、切图回填 +- 接通 Markdown -> HTML -> PDF + +#### M5:前端结果工作区与完整测试 + +- 上传 / 进度 / 结果 / 重试闭环 +- 完成集成测试与验收样本 + +## 9. 实施默认值 + +- 文档落盘格式使用中文 Markdown,不额外导出 PDF 版 PRD / 技术计划 +- `docs/testdoc` 仅作归档,不在文档 sidebar 中额外挂载 +- LibreOffice 是首发必需依赖;环境缺失时,PPT/PPTX 上传失败并返回安装指引 +- 最终 PDF 不内嵌完整引用;完整引用只在前端侧栏和 `citation_manifest.json` 中展示 +- 中间态保留走环境配置:测试环境保留完整中间态,生产环境默认保留 `artifact.json + final.pdf + citation_manifest.json` diff --git a/docs/zh/features/overview.md b/docs/zh/features/overview.md new file mode 100644 index 000000000..dc9d85265 --- /dev/null +++ b/docs/zh/features/overview.md @@ -0,0 +1,57 @@ +# 功能介绍 + +DeepTutor 提供六个专业模块,助力 AI 驱动的学习体验。 + +
+ +## 核心模块 + +| 模块 | 描述 | 文档 | +|:-------|:------------|:--------------| +| 🧠 **智能解题** | 多 Agent 协作解题,双循环推理架构 | [详情 →](https://github.com/HKUDS/DeepTutor#-smart-solver) | +| 🎯 **题目生成** | 自定义测验生成,模拟真实考试风格 | [详情 →](https://github.com/HKUDS/DeepTutor#-question-generator) | +| 🎓 **引导学习** | 个性化学习路径,交互式可视化 | [详情 →](https://github.com/HKUDS/DeepTutor#-guided-learning) | +| 🔬 **深度研究** | 系统化主题探索,动态主题队列 | [详情 →](https://github.com/HKUDS/DeepTutor#-deep-research) | +| 💡 **想法生成** | 自动化研究想法生成,多阶段过滤 | [详情 →](https://github.com/HKUDS/DeepTutor#-automated-ideagen) | +| ✏️ **协作写作** | AI 辅助写作,TTS 语音朗读 | [详情 →](https://github.com/HKUDS/DeepTutor#-interactive-ideagen-co-writer) | + +## 系统架构 + +DeepTutor 架构 + +### 系统层次 + +- **用户界面** — 双向查询响应,结构化输出 +- **Agent 模块** — 各学习任务的专业 AI Agent +- **工具集成** — RAG、网络搜索、论文检索、代码执行 +- **知识基础** — 知识图谱 + 向量存储 + 记忆系统 + +
+ +## 支持的服务商 + +DeepTutor 支持多种 LLM 和 Embedding 服务商: + +::: details LLM 服务商 +- OpenAI (GPT-4o, GPT-4) +- Anthropic (Claude 3.5) +- Azure OpenAI +- DeepSeek +- Groq +- OpenRouter +- Ollama(本地部署) +- Google Gemini +::: + +::: details Embedding 服务商 +- OpenAI +- Jina AI +- Cohere +- Ollama +- LM Studio +- HuggingFace +::: + +--- + +📖 **完整文档**: [GitHub README](https://github.com/HKUDS/DeepTutor) diff --git a/docs/zh/guide/data-preparation.md b/docs/zh/guide/data-preparation.md new file mode 100644 index 000000000..e148c40b1 --- /dev/null +++ b/docs/zh/guide/data-preparation.md @@ -0,0 +1,185 @@ +# 数据准备 + +DeepTutor 提供示例知识库和样例问题,帮助您快速上手。 + +## 示例知识库 + +我们在 [Google Drive](https://drive.google.com/drive/folders/1iWwfZXiTuQKQqUYb5fGDZjLCeTUP6DA6?usp=sharing) 上提供两个预构建的知识库: + +### 1. 研究论文合集 + +
+📄 **5 篇研究论文(每篇 20-50 页)**
+
+来自我们实验室的前沿研究论文精选集,涵盖 RAG 和 Agent 领域。
+
+包含论文:
+
+**适用场景:** 研究场景,广泛知识覆盖
+
+ +### 2. 数据科学教材 + +
+📚 **8 章,296 页**
+
+来自加州大学伯克利分校的综合深度学习教材。
+
+**来源:** Deep Representation Learning Book
+
+涵盖主题:
+
+- 神经网络基础
+- 表示学习
+- 深度学习架构
+- 高级主题
+
+**适用场景:** 学习场景,深度知识挖掘
+
+ +## 下载与设置 + +### 步骤 1:下载 + +访问我们的 [Google Drive 文件夹](https://drive.google.com/drive/folders/1iWwfZXiTuQKQqUYb5fGDZjLCeTUP6DA6?usp=sharing) 并下载: + +- `knowledge_bases.zip` - 包含嵌入的预构建知识库 +- `questions.zip` - 样例问题和使用示例(可选) + +### 步骤 2:解压 + +将下载的文件解压到 `data/` 目录: + +``` +DeepTutor/ +├── data/ +│ └── knowledge_bases/ +│ ├── research_papers/ # 研究论文知识库 +│ ├── data_science_book/ # 教材知识库 +│ └── kb_config.json # 知识库配置 +└── user/ # 用户数据(自动创建) +``` + +### 步骤 3:验证 + +解压后,启动 DeepTutor 时您的知识库将自动可用。 + +::: warning 嵌入兼容性 +我们的示例知识库使用 `text-embedding-3-large`,`dimensions = 3072`。 + +如果您的嵌入模型具有不同的维度,您需要创建自己的知识库。 +::: + +## 创建自定义知识库 + +### 支持的文件格式 + +| 格式 | 扩展名 | 说明 | +|:-------|:----------|:------| +| PDF | `.pdf` | 支持文本提取和版面分析 | +| 文本 | `.txt` | 纯文本文件 | +| Markdown | `.md` | 支持格式化的 Markdown | + +### 通过 Web 界面 + +1. 导航到 `http://localhost:3782/knowledge` +2. 点击 **"New Knowledge Base"** +3. 为您的知识库输入唯一名称 +4. 上传您的文档(单个或批量上传) +5. 等待处理完成 + +::: tip 处理时间 +- 小文档(< 10 页):约 1 分钟 +- 中等文档(10-100 页):约 5-10 分钟 +- 大文档(100+ 页):可能需要更长时间 +::: + +### 通过命令行 + +```bash +# 使用文档初始化新知识库 +python -m src.knowledge.start_kb init --docs + +# 向现有知识库添加文档 +python -m src.knowledge.add_documents --docs +``` + +## 数据存储结构 + +所有用户数据存储在 `data/` 目录中: + +``` +data/ +├── knowledge_bases/ # 知识库存储 +│ ├── / +│ │ ├── documents/ # 原始文档 +│ │ ├── chunks/ # 分块内容 +│ │ ├── embeddings/ # 向量嵌入 +│ │ └── graph/ # 知识图谱数据 +└── user/ # 用户活动数据 + ├── solve/ # 解题结果 + ├── question/ # 生成的题目 + ├── research/ # 研究报告 + ├── notebook/ # 笔记本记录 + └── logs/ # 系统日志 +``` + +--- + +**下一步:** [本地安装 →](/zh/guide/local-start) + + diff --git a/docs/zh/guide/local-conda-cursor.md b/docs/zh/guide/local-conda-cursor.md new file mode 100644 index 000000000..4ff9e5a2f --- /dev/null +++ b/docs/zh/guide/local-conda-cursor.md @@ -0,0 +1,56 @@ +# 本地 Conda 环境(可选) + +仅当你在本机使用 **Cursor/VSCode** 且希望用**专用 conda 环境**时参考,无需所有人统一这样做。 + +## 1. 
让终端识别 conda + +若在 Cursor 终端里出现 `command not found: conda`,多半是集成终端未加载 conda。任选其一: + +**方式 A:当前终端临时启用** + +```bash +source scripts/activate_conda.sh +``` + +**方式 B:长期生效** + +在 `~/.zshrc` 中保留 conda 初始化块(安装 Miniconda/Anaconda 时通常已添加),然后新开终端即可。 + +## 2. 创建项目专用环境 + +```bash +# 先让 conda 可用(若尚未可用) +source scripts/activate_conda.sh + +# 一键创建环境并安装依赖(Python 3.12 + Node 20 + 前端) +bash scripts/setup_conda_env.sh +``` + +环境名为 `deeptutor`。 + +## 3. 在 Cursor 里使用该环境 + +- 已通过 **`.vscode/settings.json`** 指定解释器为: + `~/miniconda3/envs/deeptutor/bin/python` +- 若你用的是 **Anaconda**,请把该文件中的 `miniconda3` 改为 `anaconda3`。 +- 打开 Python 文件时,Cursor 会使用上述解释器;终端里可执行: + + ```bash + conda activate deeptutor + python scripts/start_web.py + ``` + +或直接: + +```bash +bash scripts/run_with_conda.sh +``` + +## 4. 小结 + +| 目的 | 操作 | +|----------------|------| +| 终端里能用 conda | `source scripts/activate_conda.sh` 或配置好 `~/.zshrc` | +| 创建/重建环境 | `bash scripts/setup_conda_env.sh` | +| 用指定环境启动 | `bash scripts/run_with_conda.sh` 或 `conda activate deeptutor && python scripts/start_web.py` | +| 编辑器用该环境 | 已由 `.vscode/settings.json` 指定,无需额外操作 | diff --git a/docs/zh/guide/local-start.md b/docs/zh/guide/local-start.md new file mode 100644 index 000000000..64dee562d --- /dev/null +++ b/docs/zh/guide/local-start.md @@ -0,0 +1,190 @@ +# 本地安装 + +本指南介绍用于开发或非 Docker 环境的手动安装。 + +## 前提条件 + +- **Python 3.10+** — [下载](https://www.python.org/downloads/) +- **Node.js 18+** — [下载](https://nodejs.org/) +- **Git** — [下载](https://git-scm.com/) + +::: tip Windows 用户 +如果在安装过程中遇到路径长度错误,请启用长路径支持: + +```cmd +reg add "HKLM\SYSTEM\CurrentControlSet\Control\FileSystem" /v LongPathsEnabled /t REG_DWORD /d 1 /f +``` + +运行此命令后重启终端。 +::: + +## 步骤 1:设置虚拟环境 + +选择以下选项之一: + +::: code-group + +```bash [Conda(推荐)] +# 创建环境 +conda create -n deeptutor python=3.10 + +# 激活环境 +conda activate deeptutor +``` + +```bash [venv] +# 创建环境 +python -m venv venv + +# 激活 (Windows) +venv\Scripts\activate + +# 激活 (macOS/Linux) +source venv/bin/activate +``` + +::: + +## 步骤 
2:安装依赖 + +### 选项 A:自动安装(推荐) + +```bash +# 使用 Python 脚本 +python scripts/install_all.py + +# 或使用 shell 脚本 (macOS/Linux) +bash scripts/install_all.sh +``` + +### 选项 B:手动安装 + +```bash +# 安装 Python 依赖 +pip install -r requirements.txt + +# 安装 Node.js 依赖 +npm install --prefix web +``` + +::: warning 常见问题 +如果看到 `npm: command not found`: + +```bash +# 使用 Conda +conda install -c conda-forge nodejs + +# 或从 https://nodejs.org/ 安装 +``` +::: + +## 步骤 3:配置环境 + +确保您已完成[预配置](/zh/guide/pre-config)步骤: + +1. ✅ 创建了包含 API 密钥的 `.env` 文件 +2. ✅ (可选)自定义了 `config/agents.yaml` +3. ✅ (可选)下载了示例知识库 + +## 步骤 4:启动应用 + +### 启动 Web 界面(推荐) + +```bash +python scripts/start_web.py +``` + +这将同时启动 **前端**(Next.js)和 **后端**(FastAPI)服务器。 + +### 替代方案:仅 CLI 界面 + +```bash +python scripts/start.py +``` + +### 访问地址 + +| 服务 | URL | 说明 | +|:---:|:---|:---| +| **前端** | http://localhost:3782 | 主 Web 界面 | +| **API 文档** | http://localhost:8001/docs | 交互式 API 文档 | + +## 高级:分别启动服务 + +对于开发,您可能想要分别运行前端和后端: + +### 后端(FastAPI) + +```bash +python src/api/run_server.py + +# 或直接使用 uvicorn +uvicorn src.api.main:app --host 0.0.0.0 --port 8001 --reload +``` + +### 前端(Next.js) + +首先,创建 `web/.env.local`: + +```bash +NEXT_PUBLIC_API_BASE=http://localhost:8001 +``` + +然后启动开发服务器: + +```bash +cd web +npm install +npm run dev -- -p 3782 +``` + +## 停止服务 + +在终端中按 `Ctrl+C` 停止服务。 + +::: warning 端口仍在使用? +如果按 Ctrl+C 后看到"端口已在使用": + +**macOS/Linux:** +```bash +lsof -i :8001 +kill -9 <PID> +``` + +**Windows:** +```bash +netstat -ano | findstr :8001 +taskkill /PID <PID> /F +``` +::: + +## 故障排除 + +### 后端启动失败 + +**检查清单:** +- 确认 Python 版本 >= 3.10:`python --version` +- 确认所有依赖已安装:`pip install -r requirements.txt` +- 检查端口 8001 是否被占用 +- 验证 `.env` 文件配置 + +### 前端无法连接后端 + +**解决方案:** +1. 确认后端正在运行:访问 http://localhost:8001/docs +2. 检查浏览器控制台的错误信息 +3. 
创建 `web/.env.local`: + ```bash + NEXT_PUBLIC_API_BASE=http://localhost:8001 + ``` + +### WebSocket 连接失败 + +**检查清单:** +- 确认后端正在运行 +- 检查防火墙设置 +- 验证 WebSocket URL 格式:`ws://localhost:8001/api/v1/...` + +--- + +**下一步:** [Docker 部署 →](/zh/guide/docker-start) diff --git a/docs/zh/guide/pre-config.md b/docs/zh/guide/pre-config.md new file mode 100644 index 000000000..5708c2b8f --- /dev/null +++ b/docs/zh/guide/pre-config.md @@ -0,0 +1,201 @@ +# 预配置 + +在启动 DeepTutor 之前,您需要完成以下设置步骤。 + +## 1. 克隆仓库 + +```bash +git clone https://github.com/HKUDS/DeepTutor.git +cd DeepTutor +``` + +## 2. 环境变量配置 + +从模板创建 `.env` 文件: + +```bash +cp .env.example .env +``` + +然后编辑 `.env` 文件,填入您的 API 密钥: + +```bash +# ============================================================================ +# 服务器配置 +# ============================================================================ +BACKEND_PORT=8001 # 后端 API 端口 +FRONTEND_PORT=3782 # 前端 Web 端口 + +# 远程/局域网访问 - 设置为您服务器的 IP 地址 +# NEXT_PUBLIC_API_BASE=http://192.168.1.100:8001 + +# ============================================================================ +# LLM (大语言模型) 配置 - 必填 +# ============================================================================ +LLM_BINDING=openai # 提供商: openai, anthropic, azure_openai, ollama 等 +LLM_MODEL=gpt-4o # 模型名: gpt-4o, deepseek-chat, claude-3-5-sonnet 等 +LLM_HOST=https://api.openai.com/v1 # API 端点 URL +LLM_API_KEY=your_api_key # 您的 LLM API 密钥 + +# ============================================================================ +# 嵌入模型配置 - 知识库必填 +# ============================================================================ +EMBEDDING_BINDING=openai # 提供商类型 +EMBEDDING_MODEL=text-embedding-3-large # 嵌入模型名称 +EMBEDDING_DIMENSION=3072 # 必须与模型维度匹配 +EMBEDDING_HOST=https://api.openai.com/v1 # API 端点 +EMBEDDING_API_KEY=your_api_key # 嵌入 API 密钥 + +# ============================================================================ +# 网络搜索配置 - 可选 +# ============================================================================ 
+SEARCH_PROVIDER=perplexity # 选项: perplexity, tavily, serper, jina, exa, baidu +SEARCH_API_KEY=your_search_api_key # 搜索提供商的 API 密钥 +``` + +### 环境变量参考 + +| 变量 | 必填 | 说明 | +|:---|:---:|:---| +| `LLM_MODEL` | **是** | 模型名称 (如 `gpt-4o`, `deepseek-chat`) | +| `LLM_API_KEY` | **是** | 您的 LLM API 密钥 | +| `LLM_HOST` | **是** | API 端点 URL | +| `EMBEDDING_MODEL` | **是** | 嵌入模型名称 | +| `EMBEDDING_DIMENSION` | **是** | 必须与模型输出维度匹配 | +| `EMBEDDING_API_KEY` | **是** | 嵌入 API 密钥 | +| `EMBEDDING_HOST` | **是** | 嵌入 API 端点 | +| `BACKEND_PORT` | 否 | 后端端口 (默认: `8001`) | +| `FRONTEND_PORT` | 否 | 前端端口 (默认: `3782`) | +| `NEXT_PUBLIC_API_BASE` | 否 | 设置用于远程/局域网访问 | +| `SEARCH_PROVIDER` | 否 | 网络搜索提供商 | +| `SEARCH_API_KEY` | 否 | 搜索 API 密钥 | + +### 支持的 LLM 提供商 + +| 提供商 | `LLM_BINDING` 值 | 说明 | +|:---------|:--------------------|:------| +| OpenAI | `openai` | GPT-4o, GPT-4, GPT-3.5 | +| Anthropic | `anthropic` | Claude 3.5, Claude 3 | +| Azure OpenAI | `azure_openai` | 企业部署 | +| Ollama | `ollama` | 本地模型 | +| DeepSeek | `deepseek` | DeepSeek-V3, DeepSeek-R1 | +| Groq | `groq` | 快速推理 | +| OpenRouter | `openrouter` | 多模型网关 | +| Google Gemini | `gemini` | OpenAI 兼容模式 | + +### 支持的嵌入提供商 + +| 提供商 | `EMBEDDING_BINDING` 值 | 说明 | +|:---------|:--------------------------|:------| +| OpenAI | `openai` | text-embedding-3-large/small | +| Azure OpenAI | `azure_openai` | 企业部署 | +| Jina AI | `jina` | jina-embeddings-v3 | +| Cohere | `cohere` | embed-v3 系列 | +| Ollama | `ollama` | 本地嵌入模型 | +| LM Studio | `lm_studio` | 本地推理服务器 | +| HuggingFace | `huggingface` | OpenAI 兼容端点 | + +## 3. 
配置文件 + +DeepTutor 使用两个 YAML 配置文件进行自定义: + +### `config/agents.yaml` - Agent 参数 + +此文件控制每个模块的 LLM 参数: + +```yaml +# 解题模块 - 问题求解 agents +solve: + temperature: 0.3 + max_tokens: 8192 + +# 研究模块 - 深度研究 agents +research: + temperature: 0.5 + max_tokens: 12000 + +# 题目模块 - 题目生成 agents +question: + temperature: 0.7 + max_tokens: 4096 + +# 引导模块 - 学习引导 agents +guide: + temperature: 0.5 + max_tokens: 16192 + +# 灵感生成模块 - 想法生成 agents +ideagen: + temperature: 0.7 + max_tokens: 4096 + +# 协作写作模块 - 协作写作 agents +co_writer: + temperature: 0.7 + max_tokens: 4096 +``` + +### `config/main.yaml` - 系统设置 + +此文件控制路径、工具和模块特定设置: + +```yaml +# 系统语言 +system: + language: en + +# 数据路径 +paths: + user_data_dir: ./data/user + knowledge_bases_dir: ./data/knowledge_bases + +# 工具配置 +tools: + rag_tool: + kb_base_dir: ./data/knowledge_bases + default_kb: ai_textbook + run_code: + workspace: ./data/user/run_code_workspace + web_search: + enabled: true + query_item: + enabled: true + max_results: 5 + +# 模块特定设置 +research: + researching: + execution_mode: series # "series" 或 "parallel" + max_iterations: 5 + enable_rag_hybrid: true + enable_paper_search: true + enable_web_search: true +``` + +> **提示:** 对于大多数用户,默认配置已经足够好用。只有在需要特定自定义时才修改这些文件。 + +## 4. 知识库准备(可选) + +您可以使用我们预构建的示例知识库来快速开始。 + +### 下载示例知识库 + +从 [Google Drive](https://drive.google.com/drive/folders/1iWwfZXiTuQKQqUYb5fGDZjLCeTUP6DA6?usp=sharing) 下载并解压到 `data/` 目录。 + +::: info 重要提示 +示例知识库使用 `text-embedding-3-large`,`dimensions = 3072`。请确保您的嵌入模型具有匹配的维度。 +::: + +### 创建您自己的知识库 + +启动 DeepTutor 后: + +1. 导航到 `http://localhost:3782/knowledge` +2. 点击 **"New Knowledge Base"** +3. 输入唯一的名称 +4. 上传 PDF/TXT/MD 文件 +5. 
在终端中监控进度 + +--- + +**下一步:** [数据准备 →](/zh/guide/data-preparation) diff --git a/docs/zh/guide/troubleshooting.md b/docs/zh/guide/troubleshooting.md new file mode 100644 index 000000000..4bccdce0d --- /dev/null +++ b/docs/zh/guide/troubleshooting.md @@ -0,0 +1,48 @@ +# 常见问题 + +快速解决常见问题。 + +## 启动问题 + +| 问题 | 解决方案 | +|:--------|:---------| +| 后端启动失败 | 检查 Python ≥ 3.10,验证 `.env` 配置 | +| `npm: command not found` | 安装 Node.js: `conda install -c conda-forge nodejs` | +| 端口已被占用 | 终止进程:`lsof -i :8001` → `kill -9 <PID>` | + +## 连接问题 + +| 问题 | 解决方案 | +|:--------|:---------| +| 前端无法连接后端 | 确认后端运行在 http://localhost:8001 | +| WebSocket 连接失败 | 检查防火墙,确认 `ws://localhost:8001/api/v1/...` 格式 | +| 远程访问失败 | 在 `.env` 中设置 `NEXT_PUBLIC_API_BASE=http://your-ip:8001` | + +## Docker 问题 + +| 问题 | 解决方案 | +|:--------|:---------| +| 云端前端无法连接 | 设置 `NEXT_PUBLIC_API_BASE_EXTERNAL=https://your-server:8001` | +| 架构不匹配 | 使用 `uname -m` 检查:AMD64 用 `:latest`,ARM 用 `:latest-arm64` | + +## 知识库问题 + +| 问题 | 解决方案 | +|:--------|:---------| +| 处理卡住 | 检查终端日志,验证 API 密钥 | +| `uvloop.Loop` 错误 | 运行:`./scripts/extract_numbered_items.sh <file>` | + +## 终止后台进程 + +```bash +# macOS/Linux +lsof -i :8001 && kill -9 <PID> + +# Windows +netstat -ano | findstr :8001 +taskkill /PID <PID> /F +``` + +--- + +📖 **完整 FAQ**: [GitHub README](https://github.com/HKUDS/DeepTutor#-faq) diff --git a/docs/zh/index.md b/docs/zh/index.md new file mode 100644 index 000000000..5cf53712c --- /dev/null +++ b/docs/zh/index.md @@ -0,0 +1,77 @@ +--- +layout: home + +hero: + name: "DeepTutor" + text: "你的 AI 学习伙伴" + tagline: 将任何文档转化为多智能体驱动的互动学习体验 + image: + src: /logo.png + alt: DeepTutor + actions: + - theme: brand + text: 快速开始 → + link: /zh/guide/pre-config + - theme: alt + text: GitHub + link: https://github.com/HKUDS/DeepTutor + +features: + - icon: 📚 + title: 海量文档问答 + details: 上传教材、论文和手册,构建基于 RAG 和知识图谱的 AI 知识库。 + - icon: 🧠 + title: 智能解题 + details: 双循环推理架构配合多智能体协作,提供带有精准文档引用的逐步解答。 + - icon: 🎯 + title: 题目生成 + details: 基于知识库生成自定义测验,或模拟真实考试风格进行练习。 + - icon: 🎓 + title: 引导学习 + details: 
个性化学习路径,配合交互式可视化和自适应讲解。 + - icon: 🔬 + title: 深度研究 + details: 系统化主题探索,整合网络搜索、论文检索和文献综合。 + - icon: 💡 + title: 灵感生成 + details: AI 辅助头脑风暴,知识提取与多阶段筛选。 +--- + +## 为什么选择 DeepTutor? + +- **深度理解** — 不只是答案,而是带有可视化讲解的引导式学习之旅 +- **多模态支持** — PDF、LaTeX、图片、代码执行等全面支持 +- **知识图谱** — 基于 LightRAG 的语义连接,实现更好的理解 +- **一站式平台** — 解题、题目生成、研究、灵感生成集于一体 + + diff --git a/pyproject.toml b/pyproject.toml index 1b608155e..9080f5289 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,7 @@ dependencies = [ "python-dotenv>=1.0.0", "PyYAML>=6.0", "jinja2>=3.1.0", + "Markdown>=3.6", "openai>=1.30.0", "tiktoken>=0.5.0", "aiohttp>=3.9.4", @@ -45,6 +46,7 @@ server = [ "uvicorn[standard]>=0.24.0", "websockets>=12.0", "python-multipart>=0.0.6", + "WeasyPrint>=62.0", ] math-animator = ["manim>=0.19.0"] all = [ @@ -56,6 +58,7 @@ all = [ "uvicorn[standard]>=0.24.0", "websockets>=12.0", "python-multipart>=0.0.6", + "WeasyPrint>=62.0", "manim>=0.19.0", ] diff --git a/requirements/cli.txt b/requirements/cli.txt index dd0b84489..204c0696d 100644 --- a/requirements/cli.txt +++ b/requirements/cli.txt @@ -8,6 +8,7 @@ python-dotenv>=1.0.0 PyYAML>=6.0 jinja2>=3.1.0 +Markdown>=3.6 # --- LLM --- openai>=1.30.0 diff --git a/requirements/server.txt b/requirements/server.txt index 6c7350fb1..e11cfe658 100644 --- a/requirements/server.txt +++ b/requirements/server.txt @@ -11,3 +11,4 @@ fastapi>=0.100.0 uvicorn[standard]>=0.24.0 websockets>=12.0 python-multipart>=0.0.6 +WeasyPrint>=62.0 diff --git a/tests/api/test_structure_note_router.py b/tests/api/test_structure_note_router.py new file mode 100644 index 000000000..300582367 --- /dev/null +++ b/tests/api/test_structure_note_router.py @@ -0,0 +1,325 @@ +from __future__ import annotations + +import json +from pathlib import Path + +import pytest + +try: + from fastapi import FastAPI + from fastapi.testclient import TestClient +except Exception: # pragma: no cover + FastAPI = None + TestClient = None + +pytestmark = pytest.mark.skipif( + FastAPI is None or TestClient 
is None, reason="fastapi not installed" +) + +if FastAPI is not None and TestClient is not None: + from deeptutor.api.routers import structure_note as structure_note_router_module + from deeptutor.services.structure_note import ( + DifficultyLevel, + ExplanationStyleLevel, + JobStatus, + NoteLanguage, + StructureNoteManager, + StructureNoteStorage, + ) + + router = structure_note_router_module.router +else: # pragma: no cover + structure_note_router_module = None + router = None + + +def _build_app() -> FastAPI: + if FastAPI is None or router is None: # pragma: no cover + raise RuntimeError("fastapi is not installed") + app = FastAPI() + app.include_router(router, prefix="/api/v1/structure-note") + return app + + +def _write_kb_raw_file( + tmp_path: Path, + kb_name: str, + file_name: str, + content: bytes = b"%PDF-1.4\n%%EOF", +) -> Path: + kb_base = tmp_path / "data" / "knowledge_bases" + raw_dir = kb_base / kb_name / "raw" + raw_dir.mkdir(parents=True, exist_ok=True) + (kb_base / kb_name / "llamaindex_storage").mkdir(parents=True, exist_ok=True) + config_path = kb_base / "kb_config.json" + config = ( + json.loads(config_path.read_text(encoding="utf-8")) + if config_path.exists() + else {"knowledge_bases": {}} + ) + config.setdefault("knowledge_bases", {})[kb_name] = { + "path": kb_name, + "status": "ready", + "rag_provider": "llamaindex", + } + config_path.write_text(json.dumps(config), encoding="utf-8") + source_path = raw_dir / file_name + source_path.write_bytes(content) + return source_path + + +@pytest.fixture +def manager(tmp_path: Path): + storage = StructureNoteStorage() + original_root = storage.path_service._project_root + original_user_dir = storage.path_service._user_data_dir + try: + storage.path_service._project_root = tmp_path + storage.path_service._user_data_dir = tmp_path / "data" / "user" + storage.path_service.ensure_all_directories() + yield StructureNoteManager(storage=storage) + finally: + storage.path_service._project_root = original_root 
+ storage.path_service._user_data_dir = original_user_dir + + +def test_create_job_returns_contract(monkeypatch, manager) -> None: + monkeypatch.setattr(structure_note_router_module, "_structure_note_manager", manager) + + async def _noop_run(*_args, **_kwargs): + return None + + monkeypatch.setattr(structure_note_router_module, "_run_structure_note_job", _noop_run) + + with TestClient(_build_app()) as client: + response = client.post( + "/api/v1/structure-note/jobs", + data={ + "difficulty_level": DifficultyLevel.MEDIUM.value, + "note_language": NoteLanguage.EN.value, + "style_level": ExplanationStyleLevel.HIGH.value, + "project_name": "Course A", + }, + files={ + "file": ( + "chapter.pdf", + b"%PDF-1.4\n1 0 obj\n<<>>\nendobj\ntrailer\n<<>>\n%%EOF", + "application/pdf", + ) + }, + ) + + assert response.status_code == 200 + body = response.json() + assert body["file_name"] == "chapter.pdf" + assert body["difficulty_level"] == "medium" + assert body["note_language"] == "en" + assert body["style_level"] == "high" + assert body["project_name"] == "Course A" + assert body["note_title"] == "chapter" + assert body["source_kind"] == "upload" + assert body["status"] == "queued" + assert isinstance(body["task_id"], str) and body["task_id"] + + +def test_create_and_rename_project_updates_jobs(monkeypatch, manager) -> None: + monkeypatch.setattr(structure_note_router_module, "_structure_note_manager", manager) + + async def _noop_run(*_args, **_kwargs): + return None + + monkeypatch.setattr(structure_note_router_module, "_run_structure_note_job", _noop_run) + + with TestClient(_build_app()) as client: + create_project_response = client.post( + "/api/v1/structure-note/projects", + data={"name": "Course A"}, + ) + create_job_response = client.post( + "/api/v1/structure-note/jobs", + data={ + "difficulty_level": DifficultyLevel.MEDIUM.value, + "note_language": NoteLanguage.EN.value, + "style_level": ExplanationStyleLevel.MEDIUM.value, + "project_name": "Course A", + }, + files={ 
+ "file": ( + "chapter.pdf", + b"%PDF-1.4\n1 0 obj\n<<>>\nendobj\ntrailer\n<<>>\n%%EOF", + "application/pdf", + ) + }, + ) + rename_response = client.post( + "/api/v1/structure-note/projects/Course%20A/rename", + data={"new_name": "Course B"}, + ) + list_response = client.get("/api/v1/structure-note/projects") + detail_response = client.get( + f"/api/v1/structure-note/jobs/{create_job_response.json()['job_id']}" + ) + + assert create_project_response.status_code == 200 + assert create_project_response.json()["name"] == "Course A" + assert create_job_response.status_code == 200 + assert rename_response.status_code == 200 + assert rename_response.json()["name"] == "Course B" + project_names = [project["name"] for project in list_response.json()["projects"]] + assert "Course B" in project_names + assert "Course A" not in project_names + assert detail_response.json()["project_name"] == "Course B" + + +def test_delete_project_removes_jobs(monkeypatch, manager) -> None: + monkeypatch.setattr(structure_note_router_module, "_structure_note_manager", manager) + job_dirs = manager.storage.ensure_job_dirs("job_project_delete") + source_path = job_dirs["source"] / "demo.pdf" + source_path.write_text("pdf", encoding="utf-8") + artifact = manager.create_job( + file_name="demo.pdf", + source_format="pdf", + difficulty_level=DifficultyLevel.SIMPLE, + note_language=NoteLanguage.ZH, + style_level=ExplanationStyleLevel.LOW, + source_path=source_path, + task_id="task_1", + job_id="job_project_delete", + project_name="Course A", + ) + assert manager.storage.get_job_dir(artifact.job_id).exists() + + with TestClient(_build_app()) as client: + response = client.delete("/api/v1/structure-note/projects/Course%20A") + list_response = client.get("/api/v1/structure-note/jobs") + + assert response.status_code == 200 + assert response.json()["deleted_job_ids"] == ["job_project_delete"] + assert not manager.storage.get_job_dir(artifact.job_id).exists() + assert list_response.json()["jobs"] == [] 
+ + +def test_list_and_detail_jobs(monkeypatch, manager) -> None: + monkeypatch.setattr(structure_note_router_module, "_structure_note_manager", manager) + job_dirs = manager.storage.ensure_job_dirs("job_1") + source_path = job_dirs["source"] / "demo.pdf" + source_path.write_text("pdf", encoding="utf-8") + artifact = manager.create_job( + file_name="demo.pdf", + source_format="pdf", + difficulty_level=DifficultyLevel.SIMPLE, + note_language=NoteLanguage.ZH, + style_level=ExplanationStyleLevel.LOW, + source_path=source_path, + task_id="task_1", + job_id="job_1", + ) + manager.update_status(artifact, JobStatus.FAILED, error="boom", task_id="task_1") + + with TestClient(_build_app()) as client: + list_response = client.get("/api/v1/structure-note/jobs") + detail_response = client.get("/api/v1/structure-note/jobs/job_1") + + assert list_response.status_code == 200 + jobs = list_response.json()["jobs"] + assert jobs[0]["job_id"] == "job_1" + assert detail_response.status_code == 200 + assert detail_response.json()["retry_available"] is True + assert detail_response.json()["note_language"] == "zh" + assert detail_response.json()["style_level"] == "low" + + +def test_retry_failed_job(monkeypatch, manager) -> None: + monkeypatch.setattr(structure_note_router_module, "_structure_note_manager", manager) + + async def _noop_run(*_args, **_kwargs): + return None + + monkeypatch.setattr(structure_note_router_module, "_run_structure_note_job", _noop_run) + + job_dirs = manager.storage.ensure_job_dirs("job_retry") + source_path = job_dirs["source"] / "deck.pdf" + source_path.write_text("pdf", encoding="utf-8") + artifact = manager.create_job( + file_name="deck.pdf", + source_format="pdf", + difficulty_level=DifficultyLevel.MEDIUM, + note_language=NoteLanguage.EN, + style_level=ExplanationStyleLevel.MEDIUM, + source_path=source_path, + task_id="task_old", + job_id="job_retry", + ) + manager.update_status(artifact, JobStatus.FAILED, error="old error", task_id="task_old") + + with 
TestClient(_build_app()) as client: + response = client.post("/api/v1/structure-note/jobs/job_retry/retry") + + assert response.status_code == 200 + body = response.json() + assert body["job_id"] == "job_retry" + assert body["status"] == "queued" + assert body["task_id"] != "task_old" + + +def test_list_knowledge_base_source_files(monkeypatch, tmp_path: Path, manager) -> None: + monkeypatch.setattr(structure_note_router_module, "_structure_note_manager", manager) + monkeypatch.setattr( + structure_note_router_module, + "_kb_base_dir", + tmp_path / "data" / "knowledge_bases", + ) + _write_kb_raw_file(tmp_path, "Math", "lesson.pdf") + _write_kb_raw_file(tmp_path, "Ignored", "notes.txt") + + with TestClient(_build_app()) as client: + response = client.get("/api/v1/structure-note/kb/files") + + assert response.status_code == 200 + groups = response.json()["knowledge_bases"] + math_group = next(group for group in groups if group["kb_name"] == "Math") + assert math_group["files"][0]["file_id"] == "lesson.pdf" + assert math_group["files"][0]["file_name"] == "lesson.pdf" + ignored_group = next(group for group in groups if group["kb_name"] == "Ignored") + assert ignored_group["files"] == [] + + +def test_create_job_from_knowledge_base_file(monkeypatch, tmp_path: Path, manager) -> None: + monkeypatch.setattr(structure_note_router_module, "_structure_note_manager", manager) + monkeypatch.setattr( + structure_note_router_module, + "_kb_base_dir", + tmp_path / "data" / "knowledge_bases", + ) + + async def _noop_run(*_args, **_kwargs): + return None + + monkeypatch.setattr(structure_note_router_module, "_run_structure_note_job", _noop_run) + _write_kb_raw_file(tmp_path, "Math", "lesson.pdf") + + with TestClient(_build_app()) as client: + response = client.post( + "/api/v1/structure-note/jobs/from-kb", + data={ + "kb_name": "Math", + "file_id": "lesson.pdf", + "difficulty_level": DifficultyLevel.SIMPLE.value, + "note_language": NoteLanguage.ZH.value, + "style_level": 
ExplanationStyleLevel.MEDIUM.value, + }, + ) + + assert response.status_code == 200 + body = response.json() + assert body["file_name"] == "lesson.pdf" + assert body["project_name"] == "Math" + assert body["note_title"] == "lesson" + assert body["source_kind"] == "knowledge_base" + assert body["source_ref"] == { + "kb_name": "Math", + "file_id": "lesson.pdf", + "file_name": "lesson.pdf", + } + artifact = manager.get_job(body["job_id"]) + assert Path(artifact.source_path).exists() diff --git a/tests/services/test_path_service.py b/tests/services/test_path_service.py index 50162e2a5..c7222e924 100644 --- a/tests/services/test_path_service.py +++ b/tests/services/test_path_service.py @@ -30,9 +30,10 @@ def test_public_output_filter_allows_only_whitelisted_artifacts(tmp_path: Path) denied.parent.mkdir(parents=True, exist_ok=True) denied.write_text("{}", encoding="utf-8") - assert service.is_public_output_path( - "workspace/chat/deep_solve/solve_1/artifacts/plot.png" - ) is True + assert ( + service.is_public_output_path("workspace/chat/deep_solve/solve_1/artifacts/plot.png") + is True + ) assert service.is_public_output_path("settings/env.json") is False assert service.is_public_output_path("../outside.txt") is False finally: @@ -73,12 +74,16 @@ def test_public_output_filter_allows_math_animator_artifacts(tmp_path: Path) -> denied.parent.mkdir(parents=True, exist_ok=True) denied.write_text("print('debug')", encoding="utf-8") - assert service.is_public_output_path( - "workspace/chat/math_animator/turn_1/artifacts/animation.mp4" - ) is True - assert service.is_public_output_path( - "workspace/chat/math_animator/turn_1/source/scene.py" - ) is False + assert ( + service.is_public_output_path( + "workspace/chat/math_animator/turn_1/artifacts/animation.mp4" + ) + is True + ) + assert ( + service.is_public_output_path("workspace/chat/math_animator/turn_1/source/scene.py") + is False + ) finally: service._project_root = original_root service._user_data_dir = original_user_dir 
@@ -97,13 +102,72 @@ def test_task_workspace_maps_capabilities_into_workspace_chat(tmp_path: Path) -> tmp_path / "data" / "user" / "workspace" / "chat" / "chat" / "turn_1" ) assert service.get_task_workspace("deep_question", "turn_2") == ( - tmp_path - / "data" - / "user" + tmp_path / "data" / "user" / "workspace" / "chat" / "deep_question" / "turn_2" + ) + finally: + service._project_root = original_root + service._user_data_dir = original_user_dir + + +def test_public_output_filter_allows_structure_note_markdown_pdf_and_images(tmp_path: Path) -> None: + service = PathService.get_instance() + original_root = service._project_root + original_user_dir = service._user_data_dir + + try: + service._project_root = tmp_path + service._user_data_dir = tmp_path / "data" / "user" + + allowed = ( + service._user_data_dir / "workspace" - / "chat" - / "deep_question" - / "turn_2" + / "structure_note" + / "job_1" + / "final" + / "final.pdf" + ) + allowed.parent.mkdir(parents=True, exist_ok=True) + allowed.write_text("pdf", encoding="utf-8") + allowed_md = allowed.with_name("rendered.md") + allowed_md.write_text("# Note", encoding="utf-8") + allowed_image = ( + service._user_data_dir + / "workspace" + / "structure_note" + / "job_1" + / "images" + / "figure.png" + ) + allowed_image.parent.mkdir(parents=True, exist_ok=True) + allowed_image.write_text("png", encoding="utf-8") + + denied = ( + service._user_data_dir + / "workspace" + / "structure_note" + / "job_1" + / "chunks" + / "generation_chunks.json" + ) + denied.parent.mkdir(parents=True, exist_ok=True) + denied.write_text("{}", encoding="utf-8") + + assert ( + service.is_public_output_path("workspace/structure_note/job_1/final/final.pdf") is True + ) + assert ( + service.is_public_output_path("workspace/structure_note/job_1/final/rendered.md") + is True + ) + assert ( + service.is_public_output_path("workspace/structure_note/job_1/images/figure.png") + is True + ) + assert ( + service.is_public_output_path( + 
"workspace/structure_note/job_1/chunks/generation_chunks.json" + ) + is False ) finally: service._project_root = original_root diff --git a/tests/services/test_runtime_storage_guard.py b/tests/services/test_runtime_storage_guard.py index 068505b60..a713ddd1c 100644 --- a/tests/services/test_runtime_storage_guard.py +++ b/tests/services/test_runtime_storage_guard.py @@ -2,9 +2,9 @@ from pathlib import Path +from deeptutor.agents.research.utils.citation_manager import CitationManager from deeptutor.services.config.loader import load_config_with_main from deeptutor.services.path_service import PathService -from deeptutor.agents.research.utils.citation_manager import CitationManager def test_runtime_config_paths_are_confined_to_data_user() -> None: @@ -18,6 +18,7 @@ def test_runtime_config_paths_are_confined_to_data_user() -> None: assert Path(paths["research_output_dir"]).resolve().is_relative_to(user_root) assert Path(paths["research_reports_dir"]).resolve().is_relative_to(user_root) assert Path(paths["guide_output_dir"]).resolve().is_relative_to(user_root) + assert Path(paths["structure_note_output_dir"]).resolve().is_relative_to(user_root) assert Path(paths["user_log_dir"]).resolve() == user_root / "logs" assert Path(config["tools"]["run_code"]["workspace"]).resolve().is_relative_to(user_root) @@ -34,13 +35,7 @@ def test_citation_manager_defaults_to_research_workspace(tmp_path: Path) -> None manager = CitationManager("research_123") assert manager.cache_dir == ( - tmp_path - / "data" - / "user" - / "workspace" - / "chat" - / "deep_research" - / "research_123" + tmp_path / "data" / "user" / "workspace" / "chat" / "deep_research" / "research_123" ) finally: service._project_root = original_root diff --git a/tests/services/test_structure_note_service.py b/tests/services/test_structure_note_service.py new file mode 100644 index 000000000..679b583a4 --- /dev/null +++ b/tests/services/test_structure_note_service.py @@ -0,0 +1,373 @@ +from __future__ import annotations + 
+from pathlib import Path + +import pytest + +from deeptutor.services.structure_note import generator as generator_module +from deeptutor.services.structure_note.difficulty import get_difficulty_preset +from deeptutor.services.structure_note.generator import ( + _combination_instruction, + build_generation_chunks, + generate_transition_markdown, +) +from deeptutor.services.structure_note.image_pipeline import process_images +from deeptutor.services.structure_note.manager import StructureNoteManager +from deeptutor.services.structure_note.markdown_postprocessor import ( + normalize_structure_note_markdown, + validate_renderer_compatible_markdown, +) +from deeptutor.services.structure_note.models import ( + DifficultyLevel, + ExplanationStyleLevel, + GenerationChunk, + JobStatus, + NoteLanguage, + PageIndexPage, + SectionTreeNode, + StructureNoteArtifact, +) +from deeptutor.services.structure_note.normalizer import NormalizationError, normalize_to_pdf +from deeptutor.services.structure_note.planner import build_document_plan +from deeptutor.services.structure_note.tree_builder import build_section_tree + + +def _page(page_number: int, *, text: str = "", image_candidates=None) -> PageIndexPage: + return PageIndexPage( + page_number=page_number, + width=800, + height=1200, + text=text, + text_blocks=[], + title_candidates=[], + image_candidates=image_candidates or [], + ) + + +def test_normalizer_requires_soffice_for_ppt(monkeypatch, tmp_path: Path) -> None: + ppt_path = tmp_path / "deck.pptx" + ppt_path.write_text("fake", encoding="utf-8") + monkeypatch.setattr("shutil.which", lambda _name: None) + + with pytest.raises(NormalizationError) as exc: + normalize_to_pdf(ppt_path, tmp_path / "normalized") + + assert "LibreOffice" in str(exc.value) + + +def test_generation_chunks_use_difficulty_window() -> None: + pages = [_page(index, text=f"content {index}") for index in range(1, 15)] + chunks = build_generation_chunks(pages, [], 
get_difficulty_preset(DifficultyLevel.DETAILED)) + + assert chunks + assert all(len(chunk.page_numbers) <= 6 for chunk in chunks) + assert chunks[0].page_start == 1 + + +def test_generation_chunks_use_section_plan_not_page_windows() -> None: + pages = [_page(index, text=f"content {index}") for index in range(1, 15)] + sections = [ + SectionTreeNode( + section_id="section-001", + title="Long Section", + level=2, + page_start=1, + page_end=14, + summary="A long section.", + path=["Long Section"], + ) + ] + plan = build_document_plan(pages, sections, document_title="demo.pdf", language="en") + chunks = build_generation_chunks( + pages, + sections, + get_difficulty_preset(DifficultyLevel.DETAILED), + document_plan=plan, + ) + + assert len(chunks) == 1 + assert chunks[0].section_id == "section-001" + assert chunks[0].page_start == 1 + assert chunks[0].page_end == 14 + assert chunks[0].evidence + + +def test_generation_chunks_limit_parent_section_to_overview_pages() -> None: + pages = [_page(index, text=f"content {index}") for index in range(1, 7)] + sections = [ + SectionTreeNode( + section_id="section-001", + title="Chapter", + level=2, + page_start=1, + page_end=6, + summary="Chapter overview.", + child_ids=["section-002"], + path=["Chapter"], + ), + SectionTreeNode( + section_id="section-002", + title="Main Topic", + level=3, + page_start=2, + page_end=6, + parent_id="section-001", + summary="Main details.", + path=["Chapter", "Main Topic"], + ), + ] + plan = build_document_plan(pages, sections, document_title="demo.pdf", language="en") + chunks = build_generation_chunks( + pages, + sections, + get_difficulty_preset(DifficultyLevel.MEDIUM), + document_plan=plan, + ) + + assert [chunk.section_id for chunk in chunks] == ["section-001", "section-002"] + assert chunks[0].page_numbers == [1] + assert chunks[1].page_numbers == [2, 3, 4, 5, 6] + + +@pytest.mark.parametrize( + ("depth", "style", "expected"), + [ + (DifficultyLevel.SIMPLE, ExplanationStyleLevel.LOW, 
"短篇幅、低门槛"), + (DifficultyLevel.SIMPLE, ExplanationStyleLevel.MEDIUM, "核心知识骨架"), + (DifficultyLevel.SIMPLE, ExplanationStyleLevel.HIGH, "short but dense"), + (DifficultyLevel.MEDIUM, ExplanationStyleLevel.LOW, "中等篇幅、科普风格"), + (DifficultyLevel.MEDIUM, ExplanationStyleLevel.MEDIUM, "默认模式"), + (DifficultyLevel.MEDIUM, ExplanationStyleLevel.HIGH, "学术课堂风格"), + (DifficultyLevel.DETAILED, ExplanationStyleLevel.LOW, "长篇幅、低门槛"), + (DifficultyLevel.DETAILED, ExplanationStyleLevel.MEDIUM, "完整课堂讲义"), + (DifficultyLevel.DETAILED, ExplanationStyleLevel.HIGH, "学术讲义/课程笔记"), + ], +) +def test_depth_style_combinations_have_stable_prompt_semantics( + depth: DifficultyLevel, + style: ExplanationStyleLevel, + expected: str, +) -> None: + assert expected in _combination_instruction(depth, style, NoteLanguage.ZH.value) + + +def test_markdown_postprocessor_normalizes_renderer_math_syntax() -> None: + markdown = ( + "Use $x + y$ to express the sum, call \\(softmax(x)\\), and keep currency $100.\n\n" + "$$a=b$$\n\n" + "$$\\(c=d\\)$$\n\n" + "\\[e=f\\]\n\n" + "```python\nprint('$x$')\n```\n" + ) + + normalized = normalize_structure_note_markdown(markdown) + + assert "$x + y$" in normalized + assert "`softmax(x)`" in normalized + assert "$$\na=b\n$$" in normalized + assert "$$\nc=d\n$$" in normalized + assert "$$\ne=f\n$$" in normalized + assert "\\(" not in normalized + assert "\\[" not in normalized + assert "print('$x$')" in normalized + assert validate_renderer_compatible_markdown(normalized).ok is True + + +def test_markdown_validation_detects_damaged_formula() -> None: + result = validate_renderer_compatible_markdown("The update is $x + \\frac{1}{2.\n") + + assert result.ok is False + assert any("damaged inline math delimiter" in warning for warning in result.warnings) + + +def test_compose_markdown_inserts_generated_transition_between_major_sections() -> None: + artifact = StructureNoteArtifact( + job_id="job_1", + file_name="demo.pdf", + source_format="pdf", + 
difficulty_level=DifficultyLevel.MEDIUM, + note_language=NoteLanguage.ZH, + style_level=ExplanationStyleLevel.HIGH, + status=JobStatus.RENDERING, + source_path="/tmp/demo.pdf", + created_at="2026-01-01T00:00:00", + updated_at="2026-01-01T00:00:00", + ) + chunks = [ + GenerationChunk( + chunk_id="chunk-001", + section_id="section-001", + section_title="基础概念", + section_summary="基础概念", + section_path=["基础概念"], + heading_level=2, + page_start=1, + page_end=3, + page_numbers=[1, 2, 3], + markdown="## 基础概念\n\n内容。", + ), + GenerationChunk( + chunk_id="chunk-002", + section_id="section-002", + section_title="理论展开", + section_summary="理论展开", + section_path=["理论展开"], + heading_level=2, + page_start=4, + page_end=6, + page_numbers=[4, 5, 6], + markdown="## 理论展开\n\n内容。", + ), + ] + + markdown = StructureNoteManager()._compose_markdown( + artifact, + chunks, + NoteLanguage.ZH.value, + transition_map={ + "section-002": "基础概念已经确定了问题的核心变量,但要解释这些变量如何形成稳定关系,还需要进入理论层面的约束与推理。" + }, + ) + + assert "> **过渡:**" not in markdown + assert "建立了必要的概念基础" not in markdown + assert "核心变量" in markdown + assert "基础概念" in markdown + assert "理论展开" in markdown + + +@pytest.mark.asyncio +async def test_generate_transition_markdown_uses_llm_context(monkeypatch) -> None: + captured: dict[str, str] = {} + + async def _fake_complete(prompt: str, **_kwargs): + captured["prompt"] = prompt + return ( + "> 前面的定义已经说明目标函数衡量什么,但优化过程还需要解释参数为什么沿梯度方向移动。" + ) + + monkeypatch.setattr(generator_module, "llm_complete", _fake_complete) + previous = GenerationChunk( + chunk_id="chunk-001", + section_id="section-001", + section_title="目标函数", + section_summary="定义损失与优化目标。", + section_path=["目标函数"], + heading_level=2, + page_start=1, + page_end=2, + page_numbers=[1, 2], + markdown="## 目标函数\n\n损失函数刻画预测与真实结果之间的差异。" * 20, + ) + current = GenerationChunk( + chunk_id="chunk-002", + section_id="section-002", + section_title="梯度下降", + section_summary="用梯度更新参数。", + section_path=["梯度下降"], + heading_level=2, + page_start=3, + 
page_end=4, + page_numbers=[3, 4], + markdown="## 梯度下降\n\n梯度方向给出局部变化最快的方向。", + ) + + transition = await generate_transition_markdown( + previous, + current, + language=NoteLanguage.ZH.value, + style_level=ExplanationStyleLevel.HIGH, + ) + + assert transition.startswith("前面的定义") + assert not transition.startswith(">") + assert "Title: 目标函数" in captured["prompt"] + assert "Title: 梯度下降" in captured["prompt"] + assert "损失函数刻画预测" in captured["prompt"] + assert "梯度方向给出" in captured["prompt"] + assert "Do NOT use template phrases" in captured["prompt"] + + +@pytest.mark.asyncio +async def test_generate_transition_markdown_falls_back_without_blocking(monkeypatch) -> None: + async def _raise_complete(*_args, **_kwargs): + raise RuntimeError("llm unavailable") + + monkeypatch.setattr(generator_module, "llm_complete", _raise_complete) + previous = GenerationChunk( + chunk_id="chunk-001", + section_title="A", + section_path=["A"], + heading_level=2, + page_start=1, + page_end=1, + page_numbers=[1], + markdown="## A\n\nA content.", + ) + current = GenerationChunk( + chunk_id="chunk-002", + section_title="B", + section_path=["B"], + heading_level=2, + page_start=2, + page_end=2, + page_numbers=[2], + markdown="## B\n\nB content.", + ) + + transition = await generate_transition_markdown( + previous, + current, + language=NoteLanguage.EN.value, + style_level=ExplanationStyleLevel.MEDIUM, + ) + + assert transition == "A leads naturally to B." 
+ + +@pytest.mark.asyncio +async def test_tree_builder_falls_back_without_candidates() -> None: + pages = [_page(index, text=f"content {index}") for index in range(1, 12)] + tree = await build_section_tree(pages, page_window=5, language="en") + + assert tree + assert tree[0].page_start == 1 + assert tree[0].page_end == 5 + + +def test_image_pipeline_falls_back_to_full_page(tmp_path: Path) -> None: + fitz = pytest.importorskip("fitz") + pdf_path = tmp_path / "demo.pdf" + document = fitz.open() + page = document.new_page(width=400, height=600) + page.insert_text((72, 72), "Hello Structure Note") + document.save(pdf_path) + document.close() + + pages = [_page(1, text="Hello", image_candidates=[])] + chunks = [ + GenerationChunk( + chunk_id="chunk-001", + section_title="Page 1", + section_path=["Page 1"], + page_start=1, + page_end=1, + page_numbers=[1], + markdown="## Page 1\n\n[[IMAGE_PLACEHOLDER:chunk-001-image-1:1:key_figure]]\n", + placeholder_ids=["chunk-001-image-1"], + ) + ] + + next_chunks, placeholders, citations = process_images( + chunks, + pages, + pdf_path, + tmp_path / "images", + "demo.pdf", + language="en", + ) + + assert placeholders[0].status == "fallback_page" + assert citations[0].source_kind == "image" + assert "images/chunk-001-image-1.png" in next_chunks[0].markdown + assert "It supports the explanation of Page 1" in next_chunks[0].markdown diff --git a/web/app/(workspace)/structure-note/page.tsx b/web/app/(workspace)/structure-note/page.tsx new file mode 100644 index 000000000..0f40aacb9 --- /dev/null +++ b/web/app/(workspace)/structure-note/page.tsx @@ -0,0 +1,1876 @@ +'use client' + +import { useEffect, useMemo, useRef, useState } from 'react' +import { + AlertCircle, + BookOpen, + CheckCircle2, + ChevronDown, + ChevronRight, + Clock3, + Database, + Download, + Eye, + FileText, + FolderOpen, + Layers3, + ListTree, + Loader2, + PanelLeftClose, + PanelLeftOpen, + Pencil, + Plus, + RefreshCcw, + ScrollText, + Trash2, + Upload, + X, +} from 
'lucide-react' +import { useTranslation } from 'react-i18next' +import RichMarkdownRenderer from '@/components/common/RichMarkdownRenderer' +import Button from '@/components/ui/Button' +import { apiUrl } from '@/lib/api' +import { + createStructureNoteJob, + createStructureNoteJobFromKnowledgeBase, + createStructureNoteProject, + deleteStructureNoteProject, + fetchStructureNoteMarkdown, + getStructureNoteJob, + invalidateStructureNoteCaches, + listStructureNoteJobs, + listStructureNoteKnowledgeBaseFiles, + listStructureNoteProjects, + renameStructureNoteProject, + retryStructureNoteJob, + type StructureNoteDifficulty, + type StructureNoteJob, + type StructureNoteKbFile, + type StructureNoteKbGroup, + type StructureNoteLanguage, + type StructureNoteProject, + type StructureNoteStatus, + type StructureNoteStyleLevel, +} from '@/lib/structure-note-api' + +type SourceMode = 'upload' | 'knowledge_base' + +interface MaterialNode { + key: string + fileName: string + sourceKind: 'upload' | 'knowledge_base' + updatedAt: string +} + +interface VersionNode { + label: string + job: StructureNoteJob + isLatest: boolean +} + +interface NoteNode { + key: string + title: string + sourceFileName: string + latestJob: StructureNoteJob + versions: VersionNode[] +} + +interface ProjectNode { + key: string + name: string + materials: MaterialNode[] + notes: NoteNode[] +} + +type MaterialMap = Map +type NoteJobMap = Map + +interface ProjectAccumulator { + name: string + materials: MaterialMap + notes: NoteJobMap +} + +const PROCESSING_STATUS_ORDER: StructureNoteStatus[] = [ + 'queued', + 'normalizing', + 'indexing', + 'planning', + 'generating', + 'processing_images', + 'rendering', +] + +const PROCESSING_STATUSES = new Set(PROCESSING_STATUS_ORDER) + +const DIFFICULTY_OPTIONS: Array<{ + value: StructureNoteDifficulty + labelKey: string + hintKey: string +}> = [ + { + value: 'simple', + labelKey: 'Simple', + hintKey: 'Shorter notes focused on key definitions and outcomes.', + }, + { + 
value: 'medium', + labelKey: 'Medium', + hintKey: 'Balanced classroom-style coverage with core logic.', + }, + { + value: 'detailed', + labelKey: 'Detailed', + hintKey: 'Longer notes with deeper reasoning and slower generation.', + }, +] + +const NOTE_LANGUAGE_OPTIONS: Array<{ + value: StructureNoteLanguage + labelKey: string + hintKey: string +}> = [ + { + value: 'zh', + labelKey: 'Chinese', + hintKey: 'Generate the final note content in Chinese.', + }, + { + value: 'en', + labelKey: 'English', + hintKey: 'Generate the final note content in English.', + }, +] + +const STYLE_LEVEL_OPTIONS: Array<{ + value: StructureNoteStyleLevel + labelKey: string + hintKey: string +}> = [ + { + value: 'low', + labelKey: 'Low', + hintKey: 'Popular-science style for fast entry-level understanding.', + }, + { + value: 'medium', + labelKey: 'Medium', + hintKey: 'Standard classroom note style with balanced clarity and detail.', + }, + { + value: 'high', + labelKey: 'High', + hintKey: + 'Academic style with more rigorous principles, formulas, and derivations when supported.', + }, +] + +const STATUS_LABELS: Record = { + queued: 'Queued', + normalizing: 'Normalizing source', + indexing: 'Building page index', + planning: 'Planning sections', + generating: 'Generating notes', + processing_images: 'Processing figures', + rendering: 'Rendering PDF', + ready: 'Ready', + failed: 'Failed', +} + +function formatTimestamp(value: string) { + if (!value) return '' + try { + return new Intl.DateTimeFormat(undefined, { + month: 'short', + day: 'numeric', + hour: 'numeric', + minute: '2-digit', + }).format(new Date(value)) + } catch { + return value + } +} + +function formatFileSize(sizeBytes: number) { + if (!Number.isFinite(sizeBytes) || sizeBytes <= 0) return '' + if (sizeBytes < 1024 * 1024) { + return `${Math.max(1, Math.round(sizeBytes / 1024))} KB` + } + return `${(sizeBytes / 1024 / 1024).toFixed(1)} MB` +} + +function formatPageRange(pageStart: number, pageEnd: number) { + return pageStart 
=== pageEnd ? `p. ${pageStart}` : `pp. ${pageStart}-${pageEnd}` +} + +function stripExtension(fileName: string) { + return fileName.replace(/\.[^/.]+$/, '') || fileName +} + +function getProjectInitials(name: string) { + const compact = name.trim() + if (!compact) return 'SN' + return compact.slice(0, 2).toUpperCase() +} + +function getProjectName(job: StructureNoteJob) { + return job.project_name || job.source_ref?.kb_name || 'Local Uploads' +} + +function getSourceKind(job: StructureNoteJob): 'upload' | 'knowledge_base' { + return job.source_kind === 'knowledge_base' ? 'knowledge_base' : 'upload' +} + +function getSourceFileName(job: StructureNoteJob) { + return job.source_ref?.file_name || job.file_name +} + +function getSourceFileId(job: StructureNoteJob) { + if (getSourceKind(job) === 'knowledge_base') { + return `${job.source_ref?.kb_name || getProjectName(job)}/${job.source_ref?.file_id || job.file_name}` + } + return job.source_ref?.file_name || job.file_name +} + +function getMaterialKey(job: StructureNoteJob) { + return `${getSourceKind(job)}:${getProjectName(job)}:${getSourceFileId(job)}` +} + +function getNoteTitle(job: StructureNoteJob) { + return job.note_title || stripExtension(getSourceFileName(job)) +} + +function getStatusProgress(status: StructureNoteStatus) { + if (status === 'ready' || status === 'failed') return 100 + const index = PROCESSING_STATUS_ORDER.indexOf(status) + if (index < 0) return 8 + return Math.max(10, Math.round(((index + 1) / PROCESSING_STATUS_ORDER.length) * 92)) +} + +function buildProjectTree( + jobs: StructureNoteJob[], + projectRecords: StructureNoteProject[] +): ProjectNode[] { + const projectMap = new Map() + + projectRecords.forEach(project => { + const projectName = project.name.trim() + if (!projectName) return + projectMap.set(projectName, { + name: projectName, + materials: new Map(), + notes: new Map(), + }) + }) + + jobs.forEach(job => { + const projectName = getProjectName(job) + const projectKey = projectName 
+ const project = projectMap.get(projectKey) ?? { + name: projectName, + materials: new Map(), + notes: new Map(), + } + const materialKey = getMaterialKey(job) + const existingMaterial = project.materials.get(materialKey) + if (!existingMaterial || job.updated_at > existingMaterial.updatedAt) { + project.materials.set(materialKey, { + key: materialKey, + fileName: getSourceFileName(job), + sourceKind: getSourceKind(job), + updatedAt: job.updated_at, + }) + } + project.notes.set(materialKey, [...(project.notes.get(materialKey) ?? []), job]) + projectMap.set(projectKey, project) + }) + + return Array.from(projectMap.entries()) + .map(([projectKey, project]) => { + const notes = Array.from(project.notes.entries()) + .map(([noteKey, noteJobs]) => { + const sortedVersions = [...noteJobs].sort((a, b) => + a.created_at.localeCompare(b.created_at) + ) + const latestJob = sortedVersions[sortedVersions.length - 1] + return { + key: noteKey, + title: getNoteTitle(latestJob), + sourceFileName: getSourceFileName(latestJob), + latestJob, + versions: sortedVersions.map((job, index) => ({ + label: `v${index + 1}`, + job, + isLatest: index === sortedVersions.length - 1, + })), + } + }) + .sort((a, b) => b.latestJob.updated_at.localeCompare(a.latestJob.updated_at)) + + return { + key: projectKey, + name: project.name, + materials: Array.from(project.materials.values()).sort((a, b) => + a.fileName.localeCompare(b.fileName) + ), + notes, + } + }) + .sort((a, b) => a.name.localeCompare(b.name)) +} + +function rewriteRelativeMarkdownAssets(content: string, assetBaseUrl: string | null) { + if (!assetBaseUrl) return content + const base = apiUrl(assetBaseUrl).replace(/\/$/, '') + return content.replace( + /(!\[[^\]]*\]\()((?!https?:\/\/|data:|\/|#)[^)]+)(\))/gi, + (_match, prefix, src, suffix) => { + const normalizedSrc = String(src).replace(/^\.\//, '') + return `${prefix}${base}/${normalizedSrc}${suffix}` + } + ) +} + +function scrollToSection(sectionId: string) { + const target = 
document.getElementById(sectionId) + target?.scrollIntoView({ block: 'start', behavior: 'smooth' }) +} + +function StatusIcon({ status }: { status: StructureNoteStatus }) { + if (status === 'ready') { + return