Project-N-E-K-O
diff --git a/‎plugin/plugins/study_companion/__init__.py‎
Lines changed: 130 additions & 9 deletions b/‎plugin/plugins/study_companion/__init__.py‎
Lines changed: 130 additions & 9 deletions
diff --git a/‎plugin/plugins/study_companion/models.py‎
Lines changed: 20 additions & 1 deletion b/‎plugin/plugins/study_companion/models.py‎
Lines changed: 20 additions & 1 deletion
diff --git a/‎plugin/plugins/study_companion/plugin.toml‎
Lines changed: 2 additions & 0 deletions b/‎plugin/plugins/study_companion/plugin.toml‎
Lines changed: 2 additions & 0 deletions
@@ -2,6 +2,7 @@
 
 import asyncio
 import base64
+import binascii
 from datetime import datetime
 import math
 from pathlib import Path
@@ -112,6 +113,60 @@ def _register_install_routes() -> None:
     )
 
 
+# Upper bound on the decoded image size in bytes (10 * 1024 * 1024). Despite
+# the "BASE64_LENGTH" suffix, this value is compared against the decoded byte
+# count, not against the length of the base64 text.
+_MAX_SUBMITTED_IMAGE_BASE64_LENGTH = 10 * 1024 * 1024
+# Longest base64 text accepted before decoding: the padded base64 length of a
+# payload at the byte limit above (4 characters per 3 bytes, rounded up), plus
+# 64 characters of slack.
+_MAX_SUBMITTED_IMAGE_BASE64_ENCODED_LENGTH = (
+    ((_MAX_SUBMITTED_IMAGE_BASE64_LENGTH + 2) // 3) * 4 + 64
+)
+# Accepted data-URL headers (the lower-cased text before the first comma),
+# mapped to the MIME type each header declares.
+_SUPPORTED_SUBMITTED_IMAGE_MIME_BY_DATA_URL_PREFIX = {
+    "data:image/jpeg;base64": "image/jpeg",
+    "data:image/png;base64": "image/png",
+}
+
+
+def _detect_submitted_image_mime(raw: bytes) -> str:
+    """Return the MIME type implied by the leading signature bytes of ``raw``.
+
+    Only the JPEG (``FF D8 FF``) and PNG signatures are recognized; any other
+    content, including empty input, yields an empty string.
+    """
+    known_signatures = (
+        (b"\xff\xd8\xff", "image/jpeg"),
+        (b"\x89PNG\r\n\x1a\n", "image/png"),
+    )
+    for magic, mime in known_signatures:
+        if raw.startswith(magic):
+            return mime
+    return ""
+
+
+def _normalize_submitted_image_payload(image_base64: str) -> str:
+    """Validate a submitted image and return it as a canonical data URL.
+
+    The input may be bare base64 text or a ``data:image/jpeg;base64,...`` /
+    ``data:image/png;base64,...`` data URL. The base64 text is decoded
+    strictly, size-limited, and its leading bytes are checked against the
+    JPEG/PNG signatures.
+
+    Args:
+        image_base64: Base64 text or data URL supplied by the caller.
+
+    Returns:
+        ``data:<mime>;base64,<payload>``, where ``<mime>`` is detected from
+        the decoded bytes and ``<payload>`` is the stripped base64 text as
+        submitted.
+
+    Raises:
+        ValueError: If the payload is empty; the data URL has no comma or no
+            content after it; the data URL header is not a supported JPEG/PNG
+            header; the encoded text or decoded bytes exceed the size limit;
+            the text is not valid base64 or decodes to nothing; the decoded
+            bytes are not JPEG/PNG; or the declared MIME type disagrees with
+            the detected one.
+    """
+    image_payload = str(image_base64 or "").strip()
+    if not image_payload:
+        raise ValueError("image_base64 is required")
+
+    expected_mime = ""
+    encoded_payload = image_payload
+    # Inspect only the five-character scheme prefix: lower-casing the whole
+    # payload would copy a potentially multi-megabyte string before any size
+    # limit has been enforced.
+    if image_payload[:5].lower() == "data:":
+        header, separator, encoded_payload = image_payload.partition(",")
+        expected_mime = _SUPPORTED_SUBMITTED_IMAGE_MIME_BY_DATA_URL_PREFIX.get(
+            header.strip().lower(),
+            "",
+        )
+        if not separator or not encoded_payload.strip():
+            raise ValueError("image_base64 data URL is malformed")
+        if not expected_mime:
+            raise ValueError("only JPEG/PNG data URLs are supported")
+    encoded_payload = encoded_payload.strip()
+    # Reject oversized text before decoding so the decode buffer stays bounded.
+    if len(encoded_payload) > _MAX_SUBMITTED_IMAGE_BASE64_ENCODED_LENGTH:
+        raise ValueError("image_base64 is too large (max 10MB)")
+    try:
+        # validate=True rejects any character outside the base64 alphabet
+        # instead of silently discarding it.
+        raw = base64.b64decode(encoded_payload, validate=True)
+    except (binascii.Error, ValueError) as exc:
+        raise ValueError("image_base64 is not valid base64") from exc
+    if not raw:
+        raise ValueError("image_base64 is not valid base64")
+    # The encoded-length bound includes slack, so re-check the decoded size.
+    if len(raw) > _MAX_SUBMITTED_IMAGE_BASE64_LENGTH:
+        raise ValueError("image_base64 is too large (max 10MB)")
+    actual_mime = _detect_submitted_image_mime(raw)
+    if not actual_mime:
+        raise ValueError("only JPEG/PNG images are supported")
+    if expected_mime and actual_mime != expected_mime:
+        raise ValueError("image_base64 MIME does not match image data")
+    return f"data:{actual_mime};base64,{encoded_payload}"
+
+
 def _validated_pomodoro_focus_minutes(
     config: StudyConfig, focus_minutes: Any | None
 ) -> int:
@@ -761,6 +816,25 @@ async def _build_learning_context(
                 self._knowledge_tracker.get_status_summary,
                 limit=5,
             )
+        if bool(self._cfg.llm_vision_enabled):
+            user_image = ""
+            with self._lock:
+                user_image = str(self._state.last_vision_image_base64 or "").strip()
+            if user_image:
+                context["vision_enabled"] = True
+                context["vision_image_base64"] = user_image
+            elif self._ocr_pipeline is not None:
+                vision_snapshot = self._ocr_pipeline.latest_vision_snapshot()
+                if vision_snapshot:
+                    context["vision_enabled"] = True
+                    context["vision_image_base64"] = str(
+                        vision_snapshot.get("vision_image_base64") or ""
+                    )
+                    context["vision_snapshot"] = {
+                        key: value
+                        for key, value in vision_snapshot.items()
+                        if key != "vision_image_base64"
+                    }
         if extra:
             context.update(extra)
         return context
@@ -2692,6 +2766,41 @@ async def study_ocr_snapshot(self, **_):
         await self._persist_state()
         return Ok(payload)
 
+    @plugin_entry(
+        id="study_submit_image",
+        name=tr("entries.submit_image.name", default="Submit Study Image"),
+        description=tr(
+            "entries.submit_image.description",
+            default="Accept a user image and explain it with the configured vision model.",
+        ),
+        input_schema={
+            "type": "object",
+            "properties": {
+                "image_base64": {"type": "string"},
+                "text": {"type": "string", "default": ""},
+            },
+            "required": ["image_base64"],
+        },
+        timeout=60.0,
+        llm_result_fields=["summary", "reply", "diagnostic"],
+    )
+    async def study_submit_image(self, image_base64: str, text: str = "", **_):
+        """Validate a user-submitted image and delegate to ``study_explain_text``.
+
+        Args:
+            image_base64: Bare base64 text or a JPEG/PNG data URL.
+            text: Optional accompanying text. When non-empty after stripping
+                it is stored as ``last_ocr_text`` and used as the text to
+                explain; otherwise a default prompt is used.
+
+        Returns:
+            ``Err(SdkError(...))`` when ``llm_vision_enabled`` is off or the
+            image fails validation; otherwise whatever
+            ``study_explain_text`` returns.
+        """
+        # Check the feature flag before touching the payload: this matches the
+        # order used by study_explain_text and avoids base64-decoding an
+        # upload of up to 10MB only to reject the request afterwards.
+        if not bool(self._cfg.llm_vision_enabled):
+            return Err(SdkError("llm_vision_enabled is not enabled"))
+        try:
+            image_payload = _normalize_submitted_image_payload(image_base64)
+        except ValueError as exc:
+            return Err(SdkError(str(exc)))
+        normalized_text = str(text or "").strip()
+        if normalized_text:
+            with self._lock:
+                self._state.last_ocr_text = normalized_text
+        # Default prompt (Chinese: "please look at the content of this image")
+        # used when the caller supplied no text.
+        source_text = normalized_text or "请查看这张图片的内容"
+        return await self.study_explain_text(
+            text=source_text,
+            vision_image_base64=image_payload,
+        )
+
     @plugin_entry(
         id="study_explain_text",
         name=tr("entries.explain_text.name", default="Explain Study Text"),
@@ -2703,12 +2812,13 @@ async def study_ocr_snapshot(self, **_):
             "type": "object",
             "properties": {
                 "text": {"type": "string", "default": ""},
+                "vision_image_base64": {"type": "string", "default": ""},
             },
         },
         timeout=45.0,
         llm_result_fields=["summary", "reply", "diagnostic"],
     )
-    async def study_explain_text(self, text: str = "", **_):
+    async def study_explain_text(self, text: str = "", vision_image_base64: str = "", **_):
         if self._agent is None:
             return Err(SdkError("study tutor agent is not initialized"))
         raw_text = str(text or "").strip()
@@ -2763,17 +2873,28 @@ async def study_explain_text(self, text: str = "", **_):
                 source_text = self._state.last_ocr_text
             used_ocr_fallback = bool(source_text.strip())
         # Phase 3: explain with the active mode selected above.
+        extra_context: dict[str, Any] = {
+            "source": "ocr_snapshot" if used_ocr_fallback or not raw_text else "manual",
+            "mode": active_mode,
+            "mode_switch": bool(mode_switch.get("changed")),
+            "source_text": source_text,
+        }
+        vision_image_payload = str(vision_image_base64 or "").strip()
+        if vision_image_payload:
+            if not bool(self._cfg.llm_vision_enabled):
+                return Err(SdkError("llm_vision_enabled is not enabled"))
+            try:
+                vision_image_payload = _normalize_submitted_image_payload(
+                    vision_image_payload,
+                )
+            except ValueError as exc:
+                return Err(SdkError(str(exc)))
+            extra_context["vision_enabled"] = True
+            extra_context["vision_image_base64"] = vision_image_payload
         tutor_context = await self._build_learning_context(
             LLM_OPERATION_CONCEPT_EXPLAIN,
             input_text=source_text,
-            extra={
-                "source": "ocr_snapshot"
-                if used_ocr_fallback or not raw_text
-                else "manual",
-                "mode": active_mode,
-                "mode_switch": bool(mode_switch.get("changed")),
-                "source_text": source_text,
-            },
+            extra=extra_context,
         )
         reply = await self._agent.concept_explain(
             source_text,
 
@@ -208,6 +208,8 @@ class StudyConfig:
     rapidocr_model_type: str = "mobile"
     rapidocr_ocr_version: str = "PP-OCRv4"
     llm_call_timeout_seconds: float = 30.0
+    llm_vision_enabled: bool = False
+    llm_vision_max_image_px: int = 768
     fsrs_retention_target: float = 0.90
     fsrs_auto_optimize_interval_days: int = 30
     knowledge_contribution_opt_in: bool = False
@@ -239,6 +241,10 @@ def __post_init__(self) -> None:
         self.llm_call_timeout_seconds = self._clamp_float(
             self.llm_call_timeout_seconds, 1.0, 3600.0, 30.0
         )
+        self.llm_vision_enabled = bool(self.llm_vision_enabled)
+        self.llm_vision_max_image_px = max(
+            64, min(4096, self._coerce_int(self.llm_vision_max_image_px, 768))
+        )
         self.fsrs_retention_target = self._clamp_float(
             self.fsrs_retention_target, 0.1, 0.99, 0.90
         )
@@ -316,6 +322,7 @@ class StudyState:
     last_error: str = ""
     last_started_at: str = ""
     last_ocr_text: str = ""
+    last_vision_image_base64: str = ""
     last_ocr_at: str = ""
     last_screen_classification: dict[str, Any] = field(default_factory=dict)
     recent_screen_classifications: list[dict[str, Any]] = field(default_factory=list)
@@ -333,7 +340,9 @@ class StudyState:
     dependency_status: dict[str, Any] = field(default_factory=dict)
 
     def to_dict(self) -> dict[str, Any]:
-        return asdict(self)
+        # Serialize every field except the submitted image payload.
+        # NOTE(review): presumably omitted so the base64 image is not written
+        # wherever this dict is persisted or reported — confirm with callers.
+        payload = asdict(self)
+        payload.pop("last_vision_image_base64", None)
+        return payload
 
 
 @dataclass(slots=True)
@@ -541,6 +550,16 @@ def _clamp(value: float, minimum: float, maximum: float, default: float) -> floa
             3600.0,
             30.0,
         ),
+        llm_vision_enabled=_bool(
+            llm, "llm_vision_enabled", False, "llm_vision_enabled"
+        ),
+        llm_vision_max_image_px=max(
+            64,
+            min(
+                4096,
+                _int(llm, "llm_vision_max_image_px", 768, "llm_vision_max_image_px"),
+            ),
+        ),
         fsrs_retention_target=_clamp(
             _float(fsrs, "retention_target", 0.90, "fsrs_retention_target"),
             0.1,
 
@@ -57,6 +57,8 @@ enabled = true
 
 [llm]
 llm_call_timeout_seconds = 30
+llm_vision_enabled = false
+llm_vision_max_image_px = 768
 
 [ocr_reader]
 enabled = true