Project-N-E-K-O · MomiJiSan · Jun 2, 2026 · Jun 2, 2026 · Jun 2, 2026 · Jun 2, 2026
diff --git a/plugin/plugins/study_companion/entry_common.py b/plugin/plugins/study_companion/entry_common.py
@@ -220,6 +220,23 @@ def _entry_exception_error(
     return Err(SdkError(str(exc) if message is None else message))
 
 
+def _validate_optional_vision_image_payload(
+    owner: Any,
+    image_base64: str,
+    *,
+    operation: str,
+):  # result: "" (no image), the normalized payload, or an Err for the caller
+    image_payload = str(image_base64 or "").strip()
+    if not image_payload:
+        return ""  # an absent image is valid; the field is optional
+    if not bool(getattr(owner._cfg, "llm_vision_enabled", False)):  # missing flag = off
+        return Err(SdkError("llm_vision_enabled is not enabled"))
+    try:
+        return _normalize_submitted_image_payload(image_payload)
+    except ValueError as exc:  # only ValueError is converted; others propagate
+        return _entry_exception_error(owner, exc, operation=operation)
+
+
 __all__ = [
     "Any",
     "Mapping",
@@ -297,6 +314,7 @@ def _entry_exception_error(
     "_validated_pomodoro_focus_minutes",
     "_detect_mastery_threshold_crossed",
     "_normalize_submitted_image_payload",
+    "_validate_optional_vision_image_payload",
     "_plugin_lock",
     "_entry_exception_error",
     "_event_ratio",

diff --git a/plugin/plugins/study_companion/entry_tutor_answer_entries.py b/plugin/plugins/study_companion/entry_tutor_answer_entries.py
@@ -6,6 +6,7 @@
     Ok,
     SdkError,
     _entry_exception_error,
+    _validate_optional_vision_image_payload,
     plugin_entry,
     tr,
     LLM_OPERATION_ANSWER_EVALUATE,
@@ -26,6 +27,7 @@ class _TutorAnswerEntriesMixin:
                 "answer": {"type": "string", "default": ""},
                 "question": {"type": "string", "default": ""},
                 "expected_answer": {"type": "string", "default": ""},
+                "vision_image_base64": {"type": "string", "default": ""},
             },
         },
         timeout=60.0,
@@ -53,6 +55,13 @@ async def study_evaluate_answer(
         resolved_question = supplied_question or state_question
         if not resolved_question:
             return Err(SdkError("study tutor requires a question to evaluate against"))
+        vision_image_payload = str(kwargs.get("vision_image_base64") or "").strip()
+        validated_vision_image = _validate_optional_vision_image_payload(
+            self, vision_image_payload, operation="study_evaluate_answer"
+        )
+        if isinstance(validated_vision_image, Err):
+            return validated_vision_image
+        vision_image_payload = validated_vision_image
         resolved_expected = supplied_expected
         if not resolved_expected and (
             not supplied_question or supplied_question == state_question
@@ -89,6 +98,14 @@ async def study_evaluate_answer(
                     "run_id": run_id,
                     "session_id": session_id,
                     "mode": active_mode,
+                    **(
+                        {
+                            "vision_enabled": True,
+                            "vision_image_base64": vision_image_payload,
+                        }
+                        if vision_image_payload
+                        else {}
+                    ),
                 },
             )
             reply = await self._agent.answer_evaluate(

diff --git a/plugin/plugins/study_companion/entry_tutor_explain_entries.py b/plugin/plugins/study_companion/entry_tutor_explain_entries.py
@@ -7,6 +7,7 @@
     SdkError,
     _entry_exception_error,
     _normalize_submitted_image_payload,
+    _validate_optional_vision_image_payload,
     _plugin_lock,
     plugin_entry,
     tr,
@@ -17,6 +18,20 @@
 )
 
 
+IMAGE_ONLY_EXPLAIN_PROMPT_EN = "Please explain the pasted image."
+IMAGE_ONLY_EXPLAIN_PROMPT_ZH_CN = "请解释这张图片的内容"  # Simplified Chinese
+IMAGE_ONLY_EXPLAIN_PROMPT_ZH_TW = "請解釋這張圖片的內容"  # Traditional Chinese
+
+
+def _image_only_explain_prompt(language: str) -> str:
+    """Return the image-only explain prompt for *language* (English default)."""
+    tag = str(language or "").strip().lower().replace("_", "-")  # zh_TW -> zh-tw
+    if tag.startswith(("zh-tw", "zh-hk", "zh-hant")):
+        return IMAGE_ONLY_EXPLAIN_PROMPT_ZH_TW
+    is_zh = tag.startswith("zh")  # any remaining Chinese tag gets Simplified
+    return IMAGE_ONLY_EXPLAIN_PROMPT_ZH_CN if is_zh else IMAGE_ONLY_EXPLAIN_PROMPT_EN
+
+
 class _TutorExplainEntriesMixin:
     @plugin_entry(
         id="study_submit_image",
@@ -47,7 +62,9 @@ async def study_submit_image(self, image_base64: str, text: str = "", **_):
         if normalized_text:
             async with _plugin_lock(self._lock):
                 self._state.last_ocr_text = normalized_text
-        source_text = normalized_text or "请查看这张图片的内容"
+        source_text = normalized_text or _image_only_explain_prompt(
+            self._cfg.language
+        )
         return await self.study_explain_text(
             text=source_text,
             vision_image_base64=image_payload,
@@ -121,13 +138,13 @@ async def study_explain_text(
         source_text = str(intent.get("remaining_text") or "").strip()
         if not source_text and intent_kind != "concept_explain":
             source_text = raw_text
+        vision_image_payload = str(vision_image_base64 or "").strip()
         used_ocr_fallback = False
-        if not source_text:
+        if not source_text and not vision_image_payload:
             async with _plugin_lock(self._lock):
                 source_text = self._state.last_ocr_text
             used_ocr_fallback = bool(source_text.strip())
         source_text = source_text.strip()
-        vision_image_payload = str(vision_image_base64 or "").strip()
         if not source_text and not vision_image_payload:
             return Err(
                 SdkError(
@@ -137,20 +154,26 @@ async def study_explain_text(
             )
         # Phase 3: explain with the active mode selected above.
         try:
+            image_only_source = False
+            if vision_image_payload:
+                validated_vision_image = _validate_optional_vision_image_payload(
+                    self, vision_image_payload, operation="study_explain_text"
+                )
+                if isinstance(validated_vision_image, Err):
+                    return validated_vision_image
+                vision_image_payload = validated_vision_image
+                if not source_text:
+                    source_text = _image_only_explain_prompt(self._cfg.language)
+                    image_only_source = True
             extra_context: dict[str, Any] = {
                 "source": "ocr_snapshot"
-                if used_ocr_fallback or not raw_text
-                else "manual",
+                if used_ocr_fallback
+                else ("vision_image" if image_only_source else "manual"),
                 "mode": active_mode,
                 "mode_switch": bool(mode_switch.get("changed")),
                 "source_text": source_text,
             }
             if vision_image_payload:
-                if not bool(self._cfg.llm_vision_enabled):
-                    return Err(SdkError("llm_vision_enabled is not enabled"))
-                vision_image_payload = _normalize_submitted_image_payload(
-                    vision_image_payload,
-                )
                 extra_context["vision_enabled"] = True
                 extra_context["vision_image_base64"] = vision_image_payload
             tutor_context = await self._build_learning_context(

diff --git a/plugin/plugins/study_companion/entry_tutor_question_entries.py b/plugin/plugins/study_companion/entry_tutor_question_entries.py
@@ -4,12 +4,28 @@
     Err,
     Ok,
     SdkError,
+    _entry_exception_error,
+    _validate_optional_vision_image_payload,
     plugin_entry,
     tr,
     LLM_OPERATION_QUESTION_GENERATE,
 )
 
 
+IMAGE_ONLY_QUESTION_PROMPT_EN = "Generate a study question from the pasted image."
+IMAGE_ONLY_QUESTION_PROMPT_ZH_CN = "请根据这张图片生成一道学习题。"  # Simplified Chinese
+IMAGE_ONLY_QUESTION_PROMPT_ZH_TW = "請根據這張圖片生成一道學習題。"  # Traditional Chinese
+
+
+def _image_only_question_prompt(language: str) -> str:
+    """Return the image-only question prompt for *language* (English default)."""
+    tag = str(language or "").strip().lower().replace("_", "-")  # zh_TW -> zh-tw
+    if tag.startswith(("zh-tw", "zh-hk", "zh-hant")):
+        return IMAGE_ONLY_QUESTION_PROMPT_ZH_TW
+    is_zh = tag.startswith("zh")  # any remaining Chinese tag gets Simplified
+    return IMAGE_ONLY_QUESTION_PROMPT_ZH_CN if is_zh else IMAGE_ONLY_QUESTION_PROMPT_EN
+
+
 class _TutorQuestionEntriesMixin:
     @plugin_entry(
         id="study_generate_question",
@@ -23,6 +39,7 @@ class _TutorQuestionEntriesMixin:
             "properties": {
                 "text": {"type": "string", "default": ""},
                 "topic": {"type": "string", "default": ""},
+                "vision_image_base64": {"type": "string", "default": ""},
             },
         },
         timeout=60.0,
@@ -35,52 +52,84 @@ class _TutorQuestionEntriesMixin:
             "topic",
         ],
     )
-    async def study_generate_question(self, text: str = "", topic: str = "", **_):
+    async def study_generate_question(
+        self,
+        text: str = "",
+        topic: str = "",
+        vision_image_base64: str = "",
+        **_,  # any other entry kwargs are accepted and ignored
+    ):  # returns Ok(payload) on success, otherwise an Err
         if self._agent is None:
             return Err(SdkError("study tutor agent is not initialized"))
         source_text = str(text or "").strip()
+        vision_image_payload = str(vision_image_base64 or "").strip()
         used_ocr_fallback = False
-        if not source_text:
+        if not source_text and not vision_image_payload:  # neither given: try OCR
             async with self._lock:
                 source_text = self._state.last_ocr_text
             used_ocr_fallback = bool(source_text.strip())
         source_text = source_text.strip()
-        if not source_text:
+        if not source_text and not vision_image_payload:  # nothing to work from
             return Err(
                 SdkError(
-                    "study tutor requires text or a non-empty OCR snapshot",
+                    "study tutor requires text, an image, or a non-empty OCR snapshot",
                     code="MISSING_TEXT",
                 )
             )
-        async with self._lock:
-            active_mode = self._state.active_mode
-        tutor_context = await self._build_learning_context(
-            LLM_OPERATION_QUESTION_GENERATE,
-            input_text=source_text,
-            extra={
-                "source": "ocr_snapshot" if used_ocr_fallback or not text else "manual",
-                "source_text": source_text,
-                "topic_hint": str(topic or "").strip(),
-                "mode": active_mode,
-            },
-        )
-        reply = await self._agent.question_generate(
-            source_text, mode=active_mode, context=tutor_context
-        )
-        payload = await self._finalize_tutor_call(
-            LLM_OPERATION_QUESTION_GENERATE,
-            reply,
-            history_kind=LLM_OPERATION_QUESTION_GENERATE,
-            metadata={
-                "degraded": reply.degraded,
-                "diagnostic": reply.diagnostic,
-                "payload": reply.payload,
-                "screen_classification": tutor_context.get("screen_classification")
-                or {},
-            },
-            extra_context=tutor_context,
+        validated_vision_image = _validate_optional_vision_image_payload(
+            self, vision_image_payload, operation="study_generate_question"
         )
-        payload["screen_classification"] = (
-            tutor_context.get("screen_classification") or {}
-        )
-        return Ok(payload)
+        if isinstance(validated_vision_image, Err):  # validation failed: pass it on
+            return validated_vision_image
+        vision_image_payload = validated_vision_image  # stays "" when no image
+        try:
+            image_only_source = False
+            if not source_text and vision_image_payload:  # image-only request
+                source_text = _image_only_question_prompt(self._cfg.language)
+                image_only_source = True
+            async with self._lock:  # read the active mode under the state lock
+                active_mode = self._state.active_mode
+            tutor_context = await self._build_learning_context(
+                LLM_OPERATION_QUESTION_GENERATE,
+                input_text=source_text,
+                extra={
+                    "source": "ocr_snapshot"
+                    if used_ocr_fallback
+                    else ("vision_image" if image_only_source else "manual"),
+                    "source_text": source_text,
+                    "topic_hint": str(topic or "").strip(),
+                    "mode": active_mode,
+                    **(  # vision keys are added only when an image is attached
+                        {
+                            "vision_enabled": True,
+                            "vision_image_base64": vision_image_payload,
+                        }
+                        if vision_image_payload
+                        else {}
+                    ),
+                },
+            )
+            reply = await self._agent.question_generate(
+                source_text, mode=active_mode, context=tutor_context
+            )
+            payload = await self._finalize_tutor_call(
+                LLM_OPERATION_QUESTION_GENERATE,
+                reply,
+                history_kind=LLM_OPERATION_QUESTION_GENERATE,
+                metadata={
+                    "degraded": reply.degraded,
+                    "diagnostic": reply.diagnostic,
+                    "payload": reply.payload,
+                    "screen_classification": tutor_context.get("screen_classification")
+                    or {},
+                },
+                extra_context=tutor_context,
+            )
+            payload["screen_classification"] = (
+                tutor_context.get("screen_classification") or {}
+            )
+            return Ok(payload)
+        except Exception as exc:  # return an error result instead of raising
+            return _entry_exception_error(
+                self, exc, operation="study_generate_question"
+            )
diff --git a/plugin/plugins/study_companion/static/style.css b/plugin/plugins/study_companion/static/style.css
@@ -237,6 +237,66 @@ textarea {
   overflow-wrap: anywhere;
 }
 
+.study-panel__image-preview {
+  position: relative; /* NOTE(review): presumably the anchor for .study-panel__image-remove — confirm in markup */
+  display: inline-flex;
+  max-width: 220px;
+  margin: 8px 0 0;
+  overflow: hidden; /* clip contents to the rounded border box */
+  border: 1px solid #c8d0dc;
+  border-radius: 6px;
+  background: #ffffff;
+}
+
+.study-panel__image-preview img {
+  display: block;
+  max-width: 100%;
+  max-height: 160px;
+  object-fit: contain; /* fit inside the box, keeping the aspect ratio */
+}
+
+.study-panel__image-remove {
+  position: absolute; /* placed 3px in from the top-right of its positioned ancestor */
+  top: 3px;
+  right: 3px;
+  width: 24px;
+  min-height: 24px; /* NOTE(review): presumably overrides a shared button min-height — confirm */
+  height: 24px;
+  padding: 0;
+  border: 1px solid rgba(255, 255, 255, 0.3);
+  border-radius: 50%; /* equal width and height make this a circle */
+  background: rgba(0, 0, 0, 0.65);
+  color: #ffffff;
+  font-size: 15px;
+  line-height: 22px; /* NOTE(review): 24px minus two 1px borders if box-sizing is border-box — confirm */
+  text-align: center;
+}
+
+.study-panel__image-remove:hover {
+  background: rgba(190, 42, 42, 0.86); /* red tint on hover */
+}
+
+.study-panel__image-remove:focus-visible {
+  outline: 2px solid #2e5bff; /* focus ring, shown only for :focus-visible */
+  outline-offset: 2px;
+}
+
+.study-panel__paste-error {
+  margin: 8px 0 0;
+  color: #b42318; /* red error text */
+  font-size: 13px;
+  line-height: 1.45;
+}
+
+.study-panel[data-busy="true"] .study-panel__image-preview {
+  opacity: 0.55; /* dim the preview while the panel has data-busy="true" */
+}
+
+.study-panel[data-busy="true"] .study-panel__image-remove {
+  pointer-events: none; /* pointer input only; keyboard activation is not blocked by this rule */
+  opacity: 0.4;
+}
+
 .reply-panel {
   border-top: 1px solid #d8dde6;
   padding-top: 18px;