Skip to content
18 changes: 18 additions & 0 deletions plugin/plugins/study_companion/entry_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,23 @@ def _entry_exception_error(
return Err(SdkError(str(exc) if message is None else message))


def _validate_optional_vision_image_payload(
owner: Any,
image_base64: str,
*,
operation: str,
):
image_payload = str(image_base64 or "").strip()
if not image_payload:
return ""
if not bool(getattr(owner._cfg, "llm_vision_enabled", False)):
return Err(SdkError("llm_vision_enabled is not enabled"))
try:
return _normalize_submitted_image_payload(image_payload)
except ValueError as exc:
return _entry_exception_error(owner, exc, operation=operation)


__all__ = [
"Any",
"Mapping",
Expand Down Expand Up @@ -297,6 +314,7 @@ def _entry_exception_error(
"_validated_pomodoro_focus_minutes",
"_detect_mastery_threshold_crossed",
"_normalize_submitted_image_payload",
"_validate_optional_vision_image_payload",
"_plugin_lock",
"_entry_exception_error",
"_event_ratio",
Expand Down
17 changes: 17 additions & 0 deletions plugin/plugins/study_companion/entry_tutor_answer_entries.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
Ok,
SdkError,
_entry_exception_error,
_validate_optional_vision_image_payload,
plugin_entry,
tr,
LLM_OPERATION_ANSWER_EVALUATE,
Expand All @@ -26,6 +27,7 @@ class _TutorAnswerEntriesMixin:
"answer": {"type": "string", "default": ""},
"question": {"type": "string", "default": ""},
"expected_answer": {"type": "string", "default": ""},
"vision_image_base64": {"type": "string", "default": ""},
},
},
timeout=60.0,
Expand Down Expand Up @@ -53,6 +55,13 @@ async def study_evaluate_answer(
resolved_question = supplied_question or state_question
if not resolved_question:
return Err(SdkError("study tutor requires a question to evaluate against"))
vision_image_payload = str(kwargs.get("vision_image_base64") or "").strip()
validated_vision_image = _validate_optional_vision_image_payload(
self, vision_image_payload, operation="study_evaluate_answer"
)
if isinstance(validated_vision_image, Err):
return validated_vision_image
vision_image_payload = validated_vision_image
resolved_expected = supplied_expected
if not resolved_expected and (
not supplied_question or supplied_question == state_question
Expand Down Expand Up @@ -89,6 +98,14 @@ async def study_evaluate_answer(
"run_id": run_id,
"session_id": session_id,
"mode": active_mode,
**(
{
"vision_enabled": True,
"vision_image_base64": vision_image_payload,
}
if vision_image_payload
else {}
),
},
)
reply = await self._agent.answer_evaluate(
Expand Down
43 changes: 33 additions & 10 deletions plugin/plugins/study_companion/entry_tutor_explain_entries.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
SdkError,
_entry_exception_error,
_normalize_submitted_image_payload,
_validate_optional_vision_image_payload,
_plugin_lock,
plugin_entry,
tr,
Expand All @@ -17,6 +18,20 @@
)


IMAGE_ONLY_EXPLAIN_PROMPT_EN = "Please explain the pasted image."
IMAGE_ONLY_EXPLAIN_PROMPT_ZH_CN = "请查看这张图片的内容"
IMAGE_ONLY_EXPLAIN_PROMPT_ZH_TW = "請查看這張圖片的內容"


def _image_only_explain_prompt(language: str) -> str:
normalized = str(language or "").strip().lower()
if normalized.startswith(("zh-tw", "zh-hk", "zh-hant")):
return IMAGE_ONLY_EXPLAIN_PROMPT_ZH_TW
if normalized.startswith("zh"):
return IMAGE_ONLY_EXPLAIN_PROMPT_ZH_CN
return IMAGE_ONLY_EXPLAIN_PROMPT_EN


class _TutorExplainEntriesMixin:
@plugin_entry(
id="study_submit_image",
Expand Down Expand Up @@ -47,7 +62,9 @@ async def study_submit_image(self, image_base64: str, text: str = "", **_):
if normalized_text:
async with _plugin_lock(self._lock):
self._state.last_ocr_text = normalized_text
source_text = normalized_text or "请查看这张图片的内容"
source_text = normalized_text or _image_only_explain_prompt(
self._cfg.language
)
return await self.study_explain_text(
text=source_text,
vision_image_base64=image_payload,
Expand Down Expand Up @@ -121,13 +138,13 @@ async def study_explain_text(
source_text = str(intent.get("remaining_text") or "").strip()
if not source_text and intent_kind != "concept_explain":
source_text = raw_text
vision_image_payload = str(vision_image_base64 or "").strip()
used_ocr_fallback = False
if not source_text:
if not source_text and not vision_image_payload:
async with _plugin_lock(self._lock):
source_text = self._state.last_ocr_text
used_ocr_fallback = bool(source_text.strip())
source_text = source_text.strip()
vision_image_payload = str(vision_image_base64 or "").strip()
if not source_text and not vision_image_payload:
return Err(
SdkError(
Expand All @@ -137,20 +154,26 @@ async def study_explain_text(
)
# Phase 3: explain with the active mode selected above.
try:
image_only_source = False
if vision_image_payload:
validated_vision_image = _validate_optional_vision_image_payload(
self, vision_image_payload, operation="study_explain_text"
)
if isinstance(validated_vision_image, Err):
return validated_vision_image
vision_image_payload = validated_vision_image
if not source_text:
source_text = _image_only_explain_prompt(self._cfg.language)
image_only_source = True
Comment on lines +165 to +167
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Prefer pasted image over stale OCR for image-only explanations

For an image-only Explain action with a previous OCR snapshot in state, source_text has already been filled from last_ocr_text before this check, so the image-only prompt is never selected and the context source remains ocr_snapshot. This makes a pasted image get explained with stale OCR text from a prior screen; the OCR fallback should only run when no pasted vision image was provided.

Useful? React with 👍 / 👎.

extra_context: dict[str, Any] = {
"source": "ocr_snapshot"
if used_ocr_fallback or not raw_text
else "manual",
if used_ocr_fallback
else ("vision_image" if image_only_source else "manual"),
"mode": active_mode,
"mode_switch": bool(mode_switch.get("changed")),
"source_text": source_text,
}
if vision_image_payload:
if not bool(self._cfg.llm_vision_enabled):
return Err(SdkError("llm_vision_enabled is not enabled"))
vision_image_payload = _normalize_submitted_image_payload(
vision_image_payload,
)
extra_context["vision_enabled"] = True
extra_context["vision_image_base64"] = vision_image_payload
tutor_context = await self._build_learning_context(
Expand Down
119 changes: 84 additions & 35 deletions plugin/plugins/study_companion/entry_tutor_question_entries.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,28 @@
Err,
Ok,
SdkError,
_entry_exception_error,
_validate_optional_vision_image_payload,
plugin_entry,
tr,
LLM_OPERATION_QUESTION_GENERATE,
)


IMAGE_ONLY_QUESTION_PROMPT_EN = "Generate a study question from the pasted image."
IMAGE_ONLY_QUESTION_PROMPT_ZH_CN = "请根据这张图片生成一道学习题。"
IMAGE_ONLY_QUESTION_PROMPT_ZH_TW = "請根據這張圖片生成一道學習題。"


def _image_only_question_prompt(language: str) -> str:
normalized = str(language or "").strip().lower()
if normalized.startswith(("zh-tw", "zh-hk", "zh-hant")):
return IMAGE_ONLY_QUESTION_PROMPT_ZH_TW
if normalized.startswith("zh"):
return IMAGE_ONLY_QUESTION_PROMPT_ZH_CN
return IMAGE_ONLY_QUESTION_PROMPT_EN


class _TutorQuestionEntriesMixin:
@plugin_entry(
id="study_generate_question",
Expand All @@ -23,6 +39,7 @@ class _TutorQuestionEntriesMixin:
"properties": {
"text": {"type": "string", "default": ""},
"topic": {"type": "string", "default": ""},
"vision_image_base64": {"type": "string", "default": ""},
},
},
timeout=60.0,
Expand All @@ -35,52 +52,84 @@ class _TutorQuestionEntriesMixin:
"topic",
],
)
async def study_generate_question(self, text: str = "", topic: str = "", **_):
async def study_generate_question(
self,
text: str = "",
topic: str = "",
vision_image_base64: str = "",
**_,
):
if self._agent is None:
return Err(SdkError("study tutor agent is not initialized"))
source_text = str(text or "").strip()
vision_image_payload = str(vision_image_base64 or "").strip()
used_ocr_fallback = False
if not source_text:
if not source_text and not vision_image_payload:
async with self._lock:
source_text = self._state.last_ocr_text
used_ocr_fallback = bool(source_text.strip())
source_text = source_text.strip()
if not source_text:
if not source_text and not vision_image_payload:
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Invoke vision for image-only question generation

This now accepts vision_image_base64 without any text/OCR, but in that scenario source_text remains empty and the later call to self._agent.question_generate(source_text, ...) short-circuits in tutor_llm_agent_question_generate.question_generate to the empty_input fallback before _invoke_structured_operation can attach the image. Users pasting only a diagram/photo and clicking Generate Question therefore get the generic empty-input fallback instead of a vision-generated question.

Useful? React with 👍 / 👎.

return Err(
SdkError(
"study tutor requires text or a non-empty OCR snapshot",
"study tutor requires text, an image, or a non-empty OCR snapshot",
code="MISSING_TEXT",
)
)
async with self._lock:
active_mode = self._state.active_mode
tutor_context = await self._build_learning_context(
LLM_OPERATION_QUESTION_GENERATE,
input_text=source_text,
extra={
"source": "ocr_snapshot" if used_ocr_fallback or not text else "manual",
"source_text": source_text,
"topic_hint": str(topic or "").strip(),
"mode": active_mode,
},
)
reply = await self._agent.question_generate(
source_text, mode=active_mode, context=tutor_context
)
payload = await self._finalize_tutor_call(
LLM_OPERATION_QUESTION_GENERATE,
reply,
history_kind=LLM_OPERATION_QUESTION_GENERATE,
metadata={
"degraded": reply.degraded,
"diagnostic": reply.diagnostic,
"payload": reply.payload,
"screen_classification": tutor_context.get("screen_classification")
or {},
},
extra_context=tutor_context,
validated_vision_image = _validate_optional_vision_image_payload(
self, vision_image_payload, operation="study_generate_question"
)
payload["screen_classification"] = (
tutor_context.get("screen_classification") or {}
)
return Ok(payload)
if isinstance(validated_vision_image, Err):
return validated_vision_image
vision_image_payload = validated_vision_image
try:
image_only_source = False
if not source_text and vision_image_payload:
source_text = _image_only_question_prompt(self._cfg.language)
image_only_source = True
Comment on lines +87 to +89
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Prefer pasted image over stale OCR for question generation

When the user pastes only an image while last_ocr_text is still populated from an earlier snapshot, the earlier OCR fallback fills source_text before this image-only branch runs, so the branch is skipped and the request is labeled ocr_snapshot rather than vision_image. In that scenario, Generate Question sends stale OCR text alongside the new image instead of using the image-only prompt, which can produce questions about the previous screen; skip the OCR fallback whenever vision_image_base64 is supplied without explicit text.

Useful? React with 👍 / 👎.

async with self._lock:
active_mode = self._state.active_mode
tutor_context = await self._build_learning_context(
LLM_OPERATION_QUESTION_GENERATE,
input_text=source_text,
extra={
"source": "ocr_snapshot"
if used_ocr_fallback
else ("vision_image" if image_only_source else "manual"),
"source_text": source_text,
"topic_hint": str(topic or "").strip(),
"mode": active_mode,
**(
{
"vision_enabled": True,
"vision_image_base64": vision_image_payload,
}
if vision_image_payload
else {}
),
},
)
reply = await self._agent.question_generate(
source_text, mode=active_mode, context=tutor_context
)
payload = await self._finalize_tutor_call(
LLM_OPERATION_QUESTION_GENERATE,
reply,
history_kind=LLM_OPERATION_QUESTION_GENERATE,
metadata={
"degraded": reply.degraded,
"diagnostic": reply.diagnostic,
"payload": reply.payload,
"screen_classification": tutor_context.get("screen_classification")
or {},
},
extra_context=tutor_context,
)
payload["screen_classification"] = (
tutor_context.get("screen_classification") or {}
)
return Ok(payload)
except Exception as exc:
return _entry_exception_error(
self, exc, operation="study_generate_question"
)
60 changes: 60 additions & 0 deletions plugin/plugins/study_companion/static/style.css
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,66 @@ textarea {
overflow-wrap: anywhere;
}

/* Bordered frame around the image preview. `position: relative` makes it the
   containing block for absolutely positioned descendants; `overflow: hidden`
   clips content to the rounded border. */
.study-panel__image-preview {
  display: inline-flex;
  position: relative;
  overflow: hidden;
  max-width: 220px;
  margin: 8px 0 0;
  background: #fff;
  border: 1px solid #c8d0dc;
  border-radius: 6px;
}

/* Fit the image inside the frame without distorting its aspect ratio. */
.study-panel__image-preview img {
  display: block;
  object-fit: contain;
  max-height: 160px;
  max-width: 100%;
}

/* Round 24px button placed 3px from the top-right corner of its containing
   block. */
.study-panel__image-remove {
  position: absolute;
  right: 3px;
  top: 3px;
  height: 24px;
  min-height: 24px;
  width: 24px;
  padding: 0;
  color: #fff;
  background: rgba(0, 0, 0, 0.65);
  border: 1px solid rgba(255, 255, 255, 0.3);
  border-radius: 50%;
  font-size: 15px;
  line-height: 22px;
  text-align: center;
}

/* Red tint on hover. */
.study-panel__image-remove:hover {
  background: rgba(190, 42, 42, 0.86);
}

/* Focus ring shown only when :focus-visible matches. */
.study-panel__image-remove:focus-visible {
  outline: 2px solid #2e5bff;
  outline-offset: 2px;
}

/* Paste error text: red, small, with 8px of space above. */
.study-panel__paste-error {
  margin: 8px 0 0;
  font-size: 13px;
  line-height: 1.45;
  color: #b42318;
}

/* While the panel carries data-busy="true", dim the image preview. */
.study-panel[data-busy="true"] .study-panel__image-preview {
  opacity: 0.55;
}

/* While busy, fade the remove button and stop it receiving pointer events. */
.study-panel[data-busy="true"] .study-panel__image-remove {
  opacity: 0.4;
  pointer-events: none;
}

.reply-panel {
border-top: 1px solid #d8dde6;
padding-top: 18px;
Expand Down
Loading
Loading