feat(shopping): live voice-triggered frame description

28nitin07 · claude · 28nitin07 · commit fed62dcb11dc · 2026-05-25T11:00:37.000+05:30
Say 'describe this' while in shopping mode to hear colour, pattern,
and fabric details without capturing. Uses a shopping-specific Gemini
prompt focused on tactile details for visually impaired users.

- /describe-frame: new mode=shopping param with garment-focused prompt
- CameraCapture: exposes describe() via captureRef, accepts describeMode
- ShoppingScreen: wires voice command, announces hint on load
- commandParser: 'describe this / what is this / what do I see' → describe_frame
- VoiceContext: describe_frame intent → DESCRIBE_FRAME voiceCommand event

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
diff --git a/backend/app/api/routes.py b/backend/app/api/routes.py
@@ -405,7 +405,11 @@ async def context_chat(
 # ---------------------------------------------------------------------------
 
 @router.post("/describe-frame")
-async def describe_frame(image: UploadFile = File(...), language: Optional[str] = Form("en")):
+async def describe_frame(
+    image: UploadFile = File(...),
+    language: Optional[str] = Form("en"),
+    mode: Optional[str] = Form("general"),
+):
     import io
     from PIL import Image as PILImage
 
@@ -415,11 +419,22 @@ async def describe_frame(image: UploadFile = File(...), language: Optional[str]
     img.save(buf, format="JPEG", quality=75)
 
     lang_name = LANGUAGE_NAMES.get(language or "en", "English")
-    prompt = (
-        f"Describe what you see in this image in 1-2 short sentences in {lang_name}. "
-        "Focus on what's most prominent — the main subject, its position in frame, and lighting. "
-        "Write for text-to-speech. No markdown."
-    )
+
+    if (mode or "general") == "shopping":
+        prompt = (
+            f"Describe the clothing item in this image in 2-3 short sentences in {lang_name}. "
+            "Focus on: exact colour name, pattern (e.g. solid, plaid, floral, striped, graphic print), "
+            "fabric feel (cotton, linen, denim, silk-like, etc.), and fit/cut (oversized, slim, cropped, etc.). "
+            "Be concrete and tactile — as if describing it to someone who cannot see it. "
+            "Write for text-to-speech. No markdown."
+        )
+    else:
+        prompt = (
+            f"Describe what you see in this image in 1-2 short sentences in {lang_name}. "
+            "Focus on what's most prominent — the main subject, its position in frame, and lighting. "
+            "Write for text-to-speech. No markdown."
+        )
+
     try:
         response = _gemini().models.generate_content(
             model=GEMINI_MODEL,
diff --git a/frontend/src/components/CameraCapture.jsx b/frontend/src/components/CameraCapture.jsx
@@ -2,7 +2,7 @@ import { useRef, useEffect, useState, useCallback } from 'react'
 import { COLORS } from '../utils/constants.js'
 import { describeFrame } from '../services/api.js'
 
-export function CameraCapture({ onCapture, captureRef, onFrameDescribed, facingMode: initialFacing = 'environment', aspectRatio = '3/4', language = 'en' }) {
+export function CameraCapture({ onCapture, captureRef, onFrameDescribed, describeMode = 'general', facingMode: initialFacing = 'environment', aspectRatio = '3/4', language = 'en' }) {
   const videoRef = useRef(null)
   const canvasRef = useRef(null)
   const streamRef = useRef(null)
@@ -125,7 +125,7 @@ export function CameraCapture({ onCapture, captureRef, onFrameDescribed, facingM
       setDescribing(true)
       onFrameDescribed?.('Describing what I see...')
       try {
-        const data = await describeFrame(blob, language)
+        const data = await describeFrame(blob, language, describeMode)
         onFrameDescribed?.(data.description || 'Nothing clear in frame.')
       } catch {
         onFrameDescribed?.('Could not describe the frame right now.')
@@ -162,8 +162,8 @@ export function CameraCapture({ onCapture, captureRef, onFrameDescribed, facingM
   }, [onCapture])
 
   useEffect(() => {
-    if (captureRef) captureRef.current = capture
-  }, [captureRef, capture])
+    if (captureRef) captureRef.current = { capture, describe: describeCurrentFrame }
+  }, [captureRef, capture, describeCurrentFrame])
 
   useEffect(() => {
     startCamera()
diff --git a/frontend/src/contexts/VoiceContext.jsx b/frontend/src/contexts/VoiceContext.jsx
@@ -163,6 +163,9 @@ export function VoiceProvider({ children }) {
         speak(helps[current.screen] ?? helps.HOME)
         break
       }
+      case 'describe_frame':
+        window.dispatchEvent(new CustomEvent('voiceCommand', { detail: { type: 'DESCRIBE_FRAME' } }))
+        break
       case 'read_wardrobe':
         window.dispatchEvent(new CustomEvent('voiceCommand', { detail: { type: 'READ_WARDROBE' } }))
         break
diff --git a/frontend/src/screens/ShoppingScreen.jsx b/frontend/src/screens/ShoppingScreen.jsx
@@ -8,7 +8,7 @@ import { shoppingAnalyze } from '../services/api.js'
 import { COLORS } from '../utils/constants.js'
 
 export function ShoppingScreen() {
-  const { speak } = useVoice()
+  const { speak, language } = useVoice()
   const { items: wardrobeItems } = useWardrobe()
   const { goBack } = useApp()
   const { profileContext } = useProfile()
@@ -18,17 +18,31 @@ export function ShoppingScreen() {
   const [errorMsg, setErrorMsg] = useState('')
   const [capturedUrl, setCapturedUrl] = useState(null)
   const analyzingRef = useRef(false)
+  const cameraRef = useRef(null)
 
   const topsInWardrobe    = wardrobeItems.filter((i) => i.category === 'tops')
   const bottomsInWardrobe = wardrobeItems.filter((i) => i.category === 'bottoms')
   const emptyWardrobe     = wardrobeItems.length === 0
 
   useEffect(() => {
-    speak(emptyWardrobe
-      ? 'Shopping mode. Tap the camera button to scan a clothing item for style advice.'
-      : `Shopping mode. Tap the camera button to scan a top or bottom. I will check it against your ${wardrobeItems.length} wardrobe items.`)
+    const wardrobeNote = emptyWardrobe
+      ? 'Tap the camera button to scan a clothing item for style advice.'
+      : `Tap the camera button to scan a top or bottom. I will check it against your ${wardrobeItems.length} wardrobe items.`
+    speak(`Shopping mode. ${wardrobeNote} Say "describe this" at any time to hear colour, pattern, and fabric details.`)
   }, []) // eslint-disable-line react-hooks/exhaustive-deps
 
+  // ── Voice: "describe this" triggers a live frame description ──────────────
+  useEffect(() => {
+    const handler = (e) => {
+      const cmd = e.detail
+      if (cmd.type === 'DESCRIBE_FRAME' && phase === 'camera') {
+        cameraRef.current?.describe?.()
+      }
+    }
+    window.addEventListener('voiceCommand', handler)
+    return () => window.removeEventListener('voiceCommand', handler)
+  }, [phase])
+
   const handleCapture = useCallback(async (blob, dataUrl) => {
     if (analyzingRef.current) return
     analyzingRef.current = true
@@ -175,8 +189,20 @@ export function ShoppingScreen() {
       )}
 
       <div style={{ flex: 1, position: 'relative', minHeight: 0 }}>
-        <CameraCapture onCapture={handleCapture} aspectRatio="unset" />
-        <div style={{ position: 'absolute', bottom: 16, left: 14, right: 14 }}>
+        <CameraCapture
+          onCapture={handleCapture}
+          captureRef={cameraRef}
+          onFrameDescribed={(t) => speak(t)}
+          describeMode="shopping"
+          language={language}
+          aspectRatio="unset"
+        />
+        <div style={{ position: 'absolute', bottom: 16, left: 14, right: 14, display: 'flex', flexDirection: 'column', gap: 6 }}>
+          <div style={{ background: 'rgba(10,10,8,0.85)', border: `2px solid ${COLORS.BORDER}`, borderRadius: COLORS.RADIUS, padding: '6px 14px', textAlign: 'center' }}>
+            <p style={{ fontSize: 11, fontWeight: 700, color: COLORS.ACCENT, margin: 0, letterSpacing: 0.5 }}>
+              Say "describe this" — hear colour, pattern &amp; fabric
+            </p>
+          </div>
           <div style={{ background: COLORS.BG, border: `2px solid ${COLORS.BORDER}`, borderRadius: COLORS.RADIUS, padding: '8px 16px', textAlign: 'center' }}>
             <p style={{ fontSize: 12, fontWeight: 700, color: COLORS.TEXT_MUTED, margin: 0 }}>
               {emptyWardrobe
diff --git a/frontend/src/services/api.js b/frontend/src/services/api.js
@@ -69,10 +69,11 @@ export async function contextChat({ question, feature = 'scan', resultContext =
   return post('/context-chat', fd)
 }
 
-export async function describeFrame(imageBlob, language = 'en') {
+export async function describeFrame(imageBlob, language = 'en', mode = 'general') {
   const fd = new FormData()
   fd.append('image', imageBlob, 'frame.jpg')
   fd.append('language', language)
+  fd.append('mode', mode)
   return post('/describe-frame', fd, TIMEOUTS.analyze)
 }
 
diff --git a/frontend/src/voice/commandParser.js b/frontend/src/voice/commandParser.js
@@ -25,6 +25,9 @@ export function parseCommand(transcript) {
   if (/\b(save|add to wardrobe|keep this)\b/.test(t))                      return { intent: 'save_item' }
   if (/\b(capture|take|shoot)\b/.test(t))                                  return { intent: 'capture' }
 
+  // ── Live frame description ─────────────────────────────────────────────────
+  if (/\b(describe (this|what (i see|you see|is here))|what (is this|am i holding|do i see)|what('s| is) (in front of me|this))\b/.test(t)) return { intent: 'describe_frame' }
+
   // ── Help ───────────────────────────────────────────────────────────────────
   if (/\b(help|what can i (say|do)|commands|options)\b/.test(t))           return { intent: 'help' }
   if (/\b(where am i|what screen|current screen)\b/.test(t))               return { intent: 'describe_screen' }

Original file line number	Diff line number	Diff line change
`@@ -69,10 +69,11 @@ export async function contextChat({ question, feature = 'scan', resultContext =`
`69`	`69`	`return post('/context-chat', fd)`
`70`	`70`	`}`
`71`	`71`
`72`		`-export async function describeFrame(imageBlob, language = 'en') {`
	`72`	`+export async function describeFrame(imageBlob, language = 'en', mode = 'general') {`
`73`	`73`	`const fd = new FormData()`
`74`	`74`	`fd.append('image', imageBlob, 'frame.jpg')`
`75`	`75`	`fd.append('language', language)`
	`76`	`+ fd.append('mode', mode)`
`76`	`77`	`return post('/describe-frame', fd, TIMEOUTS.analyze)`
`77`	`78`	`}`
`78`	`79`