Skip to content

Commit fed62dc

Browse files
28nitin07 and claude committed
feat(shopping): live voice-triggered frame description
Say 'describe this' while in shopping mode to hear colour, pattern, and fabric details without capturing. Uses a shopping-specific Gemini prompt focused on tactile details for visually impaired users.

- /describe-frame: new mode=shopping param with garment-focused prompt
- CameraCapture: exposes describe() via captureRef, accepts describeMode
- ShoppingScreen: wires voice command, announces hint on load
- commandParser: 'describe this / what is this / what do I see' → describe_frame
- VoiceContext: describe_frame intent → DESCRIBE_FRAME voiceCommand event

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent d8f3c6f commit fed62dc

6 files changed

Lines changed: 65 additions & 17 deletions

File tree

backend/app/api/routes.py

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -405,7 +405,11 @@ async def context_chat(
405405
# ---------------------------------------------------------------------------
406406

407407
@router.post("/describe-frame")
408-
async def describe_frame(image: UploadFile = File(...), language: Optional[str] = Form("en")):
408+
async def describe_frame(
409+
image: UploadFile = File(...),
410+
language: Optional[str] = Form("en"),
411+
mode: Optional[str] = Form("general"),
412+
):
409413
import io
410414
from PIL import Image as PILImage
411415

@@ -415,11 +419,22 @@ async def describe_frame(image: UploadFile = File(...), language: Optional[str]
415419
img.save(buf, format="JPEG", quality=75)
416420

417421
lang_name = LANGUAGE_NAMES.get(language or "en", "English")
418-
prompt = (
419-
f"Describe what you see in this image in 1-2 short sentences in {lang_name}. "
420-
"Focus on what's most prominent — the main subject, its position in frame, and lighting. "
421-
"Write for text-to-speech. No markdown."
422-
)
422+
423+
if (mode or "general") == "shopping":
424+
prompt = (
425+
f"Describe the clothing item in this image in 2-3 short sentences in {lang_name}. "
426+
"Focus on: exact colour name, pattern (e.g. solid, plaid, floral, striped, graphic print), "
427+
"fabric feel (cotton, linen, denim, silk-like, etc.), and fit/cut (oversized, slim, cropped, etc.). "
428+
"Be concrete and tactile — as if describing it to someone who cannot see it. "
429+
"Write for text-to-speech. No markdown."
430+
)
431+
else:
432+
prompt = (
433+
f"Describe what you see in this image in 1-2 short sentences in {lang_name}. "
434+
"Focus on what's most prominent — the main subject, its position in frame, and lighting. "
435+
"Write for text-to-speech. No markdown."
436+
)
437+
423438
try:
424439
response = _gemini().models.generate_content(
425440
model=GEMINI_MODEL,

frontend/src/components/CameraCapture.jsx

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ import { useRef, useEffect, useState, useCallback } from 'react'
22
import { COLORS } from '../utils/constants.js'
33
import { describeFrame } from '../services/api.js'
44

5-
export function CameraCapture({ onCapture, captureRef, onFrameDescribed, facingMode: initialFacing = 'environment', aspectRatio = '3/4', language = 'en' }) {
5+
export function CameraCapture({ onCapture, captureRef, onFrameDescribed, describeMode = 'general', facingMode: initialFacing = 'environment', aspectRatio = '3/4', language = 'en' }) {
66
const videoRef = useRef(null)
77
const canvasRef = useRef(null)
88
const streamRef = useRef(null)
@@ -125,7 +125,7 @@ export function CameraCapture({ onCapture, captureRef, onFrameDescribed, facingM
125125
setDescribing(true)
126126
onFrameDescribed?.('Describing what I see...')
127127
try {
128-
const data = await describeFrame(blob, language)
128+
const data = await describeFrame(blob, language, describeMode)
129129
onFrameDescribed?.(data.description || 'Nothing clear in frame.')
130130
} catch {
131131
onFrameDescribed?.('Could not describe the frame right now.')
@@ -162,8 +162,8 @@ export function CameraCapture({ onCapture, captureRef, onFrameDescribed, facingM
162162
}, [onCapture])
163163

164164
useEffect(() => {
165-
if (captureRef) captureRef.current = capture
166-
}, [captureRef, capture])
165+
if (captureRef) captureRef.current = { capture, describe: describeCurrentFrame }
166+
}, [captureRef, capture, describeCurrentFrame])
167167

168168
useEffect(() => {
169169
startCamera()

frontend/src/contexts/VoiceContext.jsx

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,9 @@ export function VoiceProvider({ children }) {
163163
speak(helps[current.screen] ?? helps.HOME)
164164
break
165165
}
166+
case 'describe_frame':
167+
window.dispatchEvent(new CustomEvent('voiceCommand', { detail: { type: 'DESCRIBE_FRAME' } }))
168+
break
166169
case 'read_wardrobe':
167170
window.dispatchEvent(new CustomEvent('voiceCommand', { detail: { type: 'READ_WARDROBE' } }))
168171
break

frontend/src/screens/ShoppingScreen.jsx

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ import { shoppingAnalyze } from '../services/api.js'
88
import { COLORS } from '../utils/constants.js'
99

1010
export function ShoppingScreen() {
11-
const { speak } = useVoice()
11+
const { speak, language } = useVoice()
1212
const { items: wardrobeItems } = useWardrobe()
1313
const { goBack } = useApp()
1414
const { profileContext } = useProfile()
@@ -18,17 +18,31 @@ export function ShoppingScreen() {
1818
const [errorMsg, setErrorMsg] = useState('')
1919
const [capturedUrl, setCapturedUrl] = useState(null)
2020
const analyzingRef = useRef(false)
21+
const cameraRef = useRef(null)
2122

2223
const topsInWardrobe = wardrobeItems.filter((i) => i.category === 'tops')
2324
const bottomsInWardrobe = wardrobeItems.filter((i) => i.category === 'bottoms')
2425
const emptyWardrobe = wardrobeItems.length === 0
2526

2627
useEffect(() => {
27-
speak(emptyWardrobe
28-
? 'Shopping mode. Tap the camera button to scan a clothing item for style advice.'
29-
: `Shopping mode. Tap the camera button to scan a top or bottom. I will check it against your ${wardrobeItems.length} wardrobe items.`)
28+
const wardrobeNote = emptyWardrobe
29+
? 'Tap the camera button to scan a clothing item for style advice.'
30+
: `Tap the camera button to scan a top or bottom. I will check it against your ${wardrobeItems.length} wardrobe items.`
31+
speak(`Shopping mode. ${wardrobeNote} Say "describe this" at any time to hear colour, pattern, and fabric details.`)
3032
}, []) // eslint-disable-line react-hooks/exhaustive-deps
3133

34+
// ── Voice: "describe this" triggers a live frame description ──────────────
35+
useEffect(() => {
36+
const handler = (e) => {
37+
const cmd = e.detail
38+
if (cmd.type === 'DESCRIBE_FRAME' && phase === 'camera') {
39+
cameraRef.current?.describe?.()
40+
}
41+
}
42+
window.addEventListener('voiceCommand', handler)
43+
return () => window.removeEventListener('voiceCommand', handler)
44+
}, [phase])
45+
3246
const handleCapture = useCallback(async (blob, dataUrl) => {
3347
if (analyzingRef.current) return
3448
analyzingRef.current = true
@@ -175,8 +189,20 @@ export function ShoppingScreen() {
175189
)}
176190

177191
<div style={{ flex: 1, position: 'relative', minHeight: 0 }}>
178-
<CameraCapture onCapture={handleCapture} aspectRatio="unset" />
179-
<div style={{ position: 'absolute', bottom: 16, left: 14, right: 14 }}>
192+
<CameraCapture
193+
onCapture={handleCapture}
194+
captureRef={cameraRef}
195+
onFrameDescribed={(t) => speak(t)}
196+
describeMode="shopping"
197+
language={language}
198+
aspectRatio="unset"
199+
/>
200+
<div style={{ position: 'absolute', bottom: 16, left: 14, right: 14, display: 'flex', flexDirection: 'column', gap: 6 }}>
201+
<div style={{ background: 'rgba(10,10,8,0.85)', border: `2px solid ${COLORS.BORDER}`, borderRadius: COLORS.RADIUS, padding: '6px 14px', textAlign: 'center' }}>
202+
<p style={{ fontSize: 11, fontWeight: 700, color: COLORS.ACCENT, margin: 0, letterSpacing: 0.5 }}>
203+
Say "describe this" — hear colour, pattern &amp; fabric
204+
</p>
205+
</div>
180206
<div style={{ background: COLORS.BG, border: `2px solid ${COLORS.BORDER}`, borderRadius: COLORS.RADIUS, padding: '8px 16px', textAlign: 'center' }}>
181207
<p style={{ fontSize: 12, fontWeight: 700, color: COLORS.TEXT_MUTED, margin: 0 }}>
182208
{emptyWardrobe

frontend/src/services/api.js

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,10 +69,11 @@ export async function contextChat({ question, feature = 'scan', resultContext =
6969
return post('/context-chat', fd)
7070
}
7171

72-
export async function describeFrame(imageBlob, language = 'en') {
72+
export async function describeFrame(imageBlob, language = 'en', mode = 'general') {
7373
const fd = new FormData()
7474
fd.append('image', imageBlob, 'frame.jpg')
7575
fd.append('language', language)
76+
fd.append('mode', mode)
7677
return post('/describe-frame', fd, TIMEOUTS.analyze)
7778
}
7879

frontend/src/voice/commandParser.js

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@ export function parseCommand(transcript) {
2525
if (/\b(save|add to wardrobe|keep this)\b/.test(t)) return { intent: 'save_item' }
2626
if (/\b(capture|take|shoot)\b/.test(t)) return { intent: 'capture' }
2727

28+
// ── Live frame description ─────────────────────────────────────────────────
29+
if (/\b(describe (this|what (i see|you see|is here))|what (is this|am i holding|do i see)|what('s| is) (in front of me|this))\b/.test(t)) return { intent: 'describe_frame' }
30+
2831
// ── Help ───────────────────────────────────────────────────────────────────
2932
if (/\b(help|what can i (say|do)|commands|options)\b/.test(t)) return { intent: 'help' }
3033
if (/\b(where am i|what screen|current screen)\b/.test(t)) return { intent: 'describe_screen' }

0 commit comments

Comments
 (0)