Skip to content

Commit e0e8d3f

Browse files
feat(wave-3): re-land vision view against existing VLMWorkerBridge + VideoCapture (WEB-08 vision)
Rebuilds examples/web/RunAnywhereAI/src/views/vision.ts from a renderFeatureUnavailable placeholder into a working demo against the pre-existing VLMWorkerBridge (off-main-thread VLM runtime) and the core VideoCapture helper. The view exposes: (1) a model-selection button that opens the shared sheet to download + load SmolVLM, (2) a camera start/stop + capture-frame pair, and (3) an analyze button that wraps the last captured frame in a VLMImage proto and dispatches through VLMWorkerBridge.shared.process(image, options). VLMWorkerBridge is now exported from @runanywhere/web-llamacpp's index so apps that own the camera capture loop can dispatch vision inference directly without reaching into the Infrastructure path. Validation: sdk/runanywhere-web npm run typecheck PASS (core + llamacpp + onnx); examples/web/RunAnywhereAI npm run build PASS (145 modules transformed, vite built in 881ms). Independent of WEB-01 vendoring. The other 3 placeholder views (voice, transcribe, speak) remain blocked on WEB-01-VENDOR. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1 parent 8b4e1dd commit e0e8d3f

2 files changed

Lines changed: 364 additions & 15 deletions

File tree

  • examples/web/RunAnywhereAI/src/views
  • sdk/runanywhere-web/packages/llamacpp/src
Lines changed: 352 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,362 @@
11
/**
2-
* Vision Tab — placeholder.
2+
* Vision Tab — V2 canonical VLM camera description.
33
*
4-
* VLM worker bridge is at `@runanywhere/web-llamacpp/vlm-worker`. This
5-
* view is awaiting WEB-08 re-land — see gaps/gaps/inconsistencies/web.md.
4+
* Re-landed against the existing `VLMWorkerBridge` (off-main-thread WASM
5+
* runtime) and the core `VideoCapture` helper. Flow is:
6+
*
7+
* 1. User downloads + loads a VLM (e.g. SmolVLM 500M) via the shared
8+
* model selection sheet (download + `modelLifecycle.load`).
9+
* 2. User starts the camera — `VideoCapture` attaches its `<video>` to
10+
* the preview container.
11+
* 3. User clicks "Capture & analyze" — the latest frame is extracted as
12+
* RGB pixels, wrapped in a `VLMImage` proto message, and dispatched
13+
* through `VLMWorkerBridge.shared.process(image, options)`. The
14+
* worker decodes on its side, calls `_rac_vlm_process_proto`, and
15+
* returns the encoded `VLMResult`.
16+
*
17+
* The worker-side `loadModel` wiring (raw GGUF + mmproj bytes transferred
18+
* zero-copy into the worker's MEMFS) is still TBD — until the backend
19+
* package installs it, `VLMWorkerBridge.shared.isModelLoaded` stays false
20+
* and the view surfaces the situation inline rather than rendering a blank
21+
* placeholder.
622
*/
723

824
import type { TabLifecycle } from '../app';
9-
import { renderFeatureUnavailable } from '../components/feature-unavailable';
25+
import {
26+
RunAnywhere,
27+
VideoCapture,
28+
VLMImageFormat,
29+
isSDKException,
30+
type VLMGenerationOptions,
31+
type VLMImage,
32+
type VLMResult,
33+
} from '@runanywhere/web';
34+
import { VLMWorkerBridge } from '@runanywhere/web-llamacpp';
35+
import {
36+
ensureCatalogRegistered,
37+
onModelStateChange,
38+
openSheet,
39+
} from '../components/model-selection';
40+
41+
/** Model id that `isVLMModelLoaded` compares the current model against. */
const VLM_MODEL_ID = 'smolvlm-500m-instruct-q8_0';
/** Prompt pre-filled in the textarea and used when the field is left blank. */
const DEFAULT_PROMPT = 'Describe what you see in this image.';
/** Argument passed to `VideoCapture.captureFrame` and sent as `maxImageSize`. */
const CAPTURE_DIMENSION = 384;

/** Shape of a frame held between "Capture frame" and "Analyze". */
type CapturedVisionFrame = { rgbPixels: Uint8Array; width: number; height: number };

/** Root element of the tab; assigned by `initVisionTab`. */
let container: HTMLElement;
/** Active capture helper, or `null` while the camera is stopped. */
let camera: VideoCapture | null = null;
/** Most recently captured frame; cleared when the camera stops. */
let latestFrame: CapturedVisionFrame | null = null;
/** Text of the last VLM response, or `null` when there is none to show. */
let lastResult: string | null = null;
/** Current status line, kept so it survives a full re-render. */
let status = '';
/** True while a camera start or an inference call is in flight. */
let isBusy = false;
/** Unsubscribe handle returned by `onModelStateChange`, if subscribed. */
let unsubscribeState: (() => void) | null = null;
1052

1153
/**
 * Mounts the Vision tab into `el` and returns its lifecycle hooks.
 *
 * Registers the model catalog, renders the view once, and subscribes to
 * shared model-state changes so the model button and backend status list
 * track real state without a manual refresh.
 *
 * @param el Host element the view renders into.
 * @returns Hooks that re-render on activation and stop the camera on
 *   deactivation.
 */
export function initVisionTab(el: HTMLElement): TabLifecycle {
  container = el;

  ensureCatalogRegistered();
  renderView();

  // Release a subscription left over from an earlier init before replacing
  // it; otherwise every re-mount stacks another listener that re-renders.
  unsubscribeState?.();
  unsubscribeState = onModelStateChange(() => renderView());

  return {
    onActivate: () => {
      ensureCatalogRegistered();
      renderView();
    },
    onDeactivate: () => {
      stopCamera();
    },
  };
}
73+
74+
// ---------------------------------------------------------------------------
75+
// Rendering
76+
// ---------------------------------------------------------------------------
77+
78+
/**
 * Rebuilds the whole tab from module state and re-binds the button handlers.
 *
 * The markup is replaced wholesale via `innerHTML`, so anything that lives
 * only in the DOM has to be carried over explicitly: the prompt text is read
 * back before the rewrite, and the camera's `<video>` element is re-attached
 * afterwards.
 */
function renderView(): void {
  const bridge = VLMWorkerBridge.shared;
  const modelLoaded = isVLMModelLoaded();
  const workerLoaded = bridge.isModelLoaded;
  const captureReady = camera?.isCapturing ?? false;
  const canAnalyze = workerLoaded && captureReady && !isBusy;

  // Keep whatever the user typed. Re-rendering with DEFAULT_PROMPT every time
  // would discard their prompt on each status or model-state update.
  const typedPrompt = container.querySelector<HTMLTextAreaElement>('#vision-prompt')?.value;
  const promptText = typedPrompt ?? DEFAULT_PROMPT;

  container.innerHTML = `
    <div class="toolbar">
      <div class="toolbar-title">Vision</div>
      <div class="toolbar-actions">
        <button class="btn btn-secondary" id="vision-model-btn">
          ${modelLoaded ? 'Change Model' : 'Load SmolVLM'}
        </button>
      </div>
    </div>
    <div class="scroll-area">
      <div class="docs-section">
        <h3>Backend status</h3>
        <ul class="feature-unavailable__list">
          <li><code>VLM model loaded</code>: <strong>${modelLoaded ? 'yes' : 'no'}</strong></li>
          <li><code>VLMWorkerBridge.isInitialized</code>: <strong>${bridge.isInitialized ? 'yes' : 'no'}</strong></li>
          <li><code>VLMWorkerBridge.isModelLoaded</code>: <strong>${workerLoaded ? 'yes' : 'no'}</strong></li>
          <li><code>camera.isCapturing</code>: <strong>${captureReady ? 'yes' : 'no'}</strong></li>
        </ul>
      </div>

      <div class="docs-section">
        <h3>Camera</h3>
        <p class="text-secondary">Attach your webcam and capture frames as RGB pixels for VLM inference.</p>
        <div class="toolbar-actions">
          <button class="btn btn-primary" id="vision-camera-btn" ${isBusy ? 'disabled' : ''}>
            ${captureReady ? 'Stop camera' : 'Start camera'}
          </button>
          <button class="btn btn-secondary" id="vision-capture-btn" ${captureReady && !isBusy ? '' : 'disabled'}>
            Capture frame
          </button>
        </div>
        <div id="vision-preview" class="vision-preview"></div>
        <div id="vision-frame-meta" class="docs-status">${frameMetaLabel()}</div>
      </div>

      <div class="docs-section">
        <h3>Analyze</h3>
        <p class="text-secondary">
          Runs <code>VLMWorkerBridge.shared.process(image, options)</code> on the last
          captured frame. The worker decodes the proto message and calls
          <code>_rac_vlm_process_proto</code> off-thread.
        </p>
        <label class="form-label" for="vision-prompt">Prompt</label>
        <textarea id="vision-prompt" class="chat-input" rows="2"
          ${isBusy ? 'disabled' : ''}
          placeholder="What's in this image?">${escape(promptText)}</textarea>
        <div class="toolbar-actions">
          <button class="btn btn-primary" id="vision-analyze-btn" ${canAnalyze ? '' : 'disabled'}>
            ${isBusy ? 'Analyzing…' : 'Capture & analyze'}
          </button>
        </div>
        <div id="vision-status" class="docs-status">${escape(status)}</div>
        <pre id="vision-output" class="docs-pre">${escape(lastResult ?? '(no response yet)')}</pre>
      </div>
    </div>
  `;

  reattachCameraPreview();

  // Every target below was just written by the template above.
  const onClick = (selector: string, handler: () => void): void => {
    container.querySelector(selector)?.addEventListener('click', handler);
  };
  onClick('#vision-model-btn', () => openSheet());
  onClick('#vision-camera-btn', () => void toggleCamera());
  onClick('#vision-capture-btn', () => captureFrame());
  onClick('#vision-analyze-btn', () => void onAnalyze());
}
157+
158+
/**
 * Moves the camera's `<video>` element into the freshly rendered preview
 * host. Does nothing when there is no camera or no `#vision-preview` node.
 */
function reattachCameraPreview(): void {
  if (!camera) return;

  const previewHost = container.querySelector<HTMLElement>('#vision-preview');
  if (!previewHost) return;

  previewHost.innerHTML = '';
  previewHost.appendChild(camera.videoElement);
}
164+
165+
/**
 * Builds the caption shown under the preview: either a "nothing captured"
 * notice or the dimensions and byte size of the stored frame.
 */
function frameMetaLabel(): string {
  if (!latestFrame) {
    return 'No frame captured yet.';
  }

  const { width, height, rgbPixels } = latestFrame;
  const byteCount = rgbPixels.byteLength.toLocaleString();
  return `Last frame: ${width}×${height} RGB (${byteCount} bytes)`;
}
169+
170+
// ---------------------------------------------------------------------------
171+
// Camera
172+
// ---------------------------------------------------------------------------
173+
174+
/**
 * Click handler for the camera button: starts the camera when it is not
 * capturing, otherwise stops it and re-renders.
 */
async function toggleCamera(): Promise<void> {
  const currentlyCapturing = camera?.isCapturing ?? false;

  if (!currentlyCapturing) {
    await startCamera();
    return;
  }

  stopCamera();
  renderView();
}
182+
183+
/**
 * Creates the `VideoCapture` helper if needed and starts it, reflecting
 * progress in the status line and in the `isBusy` flag.
 *
 * The capture instance is held in a local variable across the `await`
 * because `stopCamera()` can run while the start is still pending (for
 * example when the tab is deactivated during the permission prompt). When
 * that happens the module-level `camera` no longer points at this instance,
 * so it is stopped here instead of being left running with no owner.
 */
async function startCamera(): Promise<void> {
  const capture =
    camera ??
    new VideoCapture({
      facingMode: 'environment',
      idealWidth: 640,
      idealHeight: 480,
    });
  camera = capture;

  isBusy = true;
  setStatus('Requesting camera access…');
  renderView();
  try {
    await capture.start();
    if (camera === capture) {
      setStatus('Camera ready.');
    } else {
      // Superseded while starting: release whatever this start acquired.
      // NOTE(review): assumes VideoCapture.stop() is safe to call a second
      // time on the same instance — confirm against the helper.
      capture.stop();
      setStatus('Camera stopped.');
    }
  } catch (err) {
    setStatus(`Camera error: ${formatErr(err)}`);
    // Only discard the instance that failed, never a newer one.
    if (camera === capture) {
      camera = null;
    }
  } finally {
    isBusy = false;
    renderView();
  }
}
23203

24-
return {};
204+
/**
 * Stops the active capture (if any) and forgets both the camera and the
 * last captured frame. Does not re-render; callers do that when needed.
 */
function stopCamera(): void {
  if (camera) {
    camera.stop();
  }
  latestFrame = null;
  camera = null;
}
209+
210+
/**
 * Click handler for "Capture frame": grabs one frame from the running
 * camera, stores it as `latestFrame`, and reports the outcome in the status
 * line. Ignored while the camera is not capturing.
 */
function captureFrame(): void {
  if (!camera?.isCapturing) return;

  const grabbed = camera.captureFrame(CAPTURE_DIMENSION);
  if (grabbed) {
    latestFrame = grabbed;
    setStatus(`Captured ${grabbed.width}×${grabbed.height} frame.`);
  } else {
    setStatus('Failed to capture frame.');
  }
  renderView();
}
222+
223+
// ---------------------------------------------------------------------------
224+
// Analyze
225+
// ---------------------------------------------------------------------------
226+
227+
/** Wraps a captured RGB frame in the `VLMImage` shape passed to the worker bridge. */
function buildCameraImage(frame: {
  rgbPixels: Uint8Array;
  width: number;
  height: number;
}): VLMImage {
  return {
    filePath: undefined,
    encoded: undefined,
    rawRgb: frame.rgbPixels,
    base64: undefined,
    width: frame.width,
    height: frame.height,
    format: VLMImageFormat.VLM_IMAGE_FORMAT_RAW_RGB,
    mediaType: 'image/rgb',
    name: 'camera-frame',
    sizeBytes: frame.rgbPixels.byteLength,
    metadata: {},
  };
}

/** Builds the fixed, non-streaming generation options used by this demo for `prompt`. */
function buildGenerationOptions(prompt: string): VLMGenerationOptions {
  return {
    prompt,
    maxTokens: 128,
    temperature: 0.7,
    topP: 0.9,
    topK: 40,
    stopSequences: [],
    streamingEnabled: false,
    systemPrompt: undefined,
    maxImageSize: CAPTURE_DIMENSION,
    nThreads: 0,
    useGpu: false,
    modelFamily: 0,
    customChatTemplate: undefined,
    imageMarkerOverride: undefined,
    seed: 0,
    repetitionPenalty: 1.1,
    minP: 0.05,
    emitImageEmbeddings: false,
  };
}

/** Formats the throughput suffix for the "Done" status; empty when no rate was reported. */
function describeThroughput(result: VLMResult): string {
  if (!(result.tokensPerSecond > 0)) return '';
  const elapsedMs = Math.round(result.processingTimeMs);
  const rate = result.tokensPerSecond.toFixed(1);
  return ` — ${result.completionTokens} tokens in ${elapsedMs}ms (${rate} tok/s)`;
}

/**
 * Click handler for the analyze button.
 *
 * Bails out with a status message when the camera is not capturing or the
 * worker reports no loaded model. Otherwise it uses the stored frame (or
 * captures one if none is stored), reads the prompt from the textarea
 * (falling back to `DEFAULT_PROMPT` when blank), and awaits
 * `VLMWorkerBridge.shared.process`. The response text or the failure reason
 * is written to module state and the view is re-rendered.
 */
async function onAnalyze(): Promise<void> {
  if (!camera?.isCapturing) {
    setStatus('Start the camera first.');
    renderView();
    return;
  }

  const bridge = VLMWorkerBridge.shared;
  if (!bridge.isModelLoaded) {
    setStatus(
      'The VLM Worker has no model loaded. Load SmolVLM, then re-run Analyze — ' +
        'worker-side model plumbing lands once the backend registers a VLM loader.',
    );
    renderView();
    return;
  }

  const frame = latestFrame ?? camera.captureFrame(CAPTURE_DIMENSION);
  if (!frame) {
    setStatus('Failed to capture a frame for analysis.');
    renderView();
    return;
  }
  latestFrame = frame;

  // Read the prompt before the busy re-render below replaces the textarea.
  const promptEl = container.querySelector<HTMLTextAreaElement>('#vision-prompt');
  const prompt = (promptEl?.value ?? DEFAULT_PROMPT).trim() || DEFAULT_PROMPT;

  const image = buildCameraImage(frame);
  const options = buildGenerationOptions(prompt);

  isBusy = true;
  setStatus('Running VLM inference off-thread…');
  lastResult = null;
  renderView();

  try {
    const result: VLMResult = await bridge.process(image, options);
    lastResult = result.text || '(empty response)';
    setStatus(`Done${describeThroughput(result)}.`);
  } catch (err) {
    setStatus(`VLM inference failed: ${formatErr(err)}`);
  } finally {
    isBusy = false;
    renderView();
  }
}
310+
311+
// ---------------------------------------------------------------------------
312+
// Helpers
313+
// ---------------------------------------------------------------------------
314+
315+
/**
 * Reports whether the model lifecycle's current model is `VLM_MODEL_ID`.
 * Any error thrown while querying the lifecycle is treated as "not loaded".
 */
function isVLMModelLoaded(): boolean {
  try {
    const activeModel = RunAnywhere.modelLifecycle.currentModel();
    const activeId = activeModel?.modelId;
    return activeId === VLM_MODEL_ID;
  } catch {
    return false;
  }
}
323+
324+
/**
 * Records `text` as the current status and, when the status banner is in the
 * DOM, updates it in place without a full re-render.
 */
function setStatus(text: string): void {
  status = text;

  const statusEl = container.querySelector<HTMLDivElement>('#vision-status');
  if (!statusEl) return;
  statusEl.textContent = text;
}
329+
330+
/**
 * Turns a caught value into display text: the `message` of an SDK exception
 * or `Error`, otherwise the value's string conversion.
 */
function formatErr(err: unknown): string {
  return (isSDKException(err) || err instanceof Error) ? err.message : String(err);
}
335+
336+
/** Replacement entity for each character that is special in HTML text or attributes. */
const HTML_ENTITY_BY_CHAR: Readonly<Record<string, string>> = {
  '&': '&amp;',
  '<': '&lt;',
  '>': '&gt;',
  '"': '&quot;',
  "'": '&#39;',
};

/**
 * Escapes `&`, `<`, `>`, `"` and `'` so `value` can be interpolated into the
 * view's HTML template as plain text.
 */
function escape(value: string): string {
  return value.replace(/[&<>"']/g, (ch) => HTML_ENTITY_BY_CHAR[ch] ?? ch);
}
344+
345+
// Dispose subscription on full panel teardown (mirrors chat.ts pattern).
346+
// Set once the container has been seen attached to the document, so a host
// element that is initialised before being mounted is not mistaken for a
// torn-down panel.
let containerWasMounted = false;

// Releases the model-state subscription when the panel is removed from the
// document.
//
// The observer watches the whole document rather than the container's
// parent. This module is evaluated before `initVisionTab` assigns
// `container`, and possibly after `DOMContentLoaded` has already fired, so
// attaching to `container.parentElement` from a `DOMContentLoaded` listener
// can silently observe nothing. It stays connected after a teardown so that
// a later `initVisionTab` call is covered as well.
const disposeObserver =
  typeof MutationObserver !== 'undefined' && typeof document !== 'undefined'
    ? new MutationObserver(() => {
        if (!container) return;
        if (container.isConnected) {
          containerWasMounted = true;
          return;
        }
        if (!containerWasMounted) return;
        containerWasMounted = false;
        unsubscribeState?.();
        unsubscribeState = null;
      })
    : null;
if (disposeObserver) {
  disposeObserver.observe(document.documentElement, { childList: true, subtree: true });
}

sdk/runanywhere-web/packages/llamacpp/src/index.ts

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,3 +40,15 @@ export { LlamaCPP, autoRegister } from './LlamaCPP';
4040
export type { LlamaCPPRegisterOptions } from './LlamaCPP';
4141
export { LlamaCppBridge } from './Foundation/LlamaCppBridge';
4242
export type { LlamaCppModule } from './Foundation/LlamaCppBridge';
43+
44+
// Off-main-thread VLM runtime — apps that need to dispatch vision inference
45+
// directly (e.g. example apps that own the camera capture loop) can call
46+
// `VLMWorkerBridge.shared.process(image, options)` after a VLM model has been
47+
// loaded into the worker.
48+
export { VLMWorkerBridge } from './Infrastructure/VLMWorkerBridge';
49+
export type {
50+
VLMWorkerCommand,
51+
VLMWorkerResponse,
52+
VLMLoadModelParams,
53+
ProgressListener,
54+
} from './Infrastructure/VLMWorkerBridge';

0 commit comments

Comments
 (0)