Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
</p>

## Features
- Supports OpenRouter, OpenAI, Anthropic, Google Gemini, AWS Bedrock, Azure, Groq, [Ollama](https://ollama.com/), [Open WebUI](https://github.com/open-webui/open-webui), [LocalAI](https://github.com/mudler/LocalAI) and any provider with OpenAI compatible endpoints.
- Supports OpenRouter, OpenAI, Anthropic, Google Gemini, AWS Bedrock, Azure, Groq, [TwelveLabs](https://twelvelabs.io) (Pegasus video understanding), [Ollama](https://ollama.com/), [Open WebUI](https://github.com/open-webui/open-webui), [LocalAI](https://github.com/mudler/LocalAI) and any provider with OpenAI compatible endpoints.
- Answers questions and provides descriptions of images, video files, live camera feeds, and Frigate events based on your prompt.
- Remembers people, pets and objects
- Keeps a timeline of camera events, so you can display them on your dashboard or ask Assist about them.
Expand Down
86 changes: 86 additions & 0 deletions custom_components/llmvision/config_flow.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
Ollama,
AWSBedrock,
Mistral,
TwelveLabs,
)
from .const import (
DOMAIN,
Expand Down Expand Up @@ -56,6 +57,7 @@
DEFAULT_OPENWEBUI_MODEL,
DEFAULT_OPENROUTER_MODEL,
DEFAULT_MISTRAL_MODEL,
DEFAULT_TWELVELABS_MODEL,
ENDPOINT_OPENWEBUI,
ENDPOINT_AZURE,
ENDPOINT_OPENROUTER,
Expand Down Expand Up @@ -90,6 +92,7 @@ async def handle_provider(self, provider):
"OpenWebUI": self.async_step_openwebui,
"OpenRouter": self.async_step_openrouter,
"Mistral": self.async_step_mistral,
"TwelveLabs": self.async_step_twelvelabs,
}

step_method = provider_steps.get(provider)
Expand Down Expand Up @@ -129,6 +132,7 @@ async def async_step_user(self, user_input=None):
"OpenWebUI",
"OpenRouter",
"Mistral",
"TwelveLabs",
"Custom OpenAI",
],
"mode": "dropdown",
Expand Down Expand Up @@ -1729,6 +1733,88 @@ async def async_step_mistral(self, user_input=None):
data_schema=data_schema,
)

async def async_step_twelvelabs(self, user_input=None):
data_schema = vol.Schema(
{
vol.Optional("connection_section"): section(
vol.Schema(
{
vol.Required(CONF_API_KEY): selector(
{"text": {"type": "password"}}
)
}
),
{"collapsed": False},
),
vol.Optional("model_section"): section(
vol.Schema(
{
vol.Required(
CONF_DEFAULT_MODEL, default=DEFAULT_TWELVELABS_MODEL
): str,
vol.Optional(CONF_TEMPERATURE, default=0.2): selector(
{
"number": {
"min": 0,
"max": 1,
"step": 0.1,
"mode": "slider",
}
}
),
}
),
{"collapsed": False},
),
}
)

if self.source == config_entries.SOURCE_RECONFIGURE:
self.init_info = self._get_reconfigure_entry().data
suggested = {
"connection_section": {CONF_API_KEY: self.init_info.get(CONF_API_KEY)},
"model_section": {
CONF_DEFAULT_MODEL: self.init_info.get(
CONF_DEFAULT_MODEL, DEFAULT_TWELVELABS_MODEL
),
CONF_TEMPERATURE: self.init_info.get(CONF_TEMPERATURE, 0.2),
},
}
data_schema = self.add_suggested_values_to_schema(data_schema, suggested)

if user_input is not None:
user_input[CONF_PROVIDER] = self.init_info[CONF_PROVIDER]
user_input = flatten_dict(user_input)
try:
twelvelabs = TwelveLabs(
self.hass,
api_key=user_input[CONF_API_KEY],
model=user_input[CONF_DEFAULT_MODEL],
)
await twelvelabs.validate()
user_input[CONF_PROVIDER] = self.init_info[CONF_PROVIDER]
if self.source == config_entries.SOURCE_RECONFIGURE:
return self.async_update_reload_and_abort(
self._get_reconfigure_entry(),
data_updates=user_input,
)
else:
return self.async_create_entry(
title="TwelveLabs Pegasus", data=user_input
)
except ServiceValidationError as e:
_LOGGER.error(f"Validation failed: {e}")
return self.async_show_form(
step_id="twelvelabs",
data_schema=data_schema,
errors={"base": "empty_api_key"},
)

return self.async_show_form(
step_id="twelvelabs",
data_schema=data_schema,
)

async def async_step_reconfigure(self, user_input):
data = self._get_reconfigure_entry().data
provider = data[CONF_PROVIDER]
Expand Down
5 changes: 5 additions & 0 deletions custom_components/llmvision/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,9 @@
ERROR_GROQ_MULTIPLE_IMAGES = "Groq does not support videos or streams"
ERROR_NO_IMAGE_INPUT = "No image input provided"
ERROR_HANDSHAKE_FAILED = "Connection could not be established"
ERROR_TWELVELABS_ENCODE_FAILED = (
"Could not encode frames into a video clip for TwelveLabs Pegasus"
)

# Versions
VERSION_ANTHROPIC = "2023-06-01" # https://docs.anthropic.com/en/api/versioning
Expand Down Expand Up @@ -147,6 +150,7 @@
DEFAULT_OPENWEBUI_MODEL = "gemma3:4b"
DEFAULT_OPENROUTER_MODEL = "google/gemma-3-4b-it:free"
DEFAULT_MISTRAL_MODEL = "pixtral-12b-2409"
DEFAULT_TWELVELABS_MODEL = "pegasus1.5"

DEFAULT_SUMMARY_PROMPT = "Provide a brief summary for the following titles. Focus on the key actions or changes that occurred over time and avoid unnecessary details or subjective interpretations. The summary should be concise, objective, and relevant to the content of the images. Keep the summary under 50 words and ensure it captures the main events or activities described in the descriptions. Here are the descriptions:\n "

Expand All @@ -161,3 +165,4 @@
ENDPOINT_AZURE = "{base_url}openai/deployments/{deployment}/chat/completions?api-version={api_version}"
ENDPOINT_OPENROUTER = "https://openrouter.ai/api/v1/chat/completions"
ENDPOINT_MISTRAL = "https://api.mistral.ai/v1/chat/completions"
ENDPOINT_TWELVELABS = "https://api.twelvelabs.io/v1.3/analyze"
161 changes: 161 additions & 0 deletions custom_components/llmvision/providers.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
import re
import json
import base64
import asyncio
import os
import tempfile
from .const import (
DOMAIN,
CONF_API_KEY,
Expand All @@ -37,9 +40,11 @@
ENDPOINT_GROQ,
ENDPOINT_OPENROUTER,
ENDPOINT_MISTRAL,
ENDPOINT_TWELVELABS,
ERROR_NOT_CONFIGURED,
ERROR_GROQ_MULTIPLE_IMAGES,
ERROR_NO_IMAGE_INPUT,
ERROR_TWELVELABS_ENCODE_FAILED,
DEFAULT_OPENAI_MODEL,
DEFAULT_ANTHROPIC_MODEL,
DEFAULT_AZURE_MODEL,
Expand All @@ -52,6 +57,7 @@
DEFAULT_OPENWEBUI_MODEL,
DEFAULT_OPENROUTER_MODEL,
DEFAULT_MISTRAL_MODEL,
DEFAULT_TWELVELABS_MODEL,
CONF_KEEP_ALIVE,
CONF_CONTEXT_WINDOW,
CONF_TEMPERATURE,
Expand Down Expand Up @@ -133,6 +139,7 @@ def get_default_model(self, provider):
"Open WebUI": DEFAULT_OPENWEBUI_MODEL,
"OpenRouter": DEFAULT_OPENROUTER_MODEL,
"Mistral": DEFAULT_MISTRAL_MODEL,
"TwelveLabs": DEFAULT_TWELVELABS_MODEL,
}.get(provider_name)

def validate(self, call: Any) -> None | ServiceValidationError:
Expand Down Expand Up @@ -2070,6 +2077,155 @@ def supports_structured_output(self) -> bool:
return True


class TwelveLabs(Provider):
"""TwelveLabs Pegasus video-understanding provider.

Unlike the other providers, Pegasus is a video model: it reasons over a
short clip rather than independent stills. LLM Vision always decomposes its
inputs (videos, camera snapshots, Frigate events) into base64 JPEG frames
before a provider is called, so this provider re-encodes those frames back
into a tiny in-memory MP4 (using the ffmpeg binary the integration already
requires) and sends it to the Pegasus `/analyze` endpoint. This lets
Pegasus pick up on motion across the keyframes instead of treating them as
unrelated images.
"""

def __init__(self, hass: HomeAssistant, api_key: str, model: str):
super().__init__(hass, api_key, model)

def _generate_headers(self) -> dict:
return {"x-api-key": self.api_key, "content-type": "application/json"}

async def _frames_to_mp4_base64(self, base64_images: list, fps: int = 1) -> str:
"""Encode an ordered list of base64 JPEG frames into a base64 MP4 clip.

Pegasus needs a real (seekable) MP4 container, so we mux through a
temporary file rather than a pipe. ffmpeg is already a runtime
requirement of this integration (see media_handlers).
"""
if not base64_images:
raise ServiceValidationError(ERROR_NO_IMAGE_INPUT)

try:
jpeg_bytes = b"".join(base64.b64decode(img) for img in base64_images)
except Exception as e:
_LOGGER.error(f"Failed to decode frames for TwelveLabs: {e}")
raise ServiceValidationError(ERROR_TWELVELABS_ENCODE_FAILED)

with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
output_path = tmp.name

try:
ffmpeg_cmd = [
"ffmpeg",
"-y",
"-loglevel",
"error",
"-f",
"image2pipe",
"-vcodec",
"mjpeg",
"-framerate",
str(fps),
"-i",
"-",
"-c:v",
"libx264",
"-pix_fmt",
"yuv420p",
"-movflags",
"+faststart",
output_path,
]
process = await asyncio.create_subprocess_exec(
*ffmpeg_cmd,
stdin=asyncio.subprocess.PIPE,
stdout=asyncio.subprocess.DEVNULL,
stderr=asyncio.subprocess.PIPE,
)
_, stderr = await process.communicate(input=jpeg_bytes)
if process.returncode != 0:
_LOGGER.error(
f"ffmpeg failed to build clip for TwelveLabs: "
f"{(stderr or b'').decode(errors='ignore')[:300]}"
)
raise ServiceValidationError(ERROR_TWELVELABS_ENCODE_FAILED)

with open(output_path, "rb") as f:
mp4_bytes = f.read()
finally:
try:
os.unlink(output_path)
except OSError:
pass

if not mp4_bytes:
raise ServiceValidationError(ERROR_TWELVELABS_ENCODE_FAILED)

return base64.b64encode(mp4_bytes).decode("utf-8")

async def _make_request(self, data: dict) -> str:
headers = self._generate_headers()
response = await self._post(url=ENDPOINT_TWELVELABS, headers=headers, data=data)
if not isinstance(response, dict):
raise ServiceValidationError("invalid_response")
# Pegasus may return None on an error finish_reason; surface what we can.
response_text = response.get("data")
if response_text is None:
raise ServiceValidationError("invalid_response")
return response_text.strip()

async def _prepare_vision_data(self, call: Any) -> dict:
# Pegasus's max_tokens has a model minimum (512); clamp to stay valid.
max_tokens = max(int(getattr(call, "max_tokens", 512) or 512), 512)
clip_base64 = await self._frames_to_mp4_base64(call.base64_images)
prompt = f"{self._get_system_prompt()}\n\n{call.message}"
return {
"model_name": self.model,
"video": {"type": "base64_string", "base64_string": clip_base64},
"prompt": prompt,
"max_tokens": max_tokens,
"temperature": self._get_default_parameters(call).get("temperature"),
"stream": False,
}

async def _prepare_text_data(self, call: Any) -> dict:
# Title generation operates on text only; Pegasus requires a video, so
# this path is not used (see vision_request / title_request overrides).
raise ServiceValidationError("invalid_provider")

async def vision_request(self, call: Any) -> str:
data = await self._prepare_vision_data(call)
return await self._make_request(data)

async def title_request(self, call: Any) -> str:
# Pegasus is multimodal-in/text-out only; it cannot summarise raw text.
# Titles are derived from the description by the orchestrator instead.
return "Event Detected"

async def validate(self) -> None | ServiceValidationError:
if not self.api_key:
raise ServiceValidationError("empty_api_key")
# Validate the key with a cheap, well-formed request. A missing video
# yields a 400 with a parameter error (key accepted); an invalid key
# yields 401. Either non-auth response confirms the key is usable.
headers = self._generate_headers()
data = {"model_name": self.model, "prompt": "Hi", "max_tokens": 512}
try:
await self._post(url=ENDPOINT_TWELVELABS, headers=headers, data=data)
except ServiceValidationError as e:
message = str(e).lower()
if (
"api" in message
and "key" in message
or "auth" in message
or "401" in message
):
raise ServiceValidationError("empty_api_key")
# A parameter/validation error means the key authenticated fine.
return None


class ProviderFactory:
"""
Factory to create provider instances from a provider name and config
Expand Down Expand Up @@ -2192,4 +2348,9 @@ def create(
model=model,
)

if provider_name == "TwelveLabs":
return TwelveLabs(
hass, api_key=cast(str, config.get(CONF_API_KEY) or ""), model=model
)

raise ServiceValidationError("invalid_provider")
30 changes: 29 additions & 1 deletion custom_components/llmvision/strings.json
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,34 @@
}
}
},
"twelvelabs": {
"title": "Configure TwelveLabs Pegasus",
"description": "Provide a valid TwelveLabs API key from the TwelveLabs dashboard.",
"sections": {
"connection_section": {
"name": "Connection",
"description": "TwelveLabs authentication",
"data": {
"api_key": "API key"
},
"data_description": {
"api_key": "Your TwelveLabs API key from the TwelveLabs dashboard."
}
},
"model_section": {
"name": "Model",
"description": "Set default model parameters",
"data": {
"default_model": "Default model",
"temperature": "Temperature"
},
"data_description": {
"default_model": "The Pegasus model to use (for example pegasus1.5). Pegasus analyzes the captured frames as a short video clip.",
"temperature": "Controls the randomness of the output. Lower values make the output more deterministic."
}
}
}
},
"settings": {
"title": "Settings",
"description": "Configure the LLM Vision integration. This entry is required before setting up other providers. If you wish to use the default settings, just press 'Submit'.",
Expand Down Expand Up @@ -461,4 +489,4 @@
"reconfigure_successful": "Saved settings successfully"
}
}
}
}
Loading