Skip to content

Commit fbdbdab

Browse files
Merge pull request #10 from valentinfrlch/dev
Added support for 'detail' parameter
2 parents 7246472 + 9e499a4 commit fbdbdab

6 files changed

Lines changed: 67 additions & 39 deletions

File tree

README.md

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -108,20 +108,23 @@ To get GPT's analysis of a local image, use the following service call.
108108
```yaml
109109
service: gpt4vision.image_analyzer
110110
data:
111+
provider: OpenAI
112+
message: Describe what you see?
111113
max_tokens: 100
112-
message: Describe what you see
114+
model: gpt-4o
113115
image_file: |-
114116
/config/www/tmp/example.jpg
115117
/config/www/tmp/example2.jpg
116-
provider: LocalAI
117-
model: gpt-4-vision-preview
118118
target_width: 1280
119+
detail: low
119120
temperature: 0.5
120121
```
121-
The parameters `message`, `max_tokens`, `image_file`, `provider` and `temperature` are required. You can send multiple images per service call. Note that each path must be on a new line.
122-
123-
Optionally, the `model` and `target_width` properties can be set. For available models check these pages: [OpenAI](https://platform.openai.com/docs/models) and [LocalAI](https://localai.io/models/).
122+
The parameters `provider`, `message`, `max_tokens`, `image_file` and `temperature` are required. You can send multiple images per service call. Note that each path must be on a new line.
124123

124+
Optionally, the `model`, `target_width` and `detail` properties can be set.
125+
- For available **models** check these pages: [supported models for OpenAI](https://platform.openai.com/docs/models) and [LocalAI model gallery](https://localai.io/models/).
126+
- The **target_width** is an integer between 640 and 3840 representing the image width in pixels. It is used to downscale the image before encoding it.
127+
- The **detail** parameter (OpenAI only) can be set to `low` or `high`. If it is not set, it defaults to `auto`, and OpenAI will use the image size to determine the detail level. For more information check the [OpenAI documentation](https://platform.openai.com/docs/guides/vision/low-or-high-fidelity-image-understanding).
125128
## How to report a bug or request a feature
126129
> [!NOTE]
127130
> **Bugs:** If you encounter any bugs and have followed the instructions carefully, feel free to file a bug report.

custom_components/gpt4vision/__init__.py

Lines changed: 23 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,19 @@
11
# Declare variables
22
from .const import (
33
DOMAIN,
4-
CONF_PROVIDER,
54
CONF_OPENAI_API_KEY,
6-
CONF_MAXTOKENS,
7-
CONF_TARGET_WIDTH,
8-
CONF_MODEL,
9-
CONF_MESSAGE,
10-
CONF_IMAGE_FILE,
115
CONF_LOCALAI_IP_ADDRESS,
126
CONF_LOCALAI_PORT,
137
CONF_OLLAMA_IP_ADDRESS,
148
CONF_OLLAMA_PORT,
15-
CONF_TEMPERATURE
9+
PROVIDER,
10+
MAXTOKENS,
11+
TARGET_WIDTH,
12+
MODEL,
13+
MESSAGE,
14+
IMAGE_FILE,
15+
TEMPERATURE,
16+
DETAIL
1617
)
1718
from .request_handlers import (
1819
handle_localai_request,
@@ -22,11 +23,14 @@
2223
import base64
2324
import io
2425
import os
26+
import logging
2527
from homeassistant.helpers.aiohttp_client import async_get_clientsession
2628
from homeassistant.core import SupportsResponse
2729
from homeassistant.exceptions import ServiceValidationError
2830
from PIL import Image
2931

32+
_LOGGER = logging.getLogger(__name__)
33+
3034

3135
async def async_setup_entry(hass, entry):
3236
"""Set up gpt4vision from a config entry."""
@@ -103,30 +107,32 @@ async def image_analyzer(data_call):
103107
ollama_port = hass.data.get(DOMAIN, {}).get(CONF_OLLAMA_PORT)
104108

105109
# Read data from service call
106-
mode = str(data_call.data.get(CONF_PROVIDER))
110+
mode = str(data_call.data.get(PROVIDER))
107111
# Message to be sent to AI model
108-
message = str(data_call.data.get(CONF_MESSAGE)[0:2000])
112+
message = str(data_call.data.get(MESSAGE)[0:2000])
109113
# Local path to your image. Example: "/config/www/images/garage.jpg"
110-
image_path = data_call.data.get(CONF_IMAGE_FILE)
114+
image_path = data_call.data.get(IMAGE_FILE)
111115
# create a list of image paths (separator: newline character)
112116
image_paths = image_path.split("\n")
113117
# Resolution (width only) of the image. Example: 1280 for 720p etc.
114-
target_width = data_call.data.get(CONF_TARGET_WIDTH, 1280)
118+
target_width = data_call.data.get(TARGET_WIDTH, 1280)
115119
# Temperature parameter. Default is 0.5
116-
temperature = float(data_call.data.get(CONF_TEMPERATURE, 0.5))
120+
temperature = float(data_call.data.get(TEMPERATURE, 0.5))
117121
# Maximum number of tokens used by model. Default is 100.
118-
max_tokens = int(data_call.data.get(CONF_MAXTOKENS))
122+
max_tokens = int(data_call.data.get(MAXTOKENS))
123+
# Detail level: one of "high", "low" or "auto". Default is "auto".
124+
detail = str(data_call.data.get(DETAIL, "auto"))
119125

120126
# Validate configuration and input data and set model
121127
if mode == 'OpenAI':
122128
validate(mode, api_key, image_paths)
123-
model = str(data_call.data.get(CONF_MODEL, "gpt-4o"))
129+
model = str(data_call.data.get(MODEL, "gpt-4o"))
124130
elif mode == 'LocalAI':
125131
validate(mode, None, image_paths, localai_ip_address, localai_port)
126-
model = str(data_call.data.get(CONF_MODEL, "gpt-4-vision-preview"))
132+
model = str(data_call.data.get(MODEL, "gpt-4-vision-preview"))
127133
elif mode == 'Ollama':
128134
validate(mode, None, image_paths, ollama_ip_address, ollama_port)
129-
model = str(data_call.data.get(CONF_MODEL, "llava"))
135+
model = str(data_call.data.get(MODEL, "llava"))
130136

131137

132138
def encode_image(image_path):
@@ -171,7 +177,7 @@ def encode_image(image_path):
171177
response_text = await handle_localai_request(session, model, message, base64_images, localai_ip_address, localai_port, max_tokens, temperature)
172178

173179
elif mode == "OpenAI":
174-
response_text = await handle_openai_request(session, model, message, base64_images, api_key, max_tokens, temperature)
180+
response_text = await handle_openai_request(session, model, message, base64_images, api_key, max_tokens, temperature, detail)
175181

176182
elif mode == 'Ollama':
177183
response_text = await handle_ollama_request(session, model, message, base64_images, ollama_ip_address, ollama_port, max_tokens, temperature)

custom_components/gpt4vision/config_flow.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,7 @@
33
from homeassistant.exceptions import ServiceValidationError
44
from homeassistant.helpers.aiohttp_client import async_get_clientsession
55
from .const import (
6-
DOMAIN,
7-
CONF_PROVIDER,
6+
DOMAIN,
87
CONF_OPENAI_API_KEY,
98
CONF_LOCALAI_IP_ADDRESS,
109
CONF_LOCALAI_PORT,
@@ -19,7 +18,7 @@
1918

2019
async def validate_mode(user_input: dict):
2120
# check provider is not empty
22-
if not user_input[CONF_PROVIDER]:
21+
if not user_input["provider"]:
2322
raise ServiceValidationError("empty_mode")
2423

2524

@@ -33,6 +32,7 @@ async def validate_localai(hass, user_input: dict):
3332
raise ServiceValidationError("empty_port")
3433
# perform handshake with LocalAI server
3534
if not await validate_connection(hass, user_input[CONF_LOCALAI_IP_ADDRESS], user_input[CONF_LOCALAI_PORT], "/readyz"):
35+
_LOGGER.error("Could not connect to LocalAI server.")
3636
raise ServiceValidationError("handshake_failed")
3737

3838

@@ -46,12 +46,14 @@ async def validate_ollama(hass, user_input: dict):
4646
raise ServiceValidationError("empty_port")
4747
# perform handshake with Ollama server
4848
if not await validate_connection(hass, user_input[CONF_OLLAMA_IP_ADDRESS], user_input[CONF_OLLAMA_PORT], "/api/tags"):
49+
_LOGGER.error("Could not connect to Ollama server.")
4950
raise ServiceValidationError("handshake_failed")
5051

5152

5253
def validate_openai(user_input: dict):
5354
# check CONF_API_KEY is not empty
5455
if not user_input[CONF_OPENAI_API_KEY]:
56+
_LOGGER.error("OpenAI API key is empty.")
5557
raise ServiceValidationError("empty_api_key")
5658

5759

@@ -65,6 +67,7 @@ async def validate_connection(hass, ip_address, port, endpoint, expected_status=
6567
else:
6668
return False
6769
except Exception as e:
70+
_LOGGER.error(f"Could not connect to {url}: {e}")
6871
return False
6972

7073

@@ -86,16 +89,19 @@ async def async_step_user(self, user_input=None):
8689

8790
if user_input is not None:
8891
self.init_info = user_input
89-
if user_input[CONF_PROVIDER] == "LocalAI":
92+
if user_input["provider"] == "LocalAI":
9093
if DOMAIN in self.hass.data and CONF_LOCALAI_IP_ADDRESS in self.hass.data[DOMAIN] and CONF_LOCALAI_PORT in self.hass.data[DOMAIN]:
94+
_LOGGER.error("LocalAI already configured.")
9195
return self.async_abort(reason="already_configured")
9296
return await self.async_step_localai()
93-
elif user_input[CONF_PROVIDER] == "Ollama":
97+
elif user_input["provider"] == "Ollama":
9498
if DOMAIN in self.hass.data and CONF_OLLAMA_IP_ADDRESS in self.hass.data[DOMAIN] and CONF_OLLAMA_PORT in self.hass.data[DOMAIN]:
99+
_LOGGER.error("Ollama already configured.")
95100
return self.async_abort(reason="already_configured")
96101
return await self.async_step_ollama()
97102
else:
98103
if DOMAIN in self.hass.data and CONF_OPENAI_API_KEY in self.hass.data[DOMAIN]:
104+
_LOGGER.error("OpenAI already configured.")
99105
return self.async_abort(reason="already_configured")
100106
return await self.async_step_openai()
101107

@@ -117,6 +123,7 @@ async def async_step_localai(self, user_input=None):
117123
# add the mode to user_input
118124
return self.async_create_entry(title="GPT4Vision LocalAI", data=user_input)
119125
except ServiceValidationError as e:
126+
_LOGGER.error(f"Validation failed: {e}")
120127
return self.async_show_form(
121128
step_id="localai",
122129
data_schema=data_schema,
@@ -140,6 +147,7 @@ async def async_step_ollama(self, user_input=None):
140147
# add the mode to user_input
141148
return self.async_create_entry(title="GPT4Vision Ollama", data=user_input)
142149
except ServiceValidationError as e:
150+
_LOGGER.error(f"Validation failed: {e}")
143151
return self.async_show_form(
144152
step_id="ollama",
145153
data_schema=data_schema,
@@ -163,6 +171,7 @@ async def async_step_openai(self, user_input=None):
163171
user_input["provider"] = self.init_info["provider"]
164172
return self.async_create_entry(title="GPT4Vision OpenAI", data=user_input)
165173
except ServiceValidationError as e:
174+
_LOGGER.error(f"Validation failed: {e}")
166175
return self.async_show_form(
167176
step_id="openai",
168177
data_schema=data_schema,
Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,21 @@
11
""" Constants for gpt4vision component"""
22

3-
# Global values
3+
# Global constants
44
DOMAIN = "gpt4vision"
55

66
# Configuration values from setup
7-
CONF_PROVIDER = 'provider'
87
CONF_OPENAI_API_KEY = 'api_key'
98
CONF_LOCALAI_IP_ADDRESS = 'localai_ip'
109
CONF_LOCALAI_PORT = 'localai_port'
1110
CONF_OLLAMA_IP_ADDRESS = 'ollama_ip'
1211
CONF_OLLAMA_PORT = 'ollama_port'
1312

14-
# Values from service call
15-
CONF_MAXTOKENS = 'max_tokens'
16-
CONF_TARGET_WIDTH = 'target_width'
17-
CONF_MODEL = 'model'
18-
CONF_MESSAGE = 'message'
19-
CONF_IMAGE_FILE = 'image_file'
20-
CONF_TEMPERATURE = 'temperature'
13+
# service call constants
14+
PROVIDER = 'provider'
15+
MAXTOKENS = 'max_tokens'
16+
TARGET_WIDTH = 'target_width'
17+
MODEL = 'model'
18+
MESSAGE = 'message'
19+
IMAGE_FILE = 'image_file'
20+
TEMPERATURE = 'temperature'
21+
DETAIL = 'detail'

custom_components/gpt4vision/request_handlers.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ async def handle_localai_request(session, model, message, base64_images, ip_addr
3131
return response_text
3232

3333

34-
async def handle_openai_request(session, model, message, base64_images, api_key, max_tokens, temperature):
34+
async def handle_openai_request(session, model, message, base64_images, api_key, max_tokens, temperature, detail):
3535
headers = {'Content-type': 'application/json',
3636
'Authorization': 'Bearer ' + api_key}
3737
data = {"model": model,
@@ -45,7 +45,7 @@ async def handle_openai_request(session, model, message, base64_images, api_key,
4545
# Add the images to the request
4646
for image in base64_images:
4747
data["messages"][0]["content"].append(
48-
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image}"}})
48+
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image}", "detail": detail}})
4949

5050
try:
5151
response = await session.post(

custom_components/gpt4vision/services.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,15 @@ image_analyzer:
5050
number:
5151
min: 640
5252
max: 3840
53+
detail:
54+
required: false
55+
description: "Detail parameter (OpenAI only), leave empty for 'auto'"
56+
default: 'high'
57+
selector:
58+
select:
59+
options:
60+
- 'high'
61+
- 'low'
5362
temperature:
5463
required: true
5564
description: 'Randomness. Lower is more accurate, higher is more creative'

0 commit comments

Comments
 (0)