Skip to content

Commit 4d2ecbf

Browse files
authored
Merge pull request #1672 from arash77/llm_hub_update_image_models
Add handling for image models
2 parents b712fb3 + fa47b94 commit 4d2ecbf

File tree

2 files changed

+89
-75
lines changed

2 files changed

+89
-75
lines changed

tools/llm_hub/llm_hub.py

Lines changed: 39 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -14,25 +14,22 @@
1414

1515
litellm_config_file = os.environ.get("LITELLM_CONFIG_FILE")
1616
if not litellm_config_file:
17-
print("LITELLM_CONFIG_FILE environment variable is not set.")
18-
sys.exit(1)
17+
sys.exit("LITELLM_CONFIG_FILE environment variable is not set.")
1918
with open(litellm_config_file, "r") as f:
2019
config = yaml.safe_load(f)
2120

2221
litellm_api_key = config.get("LITELLM_API_KEY")
2322
litellm_base_url = config.get("LITELLM_BASE_URL")
2423

2524
if not litellm_api_key:
26-
print(
25+
sys.exit(
2726
"LiteLLM API key is not configured! Please set LITELLM_API_KEY environment variable."
2827
)
29-
sys.exit(1)
3028

3129
if not litellm_base_url:
32-
print(
30+
sys.exit(
3331
"LiteLLM base URL is not configured! Please set LITELLM_BASE_URL environment variable."
3432
)
35-
sys.exit(1)
3633

3734
client = OpenAI(
3835
api_key=litellm_api_key,
@@ -49,8 +46,7 @@ def read_text_file(file_path):
4946
with open(file_path, "r", encoding="latin-1") as f:
5047
return f.read()
5148
except Exception:
52-
print(f"Could not read file {file_path} as text")
53-
sys.exit(1)
49+
sys.exit(f"Could not read file {file_path} as text")
5450

5551

5652
def get_image_mime_type(image_path):
@@ -59,7 +55,7 @@ def get_image_mime_type(image_path):
5955
mime_type, _ = mimetypes.guess_type(image_path)
6056
if mime_type and mime_type.startswith("image/"):
6157
return mime_type
62-
if image_path.lower().endswith((".png", ".jpg", ".jpeg", ".gif")):
58+
if image_path.lower().endswith((".png", ".jpg", ".jpeg", ".gif", ".tiff", ".bmp")):
6359
ext = image_path.lower().split(".")[-1]
6460
if ext == "jpg":
6561
ext = "jpeg"
@@ -76,53 +72,46 @@ def encode_image_to_base64(image_path):
7672
mime_type = get_image_mime_type(image_path)
7773
return f"data:{mime_type};base64,{base64_image}"
7874
except Exception:
79-
print(f"Could not process image file: {image_path}")
80-
sys.exit(1)
75+
sys.exit(f"Could not process image file: {image_path}")
8176

8277

83-
content_text = question
84-
messages = []
78+
valid_model_types = {
79+
"text": {"text"},
80+
"image": {"image"},
81+
"multimodal": {"text", "image"},
82+
}
8583

86-
valid_model_types = ["text", "multimodal"]
8784
if model_type not in valid_model_types:
88-
print(
85+
sys.exit(
8986
f"Invalid model_type '{model_type}'. Must be one of: {', '.join(valid_model_types)}"
9087
)
91-
sys.exit(1)
92-
93-
if context_files:
94-
context_text_parts = []
95-
image_contents = []
96-
97-
for file_path, file_type in context_files:
98-
if file_type == "image":
99-
if model_type == "multimodal":
100-
base64_image_url = encode_image_to_base64(file_path)
101-
image_contents.append(
102-
{"type": "image_url", "image_url": {"url": base64_image_url}}
103-
)
104-
else:
105-
print(
106-
f"Image file '{file_path}' provided, but model_type is not 'multimodal'."
107-
)
108-
sys.exit(1)
109-
else:
110-
text_content = read_text_file(file_path)
111-
context_text_parts.append(
112-
f"File: {file_path}\nContent:\n{text_content}\n---\n"
113-
)
114-
115-
if context_text_parts:
116-
context_text = "Context files:\n\n" + "\n".join(context_text_parts)
117-
content_text = f"{context_text}\n\nUser Question: {question}"
118-
119-
if model_type == "multimodal" and image_contents:
120-
content = [{"type": "text", "text": content_text}, *image_contents]
121-
messages = [{"role": "user", "content": content}]
88+
89+
contents = []
90+
for file_path, file_type in context_files:
91+
if file_type not in valid_model_types[model_type]:
92+
sys.exit(f"File type '{file_type}' not allowed for model_type '{model_type}'.")
93+
if file_type == "image":
94+
contents.append(
95+
{
96+
"type": "image_url",
97+
"image_url": {"url": encode_image_to_base64(file_path)},
98+
}
99+
)
122100
else:
123-
messages = [{"role": "user", "content": content_text}]
124-
else:
125-
messages = [{"role": "user", "content": content_text}]
101+
contents.append(
102+
{
103+
"type": "text",
104+
"text": f"File: {file_path}\nContent:\n{read_text_file(file_path)}",
105+
}
106+
)
107+
108+
if question and "text" in valid_model_types[model_type]:
109+
contents.append({"type": "text", "text": question})
110+
111+
if not contents:
112+
sys.exit("No input content provided.")
113+
114+
messages = [{"role": "user", "content": contents}]
126115

127116

128117
max_retries = config.get("MAX_RETRIES", 3)
@@ -135,8 +124,7 @@ def encode_image_to_base64(image_path):
135124
break
136125
except InternalServerError as e:
137126
if attempt == max_retries - 1:
138-
print("Max retries reached. Exiting.")
139-
sys.exit(1)
127+
sys.exit("Max retries reached. Exiting.")
140128
sleep_time = min(2**attempt + random.uniform(0, 1), max_delay)
141129
print(
142130
f"InternalServerError encountered ({e}). Retrying in {sleep_time:.2f} seconds..."

tools/llm_hub/llm_hub.xml

Lines changed: 50 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
<description>Call any LLM</description>
33
<macros>
44
<import>macros.xml</import>
5-
<token name="@VERSION_SUFFIX@">0</token>
5+
<token name="@VERSION_SUFFIX@">1</token>
66
<token name="@PROFILE@">24.0</token>
77
</macros>
88
<requirements>
@@ -14,28 +14,37 @@
1414
#import json
1515
#import os
1616
#import re
17+
1718
#set LINK_LIST = []
18-
#for $input in $context
19-
#set file_name = os.path.splitext($input.element_identifier)[0]
20-
#set ext = $input.ext if $input.ext in ['html', 'json', 'txt', 'jpg', 'jpeg', 'png', 'gif'] else 'txt'
21-
#set LINK = re.sub('[^\w\-]', '_', $file_name)+'.'+$ext
22-
ln -s '$input' '$LINK' &&
23-
#set type = 'image' if $input.ext in ['jpg', 'jpeg', 'png', 'gif'] else 'text'
24-
$LINK_LIST.append([$LINK, $type])
19+
#for $input_type, $param in [('text', 'text_context'), ('image', 'image_context')]
20+
#set context = $getVar($param, None)
21+
#if $model_type in [$input_type, 'multimodal'] and $context
22+
#for $input in ($context if isinstance($context, list) else [$context])
23+
#set file_name = re.sub('[^\w\-]', '_', os.path.splitext($input.element_identifier)[0])
24+
#set link_name = '%s_%s.%s' % ($input.hid, $file_name, $input.ext)
25+
ln -s '$input' '$link_name' &&
26+
$LINK_LIST.append(($link_name, $input_type))
27+
#end for
28+
#end if
2529
#end for
2630
#set context_files = json.dumps($LINK_LIST)
2731
32+
#if $model_type == 'image'
33+
#set prompt = ''
34+
#end if
35+
2836
python '$__tool_directory__/llm_hub.py'
2937
'$context_files'
3038
'$prompt'
3139
'$model.fields.value'
32-
'$input_type_selector'
40+
'$model_type'
3341
]]></command>
3442
<inputs>
35-
<conditional name="input_type">
36-
<param name="input_type_selector" type="select" label="Choose the model" help="Multimodal models are capable to have image and text as input.">
43+
<conditional name="model_condition">
44+
<param name="model_type" type="select" label="Choose the model" help="Multimodal models are capable of taking both image and text as input.">
3745
<option value="multimodal" selected="true">Multimodal models</option>
3846
<option value="text">Text models</option>
47+
<option value="image">Image models</option>
3948
</param>
4049
<when value="multimodal">
4150
<param name="model" type="select" optional="false" label="Model" help="Select the model you want to use.">
@@ -44,7 +53,11 @@ python '$__tool_directory__/llm_hub.py'
4453
</options>
4554
<validator message="No model annotation is available for LLM Hub" type="no_options"/>
4655
</param>
47-
<param name="context" type="data" multiple="true" optional="true" format="html,json,txt,jpg,png,gif" label="Context" max="500"/>
56+
<param name="text_context" type="data" multiple="true" optional="true" format="html,json,txt" label="Text Context"/>
57+
<param name="image_context" type="data" optional="true" format="jpg,png,gif,tiff,bmp" label="Image Context"/>
58+
<param name="prompt" type="text" optional="false" label="Prompt" help="Prompts or tasks you want the LLM to perform." area="true">
59+
<validator type="empty_field"/>
60+
</param>
4861
</when>
4962
<when value="text">
5063
<param name="model" type="select" optional="false" label="Model" help="Select the model you want to use.">
@@ -53,24 +66,33 @@ python '$__tool_directory__/llm_hub.py'
5366
</options>
5467
<validator message="No model annotation is available for LLM Hub" type="no_options"/>
5568
</param>
56-
<param name="context" type="data" multiple="true" optional="true" format="html,json,txt" label="Context" max="500"/>
69+
<param name="text_context" type="data" multiple="true" optional="true" format="html,json,txt" label="Text Context"/>
70+
<param name="prompt" type="text" optional="false" label="Prompt" help="Prompts or tasks you want the LLM to perform." area="true">
71+
<validator type="empty_field"/>
72+
</param>
73+
</when>
74+
<when value="image">
75+
<param name="model" type="select" optional="false" label="Model" help="Select the model you want to use.">
76+
<options from_data_table="llm_models">
77+
<filter type="static_value" column="2" value="image"/>
78+
</options>
79+
<validator message="No model annotation is available for LLM Hub" type="no_options"/>
80+
</param>
81+
<param name="image_context" type="data" optional="false" format="jpg,png,gif,tiff,bmp" label="Image Context"/>
5782
</when>
5883
</conditional>
59-
<param name="prompt" type="text" optional="false" label="Prompt" help="Prompts or tasks you want the LLM to perform." area="true">
60-
<validator type="empty_field"/>
61-
</param>
6284
</inputs>
6385
<outputs>
64-
<data name="output" format="markdown" label="${tool.name} on ${on_string}" from_work_dir="./output.md"/>
86+
<data name="output" format="markdown" label="${tool.name}(${model}) #if $on_string then ' on ' + $on_string else ''#" from_work_dir="./output.md"/>
6587
</outputs>
6688
<tests>
6789
<test expect_failure="true" expect_exit_code="1">
68-
<conditional name="input_type">
69-
<param name="input_type_selector" value="text"/>
90+
<conditional name="model_condition">
91+
<param name="model_type" value="text"/>
7092
<param name="model" value="unknown"/>
71-
<param name="context" value="test.txt" ftype="txt"/>
93+
<param name="text_context" value="test.txt" ftype="txt"/>
94+
<param name="prompt" value="What is this?"/>
7295
</conditional>
73-
<param name="prompt" value="What is this?"/>
7496
<assert_stdout>
7597
<has_text text="LiteLLM API key is not configured!"/>
7698
</assert_stdout>
@@ -98,9 +120,13 @@ Usage
98120
1. **Select a Model**: Choose the LLM model that best fits your needs.
99121
Available models depend on what's configured in the LiteLLM proxy by your Galaxy administrators.
100122
101-
2. **Upload Context Data**: You can upload files in formats such as TXT, HTML, JSON, JPG, PNG, or GIF.
102-
This context data serves as additional input for the prompt you wish to execute.
103-
Vision-capable models can process image files.
123+
2. **Upload Context Data**: You can upload context data in different ways depending on the model type:
124+
125+
- **Text models**: Upload multiple text files (TXT, HTML, JSON) as context
126+
- **Image models**: Upload a single image file (JPG, PNG, GIF, TIFF, BMP) as context
127+
- **Multimodal models**: Upload multiple text files and/or a single image file as context
128+
129+
Vision-capable (multimodal and image) models can process image files, but only one image file is supported per request.
104130
105131
3. **Provide a Prompt**: Specify the task or question you want the LLM to address.
106132
The more specific the prompt, the more tailored the response will be.

0 commit comments

Comments
 (0)