Ollama - Remote hosts #8234

Open · wants to merge 8 commits into master
41 changes: 37 additions & 4 deletions autogpt_platform/backend/backend/blocks/llm.py
@@ -53,6 +53,8 @@ class LlmModel(str, Enum):
# Ollama models
OLLAMA_LLAMA3_8B = "llama3"
OLLAMA_LLAMA3_405B = "llama3.1:405b"
# CUSTOM
OLLAMA_DOLPHIN = "dolphin-mistral:latest"

@property
def metadata(self) -> ModelMetadata:
@@ -79,6 +81,8 @@ def metadata(self) -> ModelMetadata:
LlmModel.LLAMA3_1_8B: ModelMetadata("groq", 131072, cost_factor=13),
LlmModel.OLLAMA_LLAMA3_8B: ModelMetadata("ollama", 8192, cost_factor=7),
LlmModel.OLLAMA_LLAMA3_405B: ModelMetadata("ollama", 8192, cost_factor=11),
# CUSTOM
LlmModel.OLLAMA_DOLPHIN: ModelMetadata("ollama", 32768, cost_factor=0),
}

for model in LlmModel:
@@ -105,6 +109,11 @@ class Input(BlockSchema):
prompt_values: dict[str, str] = SchemaField(
advanced=False, default={}, description="Values used to fill in the prompt."
)
ollama_host: str = SchemaField(
Member

@Bentlybro is there a way to conditionally show stuff like this if ollama is the selected model?

Bentlybro (Member), Oct 1, 2024
@ntindle I don't think we have that set up at the moment, but maybe it's something we should look into getting set up, because I can already see quite a lot of use cases for it.
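
For illustration, a minimal sketch of one way conditional display could be approached: attach a hint to the field's JSON schema that a schema-aware frontend could read to hide `ollama_host` unless an Ollama model is selected. The `depends_on` key and the plain-Pydantic `Input` model below are hypothetical; neither is an existing SchemaField or frontend feature.

```python
# Hypothetical sketch only: "depends_on" is an invented hint, not an
# existing AutoGPT/SchemaField feature.
from pydantic import BaseModel, Field


class Input(BaseModel):
    model: str = Field(default="llama3", description="LLM to use")
    ollama_host: str = Field(
        default="localhost:11434",
        description="Ollama host for local models",
        # A schema-aware frontend could hide this field unless the selected
        # model's provider is "ollama".
        json_schema_extra={"depends_on": {"field": "model", "provider": "ollama"}},
    )


# Pydantic v2 merges json_schema_extra into the emitted property schema,
# so the hint is visible to whatever renders the block's inputs.
print(Input.model_json_schema()["properties"]["ollama_host"]["depends_on"])
```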

Author

Just thinking: would it be worth splitting model selection into provider then model, rather than just model? Some providers (e.g. Ollama) offer a wide variety of models which may overlap with other providers' offerings, so being able to choose both the provider and the model would give users more control. It'd also make it a tad easier to have conditional inputs on the blocks, I imagine, since we wouldn't have to look up the provider based on the model; it'd be right there in the block inputs.

Let me know and I can try to get a PR out for that functionality soon.
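
A rough sketch of the proposed provider-then-model split (the enum, the mapping, and the model lists below are illustrative only and don't exist in the codebase):

```python
# Illustrative only: a provider enum plus a per-provider model list, so the
# same model name can appear under several providers without ambiguity.
from enum import Enum


class LlmProvider(str, Enum):
    OPENAI = "openai"
    ANTHROPIC = "anthropic"
    GROQ = "groq"
    OLLAMA = "ollama"


PROVIDER_MODELS: dict[LlmProvider, list[str]] = {
    LlmProvider.GROQ: ["llama3-8b-8192"],
    LlmProvider.OLLAMA: ["llama3", "llama3.1:405b", "dolphin-mistral:latest"],
}


def validate_choice(provider: LlmProvider, model: str) -> str:
    """Fail early with a clear message instead of a confusing runtime error."""
    if model not in PROVIDER_MODELS.get(provider, []):
        raise ValueError(f"{model!r} is not offered by provider {provider.value!r}")
    return model
```

A block could then expose provider and model as two separate inputs and validate the pair, instead of inferring the provider from the model name.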

Bentlybro (Member), Oct 2, 2024
@Fried-Squid I think that's a very good idea; it makes a lot more sense being able to do it that way. @ntindle @Torantulino what do we think? It should be pretty easy to do, and as far as I can see it should just be changes to the block itself.

Author

@Bentlybro the one issue I see with it is getting the frontend to display that properly - we'd need to make changes to the JSON schema which gets passed to the frontend to render the available model selection. Or we could just throw an error when model and provider don't match, but that seems pretty hostile to new users who might not understand the differences between the providers and models.

advanced=True,
default="localhost:11434",
description="Ollama host for local models",
)

class Output(BlockSchema):
response: dict[str, Any]
@@ -139,7 +148,11 @@ def __init__(self):

@staticmethod
def llm_call(
- api_key: str, model: LlmModel, prompt: list[dict], json_format: bool
+ api_key: str,
+ model: LlmModel,
+ prompt: list[dict],
+ json_format: bool,
+ ollama_host: str,
) -> str:
provider = model.metadata.provider

@@ -201,9 +214,10 @@ def llm_call(
)
return response.choices[0].message.content or ""
elif provider == "ollama":
- response = ollama.generate(
+ client = ollama.Client(host=ollama_host)
+ response = client.generate(
model=model.value,
- prompt=prompt[0]["content"],
+ prompt=str(prompt),
)
return response["response"]
else:
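
For context on the change above, a minimal standalone sketch of the `ollama` Python client pointed at a remote host; the host address is an example, and responses are accessed the same way the diff does:

```python
# Minimal sketch, assuming the `ollama` package is installed and a server is
# reachable at the given address (the default would be localhost:11434).
import ollama

client = ollama.Client(host="192.168.1.50:11434")  # example remote host

# One-shot completion, mirroring client.generate(...) above.
gen = client.generate(model="llama3", prompt="Say hello in one sentence.")
print(gen["response"])

# Chat-style call, mirroring client.chat(...) used later in this diff.
chat = client.chat(
    model="llama3",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
)
print(chat["message"]["content"])
```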
@@ -269,6 +283,7 @@ def parse_response(resp: str) -> tuple[dict[str, Any], str | None]:
model=model,
prompt=prompt,
json_format=bool(input_data.expected_format),
ollama_host=input_data.ollama_host,
)
logger.info(f"LLM attempt-{retry_count} response: {response_text}")

@@ -326,6 +341,11 @@ class Input(BlockSchema):
prompt_values: dict[str, str] = SchemaField(
advanced=False, default={}, description="Values used to fill in the prompt."
)
ollama_host: str = SchemaField(
advanced=True,
default="localhost:11434",
description="Ollama host for local models",
)

class Output(BlockSchema):
response: str
@@ -385,6 +405,11 @@ class Input(BlockSchema):
# TODO: Make this dynamic
max_tokens: int = 4000 # Adjust based on the model's context window
chunk_overlap: int = 100 # Overlap between chunks to maintain context
ollama_host: str = SchemaField(
advanced=True,
default="localhost:11434",
description="Ollama host for local models",
)

class Output(BlockSchema):
summary: str
@@ -526,6 +551,11 @@ class Input(BlockSchema):
description="The maximum number of tokens to generate in the chat completion.",
ge=1,
)
ollama_host: str = SchemaField(
advanced=True,
default="localhost:11434",
description="Ollama host for local models",
)

class Output(BlockSchema):
response: str = SchemaField(
@@ -567,6 +597,7 @@ def llm_call(
api_key: str,
model: LlmModel,
messages: List[dict[str, str]],
ollama_host: str,
max_tokens: int | None = None,
) -> str:
provider = model.metadata.provider
@@ -596,7 +627,8 @@
)
return response.choices[0].message.content or ""
elif provider == "ollama":
- response = ollama.chat(
+ client = ollama.Client(host=ollama_host)
+ response = client.chat(
model=model.value,
messages=messages, # type: ignore
stream=False, # type: ignore
@@ -619,6 +651,7 @@ def run(self, input_data: Input, **kwargs) -> BlockOutput:
model=input_data.model,
messages=messages,
max_tokens=input_data.max_tokens,
ollama_host=input_data.ollama_host,
)

yield "response", response