Reading file images

Aman Rusia · Aman Rusia · commit 041e1008849b · 2024-10-17T13:08:25.000+05:30
diff --git a/src/relay/serve.py b/src/relay/serve.py
@@ -42,6 +42,7 @@ class Mdata(BaseModel):
 
 @app.websocket("/register_serve_image/{uuid}")
 async def register_serve_image(websocket: WebSocket, uuid: UUID) -> None:
+    raise Exception("Disabled")
     await websocket.accept()
     received_data = await websocket.receive_json()
     name = received_data["name"]
diff --git a/src/wcgw/basic.py b/src/wcgw/basic.py
@@ -1,4 +1,6 @@
+import base64
 import json
+import mimetypes
 from pathlib import Path
 import sys
 import traceback
@@ -8,17 +10,20 @@
 from openai.types.chat import (
     ChatCompletionMessageParam,
     ChatCompletionAssistantMessageParam,
+    ChatCompletionUserMessageParam,
+    ChatCompletionContentPartParam,
     ChatCompletionMessage,
     ParsedChatCompletionMessage,
 )
 import rich
+import petname
 from typer import Typer
 import uuid
 
 from .common import Models, discard_input
 from .common import CostData, History
 from .openai_utils import get_input_cost, get_output_cost
-from .tools import ExecuteBash
+from .tools import ExecuteBash, ReadImage, ImageData
 
 from .tools import (
     BASH_CLF_OUTPUT,
@@ -80,6 +85,38 @@ def save_history(history: History, session_id: str) -> None:
         json.dump(history, f, indent=3)
 
 
+def parse_user_message_special(msg: str) -> ChatCompletionUserMessageParam:
+    """Convert raw user text into a user message with text and image parts.
+
+    A line of the form ``%image <path>`` becomes an ``image_url`` part holding
+    the file as a base64 data URL; consecutive other lines are merged into a
+    single ``text`` part.
+
+    Raises:
+        ValueError: for an unknown ``%`` command or a missing image path.
+        OSError: if the image file cannot be read.
+    """
+    parts: list[ChatCompletionContentPartParam] = []
+    for line in msg.split("\n"):
+        if line.startswith("%"):
+            # partition keeps spaces inside the path intact
+            command, _, image_path = line[1:].strip().partition(" ")
+            image_path = image_path.strip()
+            if command != "image" or not image_path:
+                raise ValueError(f"Expected `%image <path>`, got: {line!r}")
+            with open(image_path, "rb") as f:
+                image_b64 = base64.b64encode(f.read()).decode("utf-8")
+            # guess_type returns None for unknown extensions; use a generic type
+            image_type = mimetypes.guess_type(image_path)[0] or "application/octet-stream"
+            dataurl = f"data:{image_type};base64,{image_b64}"
+            parts.append({"type": "image_url", "image_url": {"url": dataurl, "detail": "auto"}})
+        elif parts and parts[-1]["type"] == "text":
+            parts[-1]["text"] += "\n" + line
+        else:
+            parts.append({"type": "text", "text": line})
+    return {"role": "user", "content": parts}
+
+
 app = Typer(pretty_exceptions_show_locals=False)
 
 
@@ -94,6 +131,7 @@ def loop(
     session_id = str(uuid.uuid4())[:6]
 
     history: History = []
+    waiting_for_assistant = False
     if resume:
         if resume == "latest":
             resume_path = sorted(Path(".wcgw").iterdir(), key=os.path.getmtime)[-1]
@@ -108,6 +146,7 @@ def loop(
         if history[1]["role"] != "user":
             raise ValueError("Invalid history file, second message should be user")
         first_message = ""
+        waiting_for_assistant = history[-1]['role'] != 'assistant'
 
     my_dir = os.path.dirname(__file__)
     config_file = os.path.join(my_dir, "..", "..", "config.toml")
@@ -164,12 +203,11 @@ def loop(
     - Machine: {uname_machine}
 """
 
-    has_tool_output = False
     if not history:
         history = [{"role": "system", "content": system}]
     else:
         if history[-1]["role"] == "tool":
-            has_tool_output = True
+            waiting_for_assistant = True
 
     client = OpenAI()
 
@@ -188,16 +226,16 @@ def loop(
             )
             break
 
-        if not has_tool_output:
+        if not waiting_for_assistant:
             if first_message:
                 msg = first_message
                 first_message = ""
             else:
                 msg = text_from_editor(user_console)
 
-            history.append({"role": "user", "content": msg})
+            history.append(parse_user_message_special(msg))
         else:
-            has_tool_output = False
+            waiting_for_assistant = False
 
         cost_, input_toks_ = get_input_cost(
             config.cost_file[config.model], enc, history
@@ -222,6 +260,7 @@ def loop(
         _histories: History = []
         item: ChatCompletionMessageParam
         full_response: str = ""
+        image_histories: History = []
         try:
             for chunk in stream:
                 if chunk.choices[0].finish_reason == "tool_calls":
@@ -235,7 +274,7 @@ def loop(
                                 "type": "function",
                                 "function": {
                                     "arguments": tool_args,
-                                    "name": "execute_bash",
+                                    "name": type(which_tool(tool_args)).__name__,
                                 },
                             }
                             for tool_call_id, toolcallargs in tool_call_args_by_id.items()
@@ -251,7 +290,7 @@ def loop(
                     )
                     system_console.print(f"\nTotal cost: {config.cost_unit}{cost:.3f}")
                     output_toks += output_toks_
-
+                    
                     _histories.append(item)
                     for tool_call_id, toolcallargs in tool_call_args_by_id.items():
                         for toolindex, tool_args in toolcallargs.items():
@@ -283,21 +322,58 @@ def loop(
                                     f"\nTotal cost: {config.cost_unit}{cost:.3f}"
                                 )
                                 return output_or_done.task_output, cost
+                                
                             output = output_or_done
 
-                            item = {
-                                "role": "tool",
-                                "content": str(output),
-                                "tool_call_id": tool_call_id + str(toolindex),
-                            }
+                            if isinstance(output, ImageData):
+                                randomId = petname.Generate(2, "-")
+                                if not image_histories:
+                                    image_histories.extend([
+                                        {
+                                            'role': 'assistant',
+                                            'content': f'Share images with ids: {randomId}'
+
+                                        },
+                                        {
+                                            'role': 'user',
+                                            'content': [{
+                                                'type': 'image_url',
+                                                'image_url': {
+                                                    'url': output.dataurl,
+                                                    'detail': 'auto'
+                                                }
+                                            }]
+                                        }]
+                                    )
+                                else:
+                                    image_histories[0]['content'] += ', ' + randomId
+                                    image_histories[1]["content"].append({ # type: ignore
+                                        'type': 'image_url',
+                                        'image_url': {
+                                            'url': output.dataurl,
+                                            'detail': 'auto'
+                                        }
+                                    })
+
+                                item = {
+                                    "role": "tool",
+                                    "content": f'Ask user for image id: {randomId}',
+                                    "tool_call_id": tool_call_id + str(toolindex),
+                                }
+                            else:
+                                item = {
+                                    "role": "tool",
+                                    "content": str(output),
+                                    "tool_call_id": tool_call_id + str(toolindex),
+                                }
                             cost_, output_toks_ = get_output_cost(
                                 config.cost_file[config.model], enc, item
                             )
                             cost += cost_
                             output_toks += output_toks_
 
                             _histories.append(item)
-                    has_tool_output = True
+                    waiting_for_assistant = True
                     break
                 elif chunk.choices[0].finish_reason:
                     assistant_console.print("")
@@ -326,11 +402,11 @@ def loop(
                 assistant_console.print(chunk_str, end="")
                 full_response += chunk_str
         except KeyboardInterrupt:
-            has_tool_output = False
+            waiting_for_assistant = False
             input("Interrupted...enter to redo the current turn")
         else:
             history.extend(_histories)
-
+            history.extend(image_histories)
             save_history(history, session_id)
 
     return "Couldn't finish the task", cost
diff --git a/src/wcgw/openai_utils.py b/src/wcgw/openai_utils.py
@@ -28,9 +28,19 @@ def get_input_cost(
     input_tokens = 0
     for msg in history:
         content = msg["content"]
-        if not isinstance(content, str):
+        refusal = msg.get("refusal")
+        if isinstance(content, list):
+            for part in content:
+                if 'text' in part:
+                    input_tokens += len(enc.encode(part['text']))
+        elif content is None:
+            if refusal is None:
+                raise ValueError("Expected content or refusal to be present")
+            input_tokens += len(enc.encode(str(refusal)))
+        elif not isinstance(content, str):
             raise ValueError(f"Expected content to be string, got {type(content)}")
-        input_tokens += len(enc.encode(content))
+        else:
+            input_tokens += len(enc.encode(content))
     cost = input_tokens * cost_map.cost_per_1m_input_tokens / 1_000_000
     return cost, input_tokens
 
diff --git a/src/wcgw/tools.py b/src/wcgw/tools.py
@@ -5,7 +5,7 @@
 import sys
 import threading
 import traceback
-from typing import Callable, Literal, Optional, ParamSpec, Sequence, TypeVar, TypedDict
+from typing import Callable, Literal, NewType, Optional, ParamSpec, Sequence, TypeVar, TypedDict
 import uuid
 from pydantic import BaseModel, TypeAdapter
 from websockets.sync.client import connect as syncconnect
@@ -70,7 +70,7 @@ class Writefile(BaseModel):
 
 def start_shell():
     SHELL = pexpect.spawn(
-        "/bin/bash",
+        "/bin/bash --noprofile --norc",
         env={**os.environ, **{"PS1": "#@@"}},
         echo=False,
         encoding="utf-8",
@@ -236,6 +236,7 @@ def execute_bash(
 
 class ReadImage(BaseModel):
     file_path: str
+    type: Literal['ReadImage'] = 'ReadImage'  # NOTE(review): presumably a fixed tag so the tool union in which_tool resolves to ReadImage; confirm
 
 
 def serve_image_in_bg(file_path: str, client_uuid: str, name: str) -> None:
@@ -257,15 +258,9 @@ def serve_image_in_bg(file_path: str, client_uuid: str, name: str) -> None:
             print(f"Connection closed for UUID: {client_uuid}, retrying")
             serve_image_in_bg(file_path, client_uuid, name)
 
+class ImageData(BaseModel):
+    dataurl: str  # complete `data:<mime>;base64,<payload>` URL, as built by read_image_from_shell
 
-def read_image_from_shell(file_path: str) -> str:
-    name = petname.Generate(3)
-    client_uuid = str(uuid.uuid4())
-    thread = threading.Thread(
-        target=serve_image_in_bg, args=(file_path, client_uuid, name), daemon=True
-    )
-    thread.start()
-    return f"https://wcgw.arcfu.com/get_image/{client_uuid}/{name}"
 
 
 Param = ParamSpec("Param")
@@ -286,6 +281,24 @@ def wrapper(*args: Param.args, **kwargs: Param.kwargs) -> T:
 
     return wrapper
 
+@ensure_no_previous_output
+def read_image_from_shell(file_path: str) -> ImageData:
+    """Return the image at file_path as a base64 data URL.
+
+    Relative paths resolve against SHELL's `pwd`; ValueError if file missing.
+    """
+    if not os.path.isabs(file_path):
+        SHELL.sendline("pwd")
+        SHELL.expect("#@@")
+        assert isinstance(SHELL.before, str)
+        file_path = os.path.join(SHELL.before.strip(), file_path)
+    if not os.path.exists(file_path):
+        raise ValueError(f"File {file_path} does not exist")
+    with open(file_path, "rb") as image_file:
+        image_b64 = base64.b64encode(image_file.read()).decode("utf-8")
+    mime = mimetypes.guess_type(file_path)[0] or "application/octet-stream"  # None if unknown
+    return ImageData(dataurl=f"data:{mime};base64,{image_b64}")
+    
 
 @ensure_no_previous_output
 def write_file(writefile: Writefile) -> str:
@@ -330,22 +343,22 @@ def take_help_of_ai_assistant(
 
 def which_tool(args: str) -> BaseModel:
     adapter = TypeAdapter[
-        Confirmation | ExecuteBash | Writefile | AIAssistant | DoneFlag
-    ](Confirmation | ExecuteBash | Writefile | AIAssistant | DoneFlag)
+        Confirmation | ExecuteBash | Writefile | AIAssistant | DoneFlag | ReadImage
+    ](Confirmation | ExecuteBash | Writefile | AIAssistant | DoneFlag | ReadImage)  # validates the JSON args against the tool models; ReadImage is now accepted
     return adapter.validate_python(json.loads(args))
 
 
 def get_tool_output(
-    args: dict | Confirmation | ExecuteBash | Writefile | AIAssistant | DoneFlag,
+    args: dict | Confirmation | ExecuteBash | Writefile | AIAssistant | DoneFlag | ReadImage,
     enc: tiktoken.Encoding,
     limit: float,
     loop_call: Callable[[str, float], tuple[str, float]],
     is_waiting_user_input: Callable[[str], tuple[BASH_CLF_OUTPUT, float]],
-) -> tuple[str | DoneFlag, float]:
+) -> tuple[str | ImageData | DoneFlag, float]:
     if isinstance(args, dict):
         adapter = TypeAdapter[
-            Confirmation | ExecuteBash | Writefile | AIAssistant | DoneFlag
-        ](Confirmation | ExecuteBash | Writefile | AIAssistant | DoneFlag)
+            Confirmation | ExecuteBash | Writefile | AIAssistant | DoneFlag | ReadImage
+        ](Confirmation | ExecuteBash | Writefile | AIAssistant | DoneFlag | ReadImage)
         arg = adapter.validate_python(args)
     else:
         arg = args
@@ -365,9 +378,9 @@ def get_tool_output(
     elif isinstance(arg, AIAssistant):
         console.print("Calling AI assistant tool")
         output = take_help_of_ai_assistant(arg, limit, loop_call)
-    elif isinstance(arg, get_output_of_last_command):
-        console.print("Calling get output of last program tool")
-        output = get_output_of_last_command(enc), 0
+    elif isinstance(arg, ReadImage):
+        console.print("Calling read image tool")
+        output = read_image_from_shell(arg.file_path), 0.0
     else:
         raise ValueError(f"Unknown tool: {arg}")
 
@@ -438,7 +451,7 @@ def execute_user_input() -> None:
                         ExecuteBash(
                             send_ascii=[ord(x) for x in user_input] + [ord("\n")]
                         ),
-                        lambda x: ("wont_exit", 0),
+                        lambda x: ("waiting_for_input", 0),
                     )[0]
                 )
             except Exception as e:
@@ -451,10 +464,10 @@ async def register_client(server_url: str, client_uuid: str = "") -> None:
     # Generate a unique UUID for this client
     if not client_uuid:
         client_uuid = str(uuid.uuid4())
-    print(f"Connecting with UUID: {client_uuid}")
 
     # Create the WebSocket connection
     async with websockets.connect(f"{server_url}/{client_uuid}") as websocket:
+        print(f"Connected. Share this user id with the chatbot: {client_uuid}")
         try:
             while True:
                 # Wait to receive data from the server
@@ -481,7 +494,7 @@ async def register_client(server_url: str, client_uuid: str = "") -> None:
                     assert not isinstance(output, DoneFlag)
                     await websocket.send(output)
 
-        except websockets.ConnectionClosed:
+        except (websockets.ConnectionClosed, ConnectionError):
             print(f"Connection closed for UUID: {client_uuid}, retrying")
             await register_client(server_url, client_uuid)