diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..ff1d6f2 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,32 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Environment +- Language: Python +- Task: UI-TARS is a multimodal agent for GUI interaction + +## Commands +- No explicit build/lint/test commands found in the codebase +- For coordinate processing: `python coordinate_processing_script.py` +- For visualization: Use matplotlib to display coordinate outputs + +## Code Style +- Indent: 4 spaces +- Quotes: Double quotes for strings +- Imports: Standard library first, then third-party, then local imports +- Error handling: Use specific exceptions with descriptive messages +- Naming: snake_case for functions/variables, UPPER_CASE for constants +- Documentation: Docstrings for functions (as seen in smart_resize) +- Comments: Descriptive comments for complex operations + +## Dependencies +- PIL/Pillow for image processing +- matplotlib for visualization +- re for parsing model outputs +- Other common imports: json, math, io + +## Model-Specific Notes +- Coordinates are processed with IMAGE_FACTOR=28 +- Model outputs need to be rescaled to original dimensions +- Parse model action outputs carefully for accurate coordinate extraction \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..406382a --- /dev/null +++ b/Dockerfile @@ -0,0 +1,189 @@ +FROM python:3.10-slim + +WORKDIR /app + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + build-essential \ + git \ + wget \ + curl \ + libgl1-mesa-glx \ + libglib2.0-0 \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Set up virtual environment +RUN python -m venv /opt/venv +ENV PATH="/opt/venv/bin:$PATH" + +# Install PyTorch with CUDA support +RUN pip install --upgrade pip && \ + pip install --no-cache-dir torch==2.1.0 torchvision==0.16.0 --index-url https://download.pytorch.org/whl/cu118 + +# Install UI-TARS dependencies +RUN pip install --no-cache-dir \ + transformers==4.35.0 \ + accelerate==0.23.0 \ + bitsandbytes==0.41.1 \ + pillow==10.0.1 \ + matplotlib==3.7.3 \ + numpy==1.24.3 \ + sentencepiece==0.1.99 \ + openai==1.0.0 \ + requests==2.31.0 \ + pydantic==2.5.1 \ + safetensors==0.4.0 \ + scipy==1.11.3 \ + vllm==0.6.1 + +# Copy project files +COPY . /app/ + +# Create directories for model and data +RUN mkdir -p /app/model /app/data + +# Set environment variables +ENV PYTHONUNBUFFERED=1 +ENV HF_MODEL_ID="ByteDance-Seed/UI-TARS-1.5-7B" +ENV HF_HOME="/app/model" +ENV TRANSFORMERS_CACHE="/app/model" + +# Download UI-TARS model from Hugging Face (comment out if you want to download separately) +RUN echo "Starting model download..." 
&& \
+    python -c "from transformers import AutoTokenizer, AutoModelForCausalLM; \
+    print('Downloading tokenizer...'); \
+    tokenizer = AutoTokenizer.from_pretrained('${HF_MODEL_ID}', trust_remote_code=True); \
+    print('Tokenizer downloaded successfully'); \
+    # If you have enough memory and want to download the model directly, uncomment the next line \
+    # model = AutoModelForCausalLM.from_pretrained('${HF_MODEL_ID}', trust_remote_code=True, device_map='auto'); \
+    # print('Model downloaded successfully');" || echo "Model will be downloaded at runtime"
+
+# Create model server script
+RUN echo '#!/usr/bin/env python3\n\
+import os\n\
+import sys\n\
+\n\
+def main():\n\
+    model_id = os.environ.get("HF_MODEL_ID", "ByteDance-Seed/UI-TARS-1.5-7B")\n\
+    print(f"Starting server with model: {model_id}")\n\
+\n\
+    # Launch the vLLM OpenAI-compatible API server as a module\n\
+    server_command = [\n\
+        sys.executable, "-m", "vllm.entrypoints.openai.api_server",\n\
+        "--model", model_id,\n\
+        "--served-model-name", "UI-TARS",\n\
+        "--trust-remote-code",\n\
+        "--tensor-parallel-size", "1",  # Change based on available GPUs\n\
+        "--gpu-memory-utilization", "0.9",\n\
+        "--dtype", "bfloat16",  # Use float16 if bfloat16 is not supported\n\
+        "--host", "0.0.0.0",\n\
+        "--port", "8000",\n\
+    ]\n\
+    os.execvp(server_command[0], server_command)\n\
+\n\
+if __name__ == "__main__":\n\
+    main()\n\
+' > /app/server.py && chmod +x /app/server.py
+
+# Create inference script
+RUN echo '#!/usr/bin/env python3\n\
+import os\n\
+import sys\n\
+import json\n\
+import base64\n\
+import argparse\n\
+from PIL import Image\n\
+from io import BytesIO\n\
+import requests\n\
+\n\
+def encode_image(image_path):\n\
+    with open(image_path, "rb") as image_file:\n\
+        return base64.b64encode(image_file.read()).decode("utf-8")\n\
+\n\
+def query_model(image_path, instruction, server_url="http://localhost:8000/v1/chat/completions"):\n\
+    # Encode the image\n\
+    base64_image = encode_image(image_path)\n\
+\n\
+    # Prepare the messages with the system prompt from prompts.py\n\
+    with open("/app/prompts.py", "r") as f:\n\
+        prompts_content = f.read()\n\
+\n\
+    # Extract the computer-use prompt; chr(34)*3 stands for the triple-quote\n\
+    # delimiter so no quotes need escaping inside this RUN echo\n\
+    import re\n\
+    computer_prompt = re.search("COMPUTER_USE = " + chr(34) * 3 + "(.+?)" + chr(34) * 3, prompts_content, re.DOTALL)\n\
+    if computer_prompt:\n\
+        system_prompt = computer_prompt.group(1).replace("{language}", "English").replace("{instruction}", instruction)\n\
+    else:\n\
+        system_prompt = "You are a GUI agent. 
You are given a task and your action history, with screenshots."\n\ + \n\ + # Prepare the API request\n\ + headers = {"Content-Type": "application/json"}\n\ + payload = {\n\ + "model": "UI-TARS",\n\ + "messages": [\n\ + {"role": "system", "content": system_prompt},\n\ + {"role": "user", "content": [\n\ + {"type": "text", "text": instruction},\n\ + {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}\n\ + ]}\n\ + ],\n\ + "temperature": 0.01,\n\ + "max_tokens": 512\n\ + }\n\ + \n\ + # Make the API request\n\ + try:\n\ + response = requests.post(server_url, headers=headers, json=payload)\n\ + response.raise_for_status()\n\ + result = response.json()\n\ + return result["choices"][0]["message"]["content"]\n\ + except Exception as e:\n\ + return f"Error: {str(e)}"\n\ +\n\ +def main():\n\ + parser = argparse.ArgumentParser(description="UI-TARS Model Inference")\n\ + parser.add_argument("--image", required=True, help="Path to screenshot image")\n\ + parser.add_argument("--instruction", required=True, help="Task instruction")\n\ + parser.add_argument("--server", default="http://localhost:8000/v1/chat/completions", help="Server URL")\n\ + \n\ + args = parser.parse_args()\n\ + \n\ + result = query_model(args.image, args.instruction, args.server)\n\ + print(result)\n\ +\n\ +if __name__ == "__main__":\n\ + main()\n\ +' > /app/inference.py && chmod +x /app/inference.py + +# Create entrypoint script +RUN echo '#!/bin/bash\n\ +if [ "$1" = "serve" ]; then\n\ + echo "Starting UI-TARS server..."\n\ + python /app/server.py\n\ +elif [ "$1" = "infer" ]; then\n\ + echo "Running inference..."\n\ + python /app/inference.py --image "$2" --instruction "$3"\n\ +elif [ "$1" = "process-coordinates" ]; then\n\ + echo "Processing coordinates..."\n\ + python /app/coordinate_processing_script.py --image "$2" --model-output "$3" --output "$4"\n\ +elif [ "$1" = "analyze-webpage" ]; then\n\ + echo "Analyzing webpage..."\n\ + python /app/webpage_analyzer.py --image "$2" ${3:+--output "$3"}\n\ +else\n\ + echo "UI-TARS Docker container"\n\ + echo "Usage:"\n\ + echo " serve - Start the model server"\n\ + echo " infer IMAGE INSTRUCTION - Run inference on an image"\n\ + echo " process-coordinates IMAGE MODEL_OUTPUT OUTPUT - Process and visualize coordinates"\n\ + echo " analyze-webpage IMAGE [OUTPUT_FILE] - Analyze a webpage screenshot and output description"\n\ + echo "Environment:"\n\ + echo " HF_MODEL_ID - HuggingFace model ID (default: ByteDance-Seed/UI-TARS-1.5-7B)"\n\ +fi\n\ +' > /app/entrypoint.sh && chmod +x /app/entrypoint.sh + +ENTRYPOINT ["/app/entrypoint.sh"] +CMD ["help"] \ No newline at end of file diff --git a/README.md b/README.md index 0de1922..919146a 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,7 @@ We also offer a **UI-TARS-desktop** version, which can operate on your **local personal device**. To use it, please visit [https://github.com/bytedance/UI-TARS-desktop](https://github.com/bytedance/UI-TARS-desktop). To use UI-TARS in web automation, you may refer to the open-source project [Midscene.js](https://github.com/web-infra-dev/Midscene). ## Updates +- 🐳 2025.04.24: Added Docker containerization, coordinate processing tools, and webpage analysis features. See [Docker Deployment Guide](README_docker.md). - 🌟 2025.04.16: We shared the latest progress of the UI-TARS-1.5 model in our [blog](https://seed-tars.com/1.5), which excels in playing games and performing GUI tasks, and we open-sourced the [UI-TARS-1.5-7B](https://huggingface.co/ByteDance-Seed/UI-TARS-1.5-7B). 
- ✨ 2025.03.23: We updated the OSWorld inference scripts from the original official [OSWorld repository](https://github.com/xlang-ai/OSWorld/blob/main/run_uitars.py). Now, you can use the OSWorld official inference scripts to reproduce our results. @@ -34,13 +35,63 @@ Leveraging the foundational architecture introduced in [our recent paper](https:
-## Deployment
-- See the deploy guide here.
-- For coordinates processing, refer to here.
-- For full action space parsing, refer to [OSWorld uitars_agent.py](https://github.com/xlang-ai/OSWorld/blob/main/mm_agents/uitars_agent.py)
+## Core Features
+
+UI-TARS provides several key capabilities:
+
+1. **GUI Interaction**: Automatically interacts with graphical user interfaces through vision-language models
+2. **Multi-Platform Support**: Works with desktop, mobile, and web interfaces
+3. **Action Generation**: Produces precise interface actions (clicks, typing, scrolling) with coordinate mapping
+4. **Visual Understanding**: Comprehends interface elements, their relationships, and functions
+5. **Webpage Analysis**: Converts UI screenshots to structured plaintext descriptions
+6. **Coordinate Processing**: Maps model output coordinates to actual screen positions
+
+## System Architecture
+
+UI-TARS consists of the following components:
+
+1. **Vision-Language Model**: Processes screenshots to understand interface elements
+2. **Action Space**: Defines possible interactions (click, drag, type, etc.)
+3. **Coordinate System**: Maps model outputs to actual screen positions
+4. **Prompt System**: Configures model behavior for different platforms and tasks
+5. **API Interface**: Provides OpenAI-compatible endpoints for integration
+
+## Deployment Options
+
+### 1. Docker Container (Recommended)
+- Comprehensive Docker setup with GPU support
+- See the [Docker Deployment Guide](README_docker.md)
+- Includes webpage analysis and coordinate processing tools
+
+### 2. HuggingFace Inference Endpoints
+- Cloud-based deployment on HuggingFace infrastructure
+- See the [HuggingFace deploy guide](README_deploy.md)
+
+### 3. Local Development
+- For coordinate processing, refer to the [Coordinates Guide](README_coordinates.md)
+- For action space parsing, refer to [OSWorld uitars_agent.py](https://github.com/xlang-ai/OSWorld/blob/main/mm_agents/uitars_agent.py)
+
+## Usage Examples
+
+### GUI Interaction
+```bash
+# Ask a running UI-TARS server for the next action on a screenshot
+# (see the Docker Deployment Guide for starting the server)
+docker-compose exec ui-tars /app/entrypoint.sh infer \
+    /app/data/screenshot.png "Click on the search button"
+# Example response: Action: click(start_box='(197,525)')
+```
+
+### Webpage Analysis
+```bash
+# Docker container command for webpage analysis
+docker-compose exec ui-tars /app/entrypoint.sh analyze-webpage \
+    /app/data/webpage_screenshot.png /app/data/analysis.txt
+```

 ## System Prompts
-- Refer to prompts.py
+- Refer to prompts.py for system prompt templates

 ## Performance

diff --git a/README_docker.md b/README_docker.md
new file mode 100644
index 0000000..3631228
--- /dev/null
+++ b/README_docker.md
@@ -0,0 +1,164 @@
+# Docker Setup for UI-TARS
+
+This guide explains how to build and run UI-TARS inside a Docker container. The setup includes the full UI-TARS-1.5-7B model, allowing you to run inference on images and process coordinates.
+
+## Prerequisites
+
+- Docker and Docker Compose
+- NVIDIA GPU with at least 24GB VRAM
+- NVIDIA Container Toolkit (nvidia-docker2)
+
+## Setup
+
+### 1. Clone the Repository
+
+```bash
+git clone https://github.com/bytedance/UI-TARS.git
+cd UI-TARS
+```
+
+### 2. 
Build and Start the Docker Container
+
+```bash
+# Build and start the container
+docker-compose up -d ui-tars
+
+# Check logs to monitor download progress
+docker-compose logs -f ui-tars
+```
+
+This will:
+- Build the Docker image
+- Download the UI-TARS-1.5-7B model from Hugging Face (first run)
+- Start a vLLM server exposing an OpenAI-compatible API on port 8000
+
+### 3. Usage
+
+#### Running Inference
+
+Place your screenshots in the `data` directory, then:
+
+```bash
+# Using the docker container directly
+docker-compose exec ui-tars /app/entrypoint.sh infer /app/data/your_screenshot.png "Click on the search button"
+
+# Or using the inference service
+docker-compose --profile infer run ui-tars-infer infer /app/data/your_screenshot.png "Click on the search button"
+```
+
+#### Processing Coordinates
+
+```bash
+docker-compose exec ui-tars /app/entrypoint.sh process-coordinates \
+    /app/data/your_screenshot.png \
+    "Action: click(start_box='(197,525)')" \
+    /app/data/result.png
+```
+
+#### Webpage Analysis
+
+Analyze a webpage screenshot and convert it to a detailed plaintext description:
+
+```bash
+# Generate analysis and print to console
+docker-compose exec ui-tars /app/entrypoint.sh analyze-webpage /app/data/webpage_screenshot.png
+
+# Save analysis to a file
+docker-compose exec ui-tars /app/entrypoint.sh analyze-webpage /app/data/webpage_screenshot.png /app/data/analysis.txt
+```
+
+This will produce a structured description of the webpage including:
+- Page layout and structure
+- Navigation elements
+- Main content sections
+- Interactive elements (buttons, forms, menus)
+- Visual design elements
+
+## Advanced Configuration
+
+### Using a Different Model
+
+You can set the `HF_MODEL_ID` environment variable to use a different model:
+
+```bash
+HF_MODEL_ID=ByteDance-Seed/UI-TARS-1.5-7B docker-compose up -d ui-tars
+```
+
+### GPU Configuration
+
+Edit the `docker-compose.yml` file to change GPU settings:
+
+```yaml
+deploy:
+  resources:
+    reservations:
+      devices:
+        - driver: nvidia
+          count: 1  # Change based on available GPUs
+          capabilities: [gpu]
+```
+
+Edit the `server_command` flags in `server.py` to change tensor parallelism:
+
+```python
+server_command = [
+    sys.executable, "-m", "vllm.entrypoints.openai.api_server",
+    "--model", model_id,
+    "--tensor-parallel-size", "1",  # Increase for multi-GPU setup
+    # ...
+]
+```
+
+## Troubleshooting
+
+### CUDA/GPU Issues
+
+If you encounter GPU or CUDA errors:
+
+1. Verify NVIDIA drivers are correctly installed:
+   ```bash
+   nvidia-smi
+   ```
+
+2. Check that the NVIDIA Container Toolkit is installed:
+   ```bash
+   dpkg -l | grep nvidia-container-toolkit
+   ```
+
+3. Switch `--dtype` from `bfloat16` to `float16` in `server.py` if your GPU doesn't support bfloat16
+
+### Memory Issues
+
+- For lower-memory GPUs (16GB), reduce `--gpu-memory-utilization` in `server.py`
+- CPU-only inference requires a CPU build of vLLM; if one is installed, remove the GPU-specific flags and add:
+  ```python
+  server_command.extend(
+      ["--device", "cpu"]
+      # ... 
+ ) + ``` + +## API Documentation + +Once the server is running, an OpenAI-compatible API is exposed at: +``` +http://localhost:8000/v1/chat/completions +``` + +Example curl request: +```bash +curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "UI-TARS", + "messages": [ + {"role": "system", "content": "You are a GUI agent..."}, + {"role": "user", "content": [ + {"type": "text", "text": "Click on the search button"}, + {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,..."}} + ]} + ], + "temperature": 0.01, + "max_tokens": 512 + }' +``` \ No newline at end of file diff --git a/changes.log b/changes.log new file mode 100644 index 0000000..fac7044 --- /dev/null +++ b/changes.log @@ -0,0 +1,31 @@ +# UI-TARS Project Changes Log + +## [2025-04-24] Docker and Analysis Features + +Added Docker containerization and web analysis capabilities: + +### New Files +1. `CLAUDE.md` - Guidelines for Claude AI when working with this repository +2. `Dockerfile` - Container definition for running UI-TARS with GPU support +3. `docker-compose.yml` - Docker Compose configuration for easy deployment +4. `README_docker.md` - Comprehensive guide for Docker deployment +5. `client_script.py` - Python client for interacting with the UI-TARS API +6. `coordinate_processing_script.py` - Standalone script for processing model coordinate outputs +7. `webpage_analyzer.py` - Tool for analyzing webpage screenshots and generating plaintext descriptions +8. `example_webpage_analysis.txt` - Sample output showing webpage analysis format + +### Key Features Added +1. Complete Docker containerization of UI-TARS model +2. OpenAI-compatible API server using vLLM +3. GPU-accelerated inference with tensor parallelism support +4. Coordinate processing standalone tool +5. Webpage screenshot analysis capability +6. Extensive documentation for Docker deployment + +### Technical Details +- Added support for the UI-TARS-1.5-7B model from HuggingFace +- Implemented efficient model loading and caching +- Created GPU-optimized container configuration +- Built OpenAI-compatible API interface +- Developed helper scripts for coordinate transformation +- Created webpage analysis functionality for converting UI to plaintext \ No newline at end of file diff --git a/client_script.py b/client_script.py new file mode 100644 index 0000000..c965fde --- /dev/null +++ b/client_script.py @@ -0,0 +1,184 @@ +#!/usr/bin/env python3 +# Client Script for UI-TARS + +import os +import re +import json +import base64 +import argparse +from PIL import Image +from io import BytesIO + +# Optional dependency - only needed if using OpenAI API format +try: + from openai import OpenAI + HAS_OPENAI = True +except ImportError: + HAS_OPENAI = False + print("OpenAI package not installed. Using requests instead.") + import requests + +def add_box_token(input_string): + """ + Adds box tokens to the model output coordinates. + This is needed for processing the model's raw output. 
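+
+    A minimal illustrative example (doctest-style):
+        >>> add_box_token("Action: click(start_box='(197,525)')")
+        "Action: click(start_box='<|box_start|>(197,525)<|box_end|>')"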
+ """ + # Split the string into individual actions + if "Action: " in input_string and "start_box=" in input_string: + suffix = input_string.split("Action: ")[0] + "Action: " + actions = input_string.split("Action: ")[1:] + processed_actions = [] + for action in actions: + action = action.strip() + # Extract coordinates (start_box or end_box) using regex + coordinates = re.findall(r"(start_box|end_box)='\((\d+),\s*(\d+)\)'", action) + + updated_action = action # Start with the original action + for coord_type, x, y in coordinates: + # Convert x and y to integers + updated_action = updated_action.replace( + f"{coord_type}='({x},{y})'", + f"{coord_type}='<|box_start|>({x},{y})<|box_end|>'" + ) + processed_actions.append(updated_action) + + # Reconstruct the final string + final_string = suffix + "\n\n".join(processed_actions) + else: + final_string = input_string + return final_string + +def encode_image(image_path): + """Encode an image to base64.""" + with open(image_path, "rb") as image_file: + return base64.b64encode(image_file.read()).decode('utf-8') + +def query_model_openai_compatible(base_url, api_key, messages, image_path=None): + """Query the model using OpenAI-compatible API.""" + if not HAS_OPENAI: + raise ImportError("OpenAI package is required. Install with 'pip install openai'") + + client = OpenAI( + base_url=base_url, + api_key=api_key + ) + + # If an image is provided, add it to the user's latest message + if image_path: + for i in range(len(messages) - 1, -1, -1): + if messages[i]["role"] == "user": + # Add base64 encoded image + base64_image = encode_image(image_path) + # Add the image to the content + content = [{"type": "text", "text": messages[i]["content"]}] + content.append( + {"type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}} + ) + messages[i]["content"] = content + break + + # Process assistant messages for proper box token formatting + for message in messages: + if message["role"] == "assistant" and isinstance(message["content"], str): + message["content"] = add_box_token(message["content"]) + + # Make the API call + response = client.chat.completions.create( + model="tgi", # Model name used by HuggingFace TGI + messages=messages, + temperature=0.0, + max_tokens=400, + stream=False + ) + + return response.choices[0].message.content + +def main(): + parser = argparse.ArgumentParser(description='UI-TARS client script for interacting with the model API.') + parser.add_argument('--api-url', type=str, default=os.environ.get('HF_BASE_URL', ''), + help='API endpoint URL') + parser.add_argument('--api-key', type=str, default=os.environ.get('HF_API_KEY', ''), + help='API key for authentication') + parser.add_argument('--task', type=str, default="Click on the search button", + help='Task description for the model') + parser.add_argument('--image', type=str, default='', + help='Path to screenshot image (optional)') + parser.add_argument('--messages-file', type=str, default='', + help='Path to JSON file containing message history (optional)') + + args = parser.parse_args() + + # Check if API URL and key are provided + if not args.api_url or not args.api_url.startswith('http'): + print("Error: Valid API URL is required.") + print("Set with --api-url or HF_BASE_URL environment variable.") + return + + if not args.api_key: + print("Error: API key is required.") + print("Set with --api-key or HF_API_KEY environment variable.") + return + + # Load message history if provided, otherwise create a new one + if args.messages_file and 
os.path.exists(args.messages_file): + with open(args.messages_file, 'r') as f: + messages = json.load(f) + else: + # Default system prompt from prompts.py + system_prompt = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. + +## Output Format +``` +Thought: ... +Action: ... +``` + +## Action Space + +click(start_box='<|box_start|>(x1,y1)<|box_end|>') +left_double(start_box='<|box_start|>(x1,y1)<|box_end|>') +right_single(start_box='<|box_start|>(x1,y1)<|box_end|>') +drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>') +hotkey(key='') +type(content='xxx') # Use escape characters \\', \\\", and \\n in content part to ensure we can parse the content in normal python string format. If you want to submit your input, use \\n at the end of content. +scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left') +wait() #Sleep for 5s and take a screenshot to check for any changes. +finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format. + + +## Note +- Use English in `Thought` part. +- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part. + +## User Instruction +{instruction} +""" + # Create a new message history + messages = [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": args.task} + ] + + # Query the model + try: + response = query_model_openai_compatible( + args.api_url, + args.api_key, + messages, + args.image + ) + print("\nModel Response:") + print(response) + + # Add the response to messages and save if message file is provided + messages.append({"role": "assistant", "content": response}) + if args.messages_file: + with open(args.messages_file, 'w') as f: + json.dump(messages, f, indent=2) + + except Exception as e: + print(f"Error querying the model: {e}") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/coordinate_processing_script.py b/coordinate_processing_script.py new file mode 100644 index 0000000..f5ca12b --- /dev/null +++ b/coordinate_processing_script.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python3 +# Coordinate Processing Script for UI-TARS + +import os +import re +import math +from PIL import Image +import matplotlib.pyplot as plt +import json +import argparse + +# Constants from the README_coordinates.md +IMAGE_FACTOR = 28 +MIN_PIXELS = 100 * 28 * 28 +MAX_PIXELS = 16384 * 28 * 28 +MAX_RATIO = 200 + +def round_by_factor(number: int, factor: int) -> int: + """Returns the closest integer to 'number' that is divisible by 'factor'.""" + return round(number / factor) * factor + +def ceil_by_factor(number: int, factor: int) -> int: + """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'.""" + return math.ceil(number / factor) * factor + +def floor_by_factor(number: int, factor: int) -> int: + """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'.""" + return math.floor(number / factor) * factor + +def smart_resize( + height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS +) -> tuple[int, int]: + """ + Rescales the image so that the following conditions are met: + 1. Both dimensions (height and width) are divisible by 'factor'. + 2. 
The total number of pixels is within the range ['min_pixels', 'max_pixels']. + 3. The aspect ratio of the image is maintained as closely as possible. + """ + if max(height, width) / min(height, width) > MAX_RATIO: + raise ValueError( + f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}" + ) + h_bar = max(factor, round_by_factor(height, factor)) + w_bar = max(factor, round_by_factor(width, factor)) + if h_bar * w_bar > max_pixels: + beta = math.sqrt((height * width) / max_pixels) + h_bar = floor_by_factor(height / beta, factor) + w_bar = floor_by_factor(width / beta, factor) + elif h_bar * w_bar < min_pixels: + beta = math.sqrt(min_pixels / (height * width)) + h_bar = ceil_by_factor(height * beta, factor) + w_bar = ceil_by_factor(width * beta, factor) + return h_bar, w_bar + +def process_coordinates(image_path, model_output, output_path=None): + """Process coordinates from model output and visualize them on the image.""" + # Extract coordinates using regex + coordinates_match = re.search(r"start_box='?\((\d+),\s*(\d+)\)'?", model_output) + if not coordinates_match: + print("No coordinates found in the model output.") + return + + model_output_width = int(coordinates_match.group(1)) + model_output_height = int(coordinates_match.group(2)) + + # Open the image + img = Image.open(image_path) + width, height = img.size + print(f'Original image dimensions: {width}x{height}') + + # Calculate the new dimensions + new_height, new_width = smart_resize(height, width) + new_coordinate = ( + int(model_output_width/new_width * width), + int(model_output_height/new_height * height) + ) + print(f'Resized dimensions: {new_width}x{new_height}') + print(f'Original model coordinates: ({model_output_width},{model_output_height})') + print(f'Mapped screen coordinates: {new_coordinate}') + + # Display the image + plt.figure(figsize=(10, 10)) + plt.imshow(img) + plt.scatter([new_coordinate[0]], [new_coordinate[1]], c='red', s=100) # Mark the point with a red dot + plt.title('Visualized Coordinate') + plt.axis('off') # Set to 'off' to hide the axes + + if output_path: + plt.savefig(output_path, dpi=350) + print(f"Visualization saved to {output_path}") + else: + plt.show() + +def main(): + parser = argparse.ArgumentParser(description='Process UI-TARS model coordinates.') + parser.add_argument('--image', type=str, default='./data/coordinate_process_image.png', + help='Path to the image') + parser.add_argument('--model-output', type=str, + default="Action: click(start_box='(197,525)')", + help='Model output containing coordinates') + parser.add_argument('--output', type=str, default='./data/processed_image.png', + help='Output path for visualization') + + args = parser.parse_args() + + process_coordinates(args.image, args.model_output, args.output) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..2863abd --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,43 @@ +version: '3.8' + +services: + ui-tars: + build: + context: . + dockerfile: Dockerfile + volumes: + - ./data:/app/data + - model-cache:/app/model + environment: + - HF_MODEL_ID=ByteDance-Seed/UI-TARS-1.5-7B + - TRANSFORMERS_CACHE=/app/model + - HF_HOME=/app/model + ports: + - "8000:8000" + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] + + # Optional inference client + ui-tars-infer: + build: + context: . 
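+      # The inference client is built from the same context and Dockerfile as the
+      # ui-tars server service, so both services run the same image.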
+ dockerfile: Dockerfile + volumes: + - ./data:/app/data + - model-cache:/app/model + environment: + - SERVER_URL=http://ui-tars:8000/v1/chat/completions + depends_on: + - ui-tars + command: infer /app/data/your_screenshot.png "Your instruction here" + profiles: + - infer + +volumes: + model-cache: + driver: local \ No newline at end of file diff --git a/example_webpage_analysis.txt b/example_webpage_analysis.txt new file mode 100644 index 0000000..77230a9 --- /dev/null +++ b/example_webpage_analysis.txt @@ -0,0 +1,103 @@ +# Example Webpage Analysis Output + +## Layout and Structure +The webpage follows a standard e-commerce layout with a clear hierarchy. The page is organized into the following sections from top to bottom: +- Header with navigation bar (top) +- Hero banner with promotional content (upper section) +- Product categories grid (middle section) +- Featured products carousel (middle-lower section) +- Customer testimonials (lower section) +- Newsletter signup (bottom) +- Footer with site links (very bottom) + +The layout uses a responsive grid system, with content organized in both single-column and multi-column formats depending on the section. + +## Navigation Elements +- Primary navigation bar at the top featuring: + * Company logo on the upper left + * Main menu categories (Home, Products, Categories, About, Contact) + * Search bar centered in the navigation + * User account icon, wishlist icon, and shopping cart icon on the upper right +- Secondary navigation in the footer containing: + * Customer service links + * Company information links + * Social media icons + * Legal information links +- Breadcrumb navigation below the header showing: Home > Products + +## Main Content Sections +1. Hero Banner: + - Large promotional image featuring a summer sale + - Overlay text: "Summer Collection 2025" + - Call-to-action button: "Shop Now" + +2. Product Categories: + - Grid of 6 category cards with images + - Categories include: Electronics, Clothing, Home & Garden, Beauty, Sports, Books + - Each card has a category name and a "Browse" link + +3. Featured Products: + - Horizontal scrollable carousel with 4 visible products + - Each product card includes: + * Product image + * Product name + * Price (with original and discounted prices) + * Star rating (1-5 stars) + * "Add to Cart" button + +4. Customer Testimonials: + - Three customer review cards + - Each card contains a customer photo, name, quote, and rating + +5. 
Newsletter Section: + - Heading: "Subscribe to our newsletter" + - Text input field for email address + - "Subscribe" button + +## Interactive Elements +- Buttons: + * Primary CTA buttons in blue (Shop Now, Subscribe) + * Secondary buttons in white with borders (Browse, View All) + * Icon buttons in the navigation (search, account, wishlist, cart) + * "Add to Cart" buttons on product cards + +- Forms: + * Search form in the navigation bar + * Newsletter signup form with email input field + +- Menus: + * Main navigation menu with dropdown functionality + * Mobile menu hamburger icon (visible only on small screens) + +- Other interactive elements: + * Carousel navigation arrows for featured products + * Product rating stars + * Social media sharing buttons + * Pagination dots below the carousel + +## Visual Design Elements +- Color scheme: + * Primary brand color: Blue (#0066CC) + * Secondary colors: White, light gray, dark gray + * Accent color: Orange (#FF6600) for sale prices and promotions + +- Typography: + * Sans-serif font family for headings and body text + * Larger, bold typography for headings + * Medium-weight fonts for navigation and buttons + * Light-weight fonts for descriptive text + +- Imagery: + * High-quality product photography with consistent style + * Lifestyle imagery in the hero banner + * User avatars in the testimonial section + * Category thumbnail images with slight overlay + +- Visual hierarchy: + * Prominent hero banner draws initial attention + * Consistent use of whitespace between sections + * Clear visual separation between different content areas + * Contrasting colors for call-to-action elements + * Shadow effects on cards to create depth + +This webpage is well-structured, visually appealing, and follows modern e-commerce design principles with clear navigation and intuitive interaction patterns. \ No newline at end of file diff --git a/webpage_analyzer.py b/webpage_analyzer.py new file mode 100644 index 0000000..8197dbe --- /dev/null +++ b/webpage_analyzer.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python3 +# UI-TARS Webpage Analysis Script + +import os +import sys +import json +import base64 +import argparse +from PIL import Image +from io import BytesIO +import requests + +def encode_image(image_path): + """Encode an image to base64.""" + with open(image_path, "rb") as image_file: + return base64.b64encode(image_file.read()).decode("utf-8") + +def analyze_webpage(image_path, server_url="http://localhost:8000/v1/chat/completions"): + """ + Analyze a webpage screenshot and return a detailed plaintext description. + Uses the UI-TARS model to perform the analysis. + """ + # Encode the image + base64_image = encode_image(image_path) + + # Create analysis instruction + instruction = """ + Analyze this webpage screenshot and provide a detailed plaintext description of: + 1. Page layout and structure + 2. Navigation elements + 3. Main content sections + 4. Interactive elements (buttons, forms, menus) + 5. Visual design elements + + Focus on being comprehensive but concise. Organize your description logically. + """ + + # Prepare the API request + headers = {"Content-Type": "application/json"} + payload = { + "model": "UI-TARS", + "messages": [ + {"role": "system", "content": "You are an expert UI analyzer. 
Given a screenshot of a webpage, you provide detailed and structured descriptions of the interface elements, layout, and content organization."}, + {"role": "user", "content": [ + {"type": "text", "text": instruction}, + {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}} + ]} + ], + "temperature": 0.1, + "max_tokens": 1024 + } + + # Make the API request + try: + response = requests.post(server_url, headers=headers, json=payload) + response.raise_for_status() + result = response.json() + return result["choices"][0]["message"]["content"] + except Exception as e: + return f"Error: {str(e)}\n\nIf the UI-TARS server is not running, start it with:\ndocker-compose up -d ui-tars" + +def main(): + parser = argparse.ArgumentParser(description="UI-TARS Webpage Analysis") + parser.add_argument("--image", required=True, help="Path to webpage screenshot") + parser.add_argument("--server", default="http://localhost:8000/v1/chat/completions", help="Server URL") + parser.add_argument("--output", help="Output file path (optional, outputs to console if not specified)") + + args = parser.parse_args() + + print(f"Analyzing webpage screenshot: {args.image}") + result = analyze_webpage(args.image, args.server) + + if args.output: + with open(args.output, "w") as f: + f.write(result) + print(f"Analysis saved to: {args.output}") + else: + print("\n=== WEBPAGE ANALYSIS ===\n") + print(result) + +if __name__ == "__main__": + main() \ No newline at end of file