diff --git a/.github/workflows/docker-scripts.yml b/.github/workflows/docker-scripts.yml new file mode 100644 index 00000000..25519d03 --- /dev/null +++ b/.github/workflows/docker-scripts.yml @@ -0,0 +1,231 @@ +name: Docker Scripts Validation + +on: + push: + branches: ["main"] + paths: + - "Dockerfile" + - "scripts/**" + - "pyproject.toml" + - "uv.lock" + pull_request: + branches: ["main"] + paths: + - "Dockerfile" + - "scripts/**" + - "pyproject.toml" + - "uv.lock" + +permissions: + contents: read + +jobs: + validate-docker-scripts: + runs-on: ubuntu-latest + + strategy: + matrix: + script: + - name: "PDF Ingestion" + extras: "pdf" + script_file: "ingest_pdf.py" + test_timeout: "300" # 5 minutes + + env: + # Test environment variables + LOG_LEVEL: INFO + AGENT__GEMINI_MODEL: "gemini-2.0-flash" + AGENT__GEMINI_API_KEY: ${{ secrets.AGENT__GEMINI_API_KEY }} + ECOSYSTEM__WEB3_PROVIDER_URL: "https://stylish-light-theorem.flare-mainnet.quiknode.pro/ext/bc/C/rpc" + INGESTION__CHUNK_SIZE: 5000 + TEE__SIMULATE_ATTESTATION_TOKEN: true + + steps: + - name: Checkout repository + uses: actions/checkout@v5 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Build Docker image for ${{ matrix.script.name }} + run: | + docker build \ + --build-arg EXTRAS=${{ matrix.script.extras }} \ + --build-arg SCRIPT=${{ matrix.script.script_file }} \ + --tag fai-script-${{ matrix.script.extras }} \ + --cache-from type=gha \ + --cache-to type=gha,mode=max \ + . 
+ + - name: Validate script exists in image + run: | + docker run --rm fai-script-${{ matrix.script.extras }} \ + test -f "/app/scripts/${{ matrix.script.script_file }}" + + - name: Test script startup (dry run) + timeout-minutes: 5 + run: | + # Simple validation that the script exists and dependencies are available + docker run --rm \ + -e LOG_LEVEL="$LOG_LEVEL" \ + -e AGENT__GEMINI_MODEL="$AGENT__GEMINI_MODEL" \ + -e AGENT__GEMINI_API_KEY="$AGENT__GEMINI_API_KEY" \ + -e ECOSYSTEM__WEB3_PROVIDER_URL="$ECOSYSTEM__WEB3_PROVIDER_URL" \ + -e INGESTION__CHUNK_SIZE="$INGESTION__CHUNK_SIZE" \ + -e TEE__SIMULATE_ATTESTATION_TOKEN="$TEE__SIMULATE_ATTESTATION_TOKEN" \ + fai-script-${{ matrix.script.extras }} \ + python -c " + import sys + import os + + # Test that script file exists + script_path = '/app/scripts/${{ matrix.script.script_file }}' + if not os.path.exists(script_path): + print(f'❌ Script not found: {script_path}') + sys.exit(1) + print(f'✅ Script exists: {script_path}') + + # Test that required dependencies are available + if '${{ matrix.script.extras }}' == 'pdf': + try: + import PIL + import fitz # pymupdf + import pytesseract + print('✅ PDF dependencies available') + except ImportError as e: + print(f'❌ PDF dependency missing: {e}') + sys.exit(1) + + print('✅ Script validation completed successfully') + " + + - name: Test container health + run: | + # Test that the container can start and the Python environment is healthy + docker run --rm fai-script-${{ matrix.script.extras }} \ + python -c " + import sys + print(f'Python version: {sys.version}') + print(f'Python path: {sys.path}') + + # Test core dependencies (some modules may require optional deps) + try: + import flare_ai_kit + print('✅ flare-ai-kit imported successfully') + except ImportError as e: + print(f'⚠️ flare-ai-kit import issue (may need more extras): {e}') + # Test basic Python packages instead + import httpx, pydantic, structlog + print('✅ Core Python dependencies available') + + # Test that 
uv environment is working + import subprocess + result = subprocess.run(['/app/.venv/bin/python', '--version'], + capture_output=True, text=True) + print(f'Virtual env Python: {result.stdout.strip()}') + + print('✅ Container health check passed') + " + + - name: Test script dependencies for ${{ matrix.script.name }} + run: | + # Test that the specific extras are properly installed + docker run --rm fai-script-${{ matrix.script.extras }} \ + python -c " + import sys + + extras = '${{ matrix.script.extras }}' + print(f'Testing dependencies for extras: {extras}') + + if 'pdf' in extras: + try: + import PIL + import fitz + import pytesseract + print('✅ PDF dependencies (PIL, fitz, pytesseract) available') + except ImportError as e: + print(f'❌ PDF dependency missing: {e}') + sys.exit(1) + + if 'rag' in extras: + try: + import qdrant_client + import dulwich + print('✅ RAG dependencies (qdrant_client, dulwich) available') + except ImportError as e: + print(f'❌ RAG dependency missing: {e}') + sys.exit(1) + + if 'a2a' in extras: + try: + import fastapi + print('✅ A2A dependencies (fastapi) available') + except ImportError as e: + print(f'❌ A2A dependency missing: {e}') + sys.exit(1) + + print('✅ All expected dependencies are available') + " + + + + validate-build-args: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v5 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Test build without extras + run: | + docker build \ + --build-arg SCRIPT=ingest_pdf.py \ + --tag fai-script-base \ + . + + - name: Test build with multiple extras + run: | + docker build \ + --build-arg EXTRAS=pdf,rag \ + --build-arg SCRIPT=ingest_pdf.py \ + --tag fai-script-multi \ + . 
+ + - name: Validate multi-extras build + run: | + docker run --rm fai-script-multi \ + python -c " + import PIL, fitz, pytesseract # PDF deps + import qdrant_client, dulwich # RAG deps + print('✅ Multiple extras build successful') + " + + validate-documentation: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v5 + + - name: Check documentation exists + run: | + test -f docs/docker_scripts_guide.md + echo "✅ Docker scripts guide exists" + + - name: Validate README updates + run: | + grep -q "parametric Dockerfile" README.md + grep -q "EXTRAS" README.md + grep -q "docker_scripts_guide.md" README.md + echo "✅ README contains Docker scripts documentation" + + - name: Check scripts directory structure + run: | + test -d scripts + test -f scripts/ingest_pdf.py + test -d scripts/data + test -f scripts/data/create_sample_invoice.py + echo "✅ Scripts directory structure is correct" diff --git a/Dockerfile b/Dockerfile index 2e319315..631b044b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,27 +1,99 @@ +# Parametric Dockerfile for running scripts with specific extras +# Usage: +# docker build -t fai-script-pdf --build-arg EXTRAS=pdf --build-arg SCRIPT=ingest_pdf.py . 
+# docker run --rm -it -v "$PWD/data:/app/scripts/data" fai-script-pdf + +# Build arguments for parametric behavior +ARG EXTRAS="" +ARG SCRIPT="ingest_pdf.py" + # Add in prod FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim AS builder + +# Pass build args to builder stage +ARG EXTRAS +ARG SCRIPT + ENV UV_COMPILE_BYTECODE=1 \ UV_LINK_MODE=copy \ UV_PYTHON_DOWNLOADS=0 + WORKDIR /app + +# Install system dependencies for PDF processing (if needed) +RUN apt-get update && apt-get install -y --no-install-recommends \ + tesseract-ocr \ + tesseract-ocr-eng \ + poppler-utils \ + && rm -rf /var/lib/apt/lists/* + +# Copy dependency files first for better caching +COPY uv.lock pyproject.toml ./ + +# Install dependencies based on EXTRAS parameter RUN --mount=type=cache,target=/root/.cache/uv \ - --mount=type=bind,source=uv.lock,target=uv.lock \ - --mount=type=bind,source=pyproject.toml,target=pyproject.toml \ - uv sync --locked --no-install-project --all-extras --no-dev --no-editable + if [ -n "$EXTRAS" ]; then \ + echo "Installing with extras: $EXTRAS"; \ + # Convert comma-separated extras to space-separated for uv + EXTRAS_ARGS=$(echo "$EXTRAS" | sed 's/,/ --extra /g'); \ + echo "Installing extras: $EXTRAS_ARGS"; \ + uv sync --locked --no-install-project --extra $EXTRAS_ARGS --no-dev --no-editable; \ + else \ + echo "Installing base dependencies only"; \ + uv sync --locked --no-install-project --no-dev --no-editable; \ + fi + +# Copy the entire project COPY . 
/app + +# Install the project itself RUN --mount=type=cache,target=/root/.cache/uv \ - uv sync --locked --all-extras --no-dev --no-editable + if [ -n "$EXTRAS" ]; then \ + # Convert comma-separated extras to space-separated for uv + EXTRAS_ARGS=$(echo "$EXTRAS" | sed 's/,/ --extra /g'); \ + echo "Installing project with extras: $EXTRAS_ARGS"; \ + uv sync --locked --extra $EXTRAS_ARGS --no-dev --no-editable; \ + else \ + uv sync --locked --no-dev --no-editable; \ + fi + +# Clean up cache RUN rm -rf /root/.cache/uv /root/.cache/pip # Add in prod FROM python:3.12-slim-bookworm AS runtime +# Pass build args to runtime stage +ARG EXTRAS +ARG SCRIPT + +# Install runtime system dependencies for PDF processing (if needed) +RUN apt-get update && apt-get install -y --no-install-recommends \ + tesseract-ocr \ + tesseract-ocr-eng \ + poppler-utils \ + && rm -rf /var/lib/apt/lists/* + ENV PIP_NO_CACHE_DIR=1 \ - UV_PYTHON_DOWNLOADS=0 + UV_PYTHON_DOWNLOADS=0 \ + SCRIPT_NAME="$SCRIPT" + +# Create non-root user RUN groupadd -r app && \ useradd -r -g app -d /nonexistent -s /usr/sbin/nologin app -USER app -WORKDIR /app + +# Copy built application from builder stage COPY --from=builder --chown=app:app /app /app + +# Set working directory and PATH +WORKDIR /app ENV PATH="/app/.venv/bin:$PATH" -CMD ["/app/.venv/bin/flare-ai-kit"] \ No newline at end of file + +# Switch to non-root user +USER app + +# Validate that the script exists +RUN test -f "/app/scripts/$SCRIPT" || (echo "Error: Script /app/scripts/$SCRIPT not found" && exit 1) + +# Default command runs the specified script +CMD ["sh", "-c", "cd /app/scripts && python \"$SCRIPT_NAME\""] \ No newline at end of file diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..3377b6c4 --- /dev/null +++ b/Makefile @@ -0,0 +1,208 @@ +# Flare AI Kit - Docker Scripts Makefile +# Provides convenient targets for building and running Docker scripts + +.PHONY: help build-pdf run-pdf build-rag run-rag build-a2a run-a2a build-multi 
run-multi clean-images list-images + +# Default target +help: ## Show this help message + @echo "Flare AI Kit - Docker Scripts" + @echo "=============================" + @echo "" + @echo "Available targets:" + @awk 'BEGIN {FS = ":.*?## "} /^[a-zA-Z_-]+:.*?## / {printf " %-20s %s\n", $$1, $$2}' $(MAKEFILE_LIST) + @echo "" + @echo "Environment variables:" + @echo " DATA_DIR - Local directory to mount as /app/scripts/data (default: ./scripts/data)" + @echo " ENV_FILE - Environment file to use (default: .env)" + @echo " DOCKER_OPTS - Additional docker run options" + @echo "" + @echo "Examples:" + @echo " make run-pdf DATA_DIR=./my-pdfs" + @echo " make run-pdf ENV_FILE=.env.production" + @echo " make run-pdf DOCKER_OPTS='--rm -it'" + +# Configuration +DATA_DIR ?= ./scripts/data +ENV_FILE ?= .env +DOCKER_OPTS ?= --rm -it + +# PDF Processing +build-pdf: ## Build Docker image for PDF processing + @echo "Building PDF processing image..." + docker build \ + --build-arg EXTRAS=pdf \ + --build-arg SCRIPT=ingest_pdf.py \ + --tag fai-script-pdf \ + . + @echo "✅ PDF image built: fai-script-pdf" + +run-pdf: build-pdf ## Build and run PDF processing script + @echo "Running PDF processing script..." + @if [ ! -f "$(ENV_FILE)" ]; then \ + echo "⚠️ Environment file $(ENV_FILE) not found. Creating example..."; \ + echo "AGENT__GEMINI_API_KEY=your_gemini_api_key_here" > $(ENV_FILE).example; \ + echo "LOG_LEVEL=INFO" >> $(ENV_FILE).example; \ + echo "Please copy $(ENV_FILE).example to $(ENV_FILE) and configure your API keys"; \ + exit 1; \ + fi + @mkdir -p $(DATA_DIR) + docker run $(DOCKER_OPTS) \ + --env-file $(ENV_FILE) \ + -v "$(shell pwd)/$(DATA_DIR):/app/scripts/data" \ + fai-script-pdf + +# RAG Processing +build-rag: ## Build Docker image for RAG processing + @echo "Building RAG processing image..." + docker build \ + --build-arg EXTRAS=rag \ + --build-arg SCRIPT=ingest_pdf.py \ + --tag fai-script-rag \ + . 
+ @echo "✅ RAG image built: fai-script-rag" + +run-rag: build-rag ## Build and run RAG processing script + @echo "Running RAG processing script..." + @if [ ! -f "$(ENV_FILE)" ]; then \ + echo "⚠️ Environment file $(ENV_FILE) not found"; \ + exit 1; \ + fi + @mkdir -p $(DATA_DIR) + docker run $(DOCKER_OPTS) \ + --env-file $(ENV_FILE) \ + -v "$(shell pwd)/$(DATA_DIR):/app/scripts/data" \ + fai-script-rag + +# A2A Processing +build-a2a: ## Build Docker image for A2A processing + @echo "Building A2A processing image..." + docker build \ + --build-arg EXTRAS=a2a \ + --build-arg SCRIPT=ingest_pdf.py \ + --tag fai-script-a2a \ + . + @echo "✅ A2A image built: fai-script-a2a" + +run-a2a: build-a2a ## Build and run A2A processing script + @echo "Running A2A processing script..." + @if [ ! -f "$(ENV_FILE)" ]; then \ + echo "⚠️ Environment file $(ENV_FILE) not found"; \ + exit 1; \ + fi + @mkdir -p $(DATA_DIR) + docker run $(DOCKER_OPTS) \ + --env-file $(ENV_FILE) \ + -v "$(shell pwd)/$(DATA_DIR):/app/scripts/data" \ + fai-script-a2a + +# Multi-functionality build +build-multi: ## Build Docker image with multiple extras (pdf,rag,a2a) + @echo "Building multi-functionality image..." + docker build \ + --build-arg EXTRAS=pdf,rag,a2a \ + --build-arg SCRIPT=ingest_pdf.py \ + --tag fai-script-multi \ + . + @echo "✅ Multi image built: fai-script-multi" + +run-multi: build-multi ## Build and run multi-functionality script + @echo "Running multi-functionality script..." + @if [ ! 
-f "$(ENV_FILE)" ]; then \ + echo "⚠️ Environment file $(ENV_FILE) not found"; \ + exit 1; \ + fi + @mkdir -p $(DATA_DIR) + docker run $(DOCKER_OPTS) \ + --env-file $(ENV_FILE) \ + -v "$(shell pwd)/$(DATA_DIR):/app/scripts/data" \ + fai-script-multi + +# Custom builds +build-custom: ## Build custom image (use EXTRAS and SCRIPT env vars) + @if [ -z "$(EXTRAS)" ]; then \ + echo "❌ EXTRAS environment variable is required"; \ + echo "Usage: make build-custom EXTRAS=pdf,rag SCRIPT=my_script.py"; \ + exit 1; \ + fi + @if [ -z "$(SCRIPT)" ]; then \ + echo "❌ SCRIPT environment variable is required"; \ + echo "Usage: make build-custom EXTRAS=pdf,rag SCRIPT=my_script.py"; \ + exit 1; \ + fi + @echo "Building custom image with EXTRAS=$(EXTRAS) SCRIPT=$(SCRIPT)..." + docker build \ + --build-arg EXTRAS=$(EXTRAS) \ + --build-arg SCRIPT=$(SCRIPT) \ + --tag fai-script-custom \ + . + @echo "✅ Custom image built: fai-script-custom" + +run-custom: build-custom ## Build and run custom script (use EXTRAS and SCRIPT env vars) + @echo "Running custom script..." + @if [ ! -f "$(ENV_FILE)" ]; then \ + echo "⚠️ Environment file $(ENV_FILE) not found"; \ + exit 1; \ + fi + @mkdir -p $(DATA_DIR) + docker run $(DOCKER_OPTS) \ + --env-file $(ENV_FILE) \ + -v "$(shell pwd)/$(DATA_DIR):/app/scripts/data" \ + fai-script-custom + +# Development helpers +dev-shell: build-pdf ## Start interactive shell in PDF container for development + @echo "Starting development shell..." 
+ @mkdir -p $(DATA_DIR) + docker run $(DOCKER_OPTS) \ + --env-file $(ENV_FILE) \ + -v "$(shell pwd)/scripts:/app/scripts" \ + -v "$(shell pwd)/src:/app/src" \ + -v "$(shell pwd)/$(DATA_DIR):/app/scripts/data" \ + --entrypoint /bin/bash \ + fai-script-pdf + +# Utility targets +list-images: ## List all fai-script Docker images + @echo "Flare AI Kit script images:" + @docker images --filter "reference=fai-script-*" --format "table {{.Repository}}\t{{.Tag}}\t{{.Size}}\t{{.CreatedAt}}" + +clean-images: ## Remove all fai-script Docker images + @echo "Removing all fai-script images..." + @docker images --filter "reference=fai-script-*" -q | xargs -r docker rmi -f + @echo "✅ Cleaned up fai-script images" + +# Test targets +test-build: ## Test building all main image variants + @echo "Testing all main builds..." + @make build-pdf + @make build-rag + @make build-a2a + @make build-multi + @echo "✅ All builds completed successfully" + +# Environment setup +setup-env: ## Create example environment file + @if [ -f "$(ENV_FILE)" ]; then \ + echo "⚠️ $(ENV_FILE) already exists"; \ + else \ + echo "Creating example environment file: $(ENV_FILE)"; \ + echo "# Flare AI Kit Configuration" > $(ENV_FILE); \ + echo "LOG_LEVEL=INFO" >> $(ENV_FILE); \ + echo "" >> $(ENV_FILE); \ + echo "# AI Agent Configuration" >> $(ENV_FILE); \ + echo "AGENT__GEMINI_API_KEY=your_gemini_api_key_here" >> $(ENV_FILE); \ + echo "AGENT__GEMINI_MODEL=gemini-2.0-flash" >> $(ENV_FILE); \ + echo "" >> $(ENV_FILE); \ + echo "# Blockchain Configuration" >> $(ENV_FILE); \ + echo "ECOSYSTEM__WEB3_PROVIDER_URL=https://flare-api.flare.network/ext/C/rpc" >> $(ENV_FILE); \ + echo "" >> $(ENV_FILE); \ + echo "# Processing Configuration" >> $(ENV_FILE); \ + echo "INGESTION__CHUNK_SIZE=5000" >> $(ENV_FILE); \ + echo "" >> $(ENV_FILE); \ + echo "# Testing Configuration" >> $(ENV_FILE); \ + echo "TEE__SIMULATE_ATTESTATION_TOKEN=true" >> $(ENV_FILE); \ + echo "✅ Created $(ENV_FILE) - please configure your API keys"; \ + fi + 
+# Quick start +quick-start: setup-env run-pdf ## Quick start: setup environment and run PDF script diff --git a/README.md b/README.md index a5968bd4..b7ce6adf 100644 --- a/README.md +++ b/README.md @@ -107,11 +107,33 @@ uv run ruff format && uv run ruff check --fix && uv run pyright && uv run pytest ## 🐳 Docker +### Running the Full SDK + ```bash docker build -t flare-ai-kit . docker run --rm --env-file .env flare-ai-kit ``` +### Running Individual Scripts + +The repository includes a parametric Dockerfile for running specific scripts with only the dependencies they need: + +```bash +# Build and run PDF ingestion script +docker build -t fai-script-pdf \ + --build-arg EXTRAS=pdf \ + --build-arg SCRIPT=ingest_pdf.py . + +docker run --rm -it \ + -v "$PWD/scripts/data:/app/scripts/data" \ + --env-file .env \ + fai-script-pdf +``` + +Available `EXTRAS`: `pdf`, `rag`, `a2a`, `ftso`, `da`, `fassets`, `social`, `tee`, `wallet`, `ingestion` + +See [Docker Scripts Guide](docs/docker_scripts_guide.md) for detailed usage instructions. + ## ☁️ Deploy to Confidential Space **Prerequisites:** Authenticated [gcloud CLI](https://cloud.google.com/sdk/docs/install). diff --git a/docs/docker_scripts_guide.md b/docs/docker_scripts_guide.md new file mode 100644 index 00000000..6050d311 --- /dev/null +++ b/docs/docker_scripts_guide.md @@ -0,0 +1,206 @@ +# Docker Scripts Guide + +This guide explains how to use the parametric Dockerfile to run scripts from the `scripts/` directory with specific dependency groups. + +## Overview + +The Dockerfile at the repository root is designed to be parametric, allowing you to: +- Install only the dependencies needed for specific functionality (via `EXTRAS`) +- Run any script from the `scripts/` directory (via `SCRIPT`) +- Keep images minimal and reproducible using `uv.lock` + +## Build Arguments + +### `EXTRAS` +Specifies which optional dependency groups to install. 
Available options: +- `pdf` - PDF processing (pillow, pymupdf, pytesseract) +- `rag` - Vector RAG (qdrant-client, dulwich) +- `a2a` - Agent-to-Agent communication (fastapi) +- `ftso` - FTSO price oracle functionality +- `da` - Data Availability layer functionality +- `fassets` - FAssets protocol functionality +- `social` - Social media integrations (telegram, twitter, etc.) +- `tee` - Trusted Execution Environment (cryptography, jwt) +- `wallet` - Wallet functionality (eth-account, cryptography) +- `ingestion` - General ingestion capabilities + +### `SCRIPT` +Specifies which script to run from the `scripts/` directory. Default: `ingest_pdf.py` + +## Basic Usage + +### PDF Ingestion Script + +```bash +# Build the image for PDF processing +docker build -t fai-script-pdf \ + --build-arg EXTRAS=pdf \ + --build-arg SCRIPT=ingest_pdf.py . + +# Run the script +docker run --rm -it \ + -v "$PWD/scripts/data:/app/scripts/data" \ + fai-script-pdf +``` + +### With Environment Variables + +```bash +# Run with environment variables for API keys and configuration +docker run --rm -it \ + -e AGENT__GEMINI_API_KEY="your_gemini_api_key" \ + -e ECOSYSTEM__WEB3_PROVIDER_URL="https://flare-api.flare.network/ext/C/rpc" \ + -e LOG_LEVEL="INFO" \ + -v "$PWD/scripts/data:/app/scripts/data" \ + fai-script-pdf +``` + +### Using Environment File + +```bash +# Create a .env file with your configuration +cat > .env.docker << EOF +AGENT__GEMINI_API_KEY=your_gemini_api_key +ECOSYSTEM__WEB3_PROVIDER_URL=https://flare-api.flare.network/ext/C/rpc +LOG_LEVEL=INFO +EOF + +# Run with environment file +docker run --rm -it \ + --env-file .env.docker \ + -v "$PWD/scripts/data:/app/scripts/data" \ + fai-script-pdf +``` + +## Advanced Usage + +### Multiple Extras + +```bash +# Build with multiple dependency groups +docker build -t fai-script-multi \ + --build-arg EXTRAS="pdf,rag,a2a" \ + --build-arg SCRIPT=ingest_pdf.py . 
+``` + +### Custom Script + +```bash +# Build for a custom script (once you create more scripts) +docker build -t fai-script-custom \ + --build-arg EXTRAS=rag \ + --build-arg SCRIPT=my_custom_script.py . +``` + +### Development Mode with Volume Mounts + +```bash +# Mount the entire scripts directory for development +docker run --rm -it \ + -v "$PWD/scripts:/app/scripts" \ + -v "$PWD/src:/app/src" \ + --env-file .env.docker \ + fai-script-pdf +``` + +## Data Mounting + +### PDF Data Directory + +The PDF ingestion script expects data in `/app/scripts/data/`. Mount your local data: + +```bash +# Mount local data directory +docker run --rm -it \ + -v "$PWD/my-pdfs:/app/scripts/data" \ + fai-script-pdf +``` + +### Persistent Output + +```bash +# Mount output directory for persistent results +docker run --rm -it \ + -v "$PWD/scripts/data:/app/scripts/data" \ + -v "$PWD/output:/app/output" \ + fai-script-pdf +``` + +## Environment Variables + +### Required for PDF Processing +- `AGENT__GEMINI_API_KEY` - Google Gemini API key for AI processing + +### Optional Configuration +- `LOG_LEVEL` - Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL) +- `ECOSYSTEM__WEB3_PROVIDER_URL` - Web3 provider URL for blockchain interactions +- `INGESTION__CHUNK_SIZE` - Text chunk size for processing (default: 5000) + +### For Other Functionality +- `VECTOR_DB__QDRANT_URL` - Qdrant vector database URL (for RAG) +- `SOCIAL__TELEGRAM_API_TOKEN` - Telegram bot token (for social features) +- `TEE__SIMULATE_ATTESTATION_TOKEN` - Simulate TEE attestation (for testing) + +## Native Package Dependencies + +The Dockerfile includes native packages required for PDF processing: + +### Included Packages +- `tesseract-ocr` - OCR engine for text extraction from images +- `tesseract-ocr-eng` - English language pack for Tesseract +- `poppler-utils` - PDF utilities for document processing + +### Adding More Languages + +To support additional languages for OCR, extend the Dockerfile: + +```dockerfile +# Add 
more Tesseract language packs +RUN apt-get update && apt-get install -y --no-install-recommends \ + tesseract-ocr-fra \ + tesseract-ocr-deu \ + tesseract-ocr-spa \ + && rm -rf /var/lib/apt/lists/* +``` + +## Troubleshooting + +### Script Not Found Error +``` +Error: Script /app/scripts/my_script.py not found +``` +Ensure your script exists in the `scripts/` directory and the filename matches the `SCRIPT` build arg. + +### Missing Dependencies +``` +ModuleNotFoundError: No module named 'qdrant_client' +``` +Make sure you included the correct `EXTRAS` when building the image. + +### Permission Issues +``` +PermissionError: [Errno 13] Permission denied +``` +Check that mounted volumes have correct permissions. The container runs as user `app` (non-root). + +### OCR Issues +``` +TesseractNotFoundError: tesseract is not installed +``` +This shouldn't happen with the provided Dockerfile, but if it does, ensure Tesseract is properly installed in the image. + +## Best Practices + +1. **Use specific EXTRAS**: Only install the dependencies you need +2. **Environment files**: Use `.env` files for configuration instead of command-line args +3. **Volume mounts**: Mount only necessary directories to keep containers lightweight +4. **Non-root user**: The container runs as non-root user `app` for security +5. **Caching**: The Dockerfile is optimized for Docker layer caching + +## Examples Repository + +See the `scripts/` directory for example scripts: +- `ingest_pdf.py` - PDF ingestion and processing +- More scripts will be added as the project grows + +Each script should be self-contained and follow the same pattern for consistency. 
diff --git a/pyproject.toml b/pyproject.toml index 1394a002..64a7c045 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,9 +30,6 @@ dependencies = [ "structlog>=25.2.0", "tenacity>=8.2.3,<9.0.0", "web3>=7.10.0", - "pillow>=11.3.0", - "pymupdf>=1.26.1", - "pytesseract>=0.3.13", "gitpython>=3.1.45", "google-adk>=1.9.0", ] @@ -47,10 +44,32 @@ Issues = "https://github.com/flare-foundation/flare-ai-kit/issues" flare-ai-kit = "flare_ai_kit.main:start" [project.optional-dependencies] +# Core functionality groups +ftso = [ + # FTSO functionality uses core dependencies (web3, httpx, pydantic) + # No additional dependencies required +] +da = [ + # Data Availability layer functionality uses core dependencies + # No additional dependencies required +] +fassets = [ + # FAssets functionality uses core dependencies + # No additional dependencies required +] +pdf = [ + "pillow>=11.3.0", + "pymupdf>=1.26.1", + "pytesseract>=0.3.13" +] rag = [ "qdrant-client>=1.13.3", "dulwich>=0.23.2" ] +a2a = [ + "fastapi[standard]>=0.116.1", +] +# Additional functionality groups social = [ "python-telegram-bot>=22.0", "tweepy>=4.15.0", @@ -63,9 +82,6 @@ tee = [ "pyjwt>=2.10.1", "pyopenssl>=25.0.0" ] -a2a = [ - "fastapi[standard]>=0.116.1", -] wallet = [ "httpx>=0.28.1", "cryptography>=44.0.2", @@ -106,6 +122,7 @@ ignore = ["D203", "D212", "COM812", "S105", "D401", "D104", "ANN401", "ISC003", "src/flare_ai_kit/agent/ecosystem_tools_wrapper.py" = ["PLC0415"] "tests/**/*.py" = ["S", "ARG", "PLR2004", "SLF001", "BLE001", "E501", "T201", "D", "ANN", "F821", "PLC"] "examples/**/*.py" = ["D", "T201", "BLE001", "INP001", "E501"] +"scripts/**/*.py" = ["D", "T201", "BLE001", "INP001", "E501"] [tool.pyright] pythonVersion = "3.12" diff --git a/scripts/__init__.py b/scripts/__init__.py new file mode 100644 index 00000000..636b8582 --- /dev/null +++ b/scripts/__init__.py @@ -0,0 +1 @@ +"""Scripts package for flare-ai-kit.""" diff --git a/scripts/data/__init__.py b/scripts/data/__init__.py new file mode 
100644 index 00000000..f23f82b8 --- /dev/null +++ b/scripts/data/__init__.py @@ -0,0 +1 @@ +"""Data utilities for scripts.""" diff --git a/scripts/data/create_sample_invoice.py b/scripts/data/create_sample_invoice.py new file mode 100644 index 00000000..cece7f58 --- /dev/null +++ b/scripts/data/create_sample_invoice.py @@ -0,0 +1,177 @@ +"""Create a sample invoice PDF and build a template for PDF extraction.""" + +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import TYPE_CHECKING + +import fitz # type: ignore[reportMissingTypeStubs] +from reportlab.lib.pagesizes import letter +from reportlab.lib.units import inch +from reportlab.pdfgen import canvas + +from flare_ai_kit.ingestion.settings import ( + PDFFieldExtractionSettings, + PDFTemplateSettings, +) + +if TYPE_CHECKING: + from collections.abc import Iterable + + +# ---------- Constants ---------- +FILE_PATH = Path(__file__).resolve().parent / "sample_invoice.pdf" +INVOICE_ID = "FAI-2025-001" +ISSUE_DATE = "July 10, 2025" +AMOUNT_DUE = "1,250,000" + + +# ---------- Types ---------- +@dataclass(frozen=True) +class RectI: + """Rectangle with integer coordinates.""" + + x0: int + y0: int + x1: int + y1: int + + +# ---------- Helpers ---------- +def _ensure_parent(path: Path) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + + +def _round_rect(r: fitz.Rect) -> RectI: + return RectI(*(round(v) for v in (r.x0, r.y0, r.x1, r.y1))) # type: ignore[reportUnknownMemberType] + + +def _find_label_rect(page: fitz.Page, label: str) -> fitz.Rect: + hits = page.search_for(label) # type: ignore[reportUnknownMemberType] + if not hits: + msg = f"Label not found: '{label}'" + raise ValueError(msg) + return hits[0] # type: ignore[reportUnknownVariableType] + + +def _value_rect_right_of_label( + page: fitz.Page, + label: str, + right_x: float, + pad_x: float = 5.0, + pad_y: float = 2.0, +) -> fitz.Rect: + lr = _find_label_rect(page, label) + return 
fitz.Rect(lr.x1 + pad_x, lr.y0 - pad_y, right_x, lr.y1 + pad_y) + + +def _value_rect_same_line_after_label( + page: fitz.Page, label: str, pad: float = 1.0 +) -> fitz.Rect: + """Union words on the same line to the right of the label for a tight value bbox.""" + lr = _find_label_rect(page, label) + words: Iterable[tuple[float, float, float, float, str, int, int, int]] = ( + page.get_text("words") + ) + y_mid = (lr.y0 + lr.y1) / 2.0 + line_words = [w for w in words if w[1] <= y_mid <= w[3]] + right_side = [w for w in line_words if w[0] >= lr.x1 - 0.5] + if not right_side: + # Fallback to a region to the page right if nothing parsed + return fitz.Rect(lr.x1 + pad, lr.y0 - pad, page.rect.x1 - pad, lr.y1 + pad) + x0 = min(w[0] for w in right_side) - pad + y0 = min(w[1] for w in right_side) - pad + x1 = max(w[2] for w in right_side) + pad + y1 = max(w[3] for w in right_side) + pad + return fitz.Rect(x0, y0, x1, y1) + + +def _coords_to_template( + template_name: str, coords: dict[str, RectI] +) -> PDFTemplateSettings: + fields = [ + PDFFieldExtractionSettings( + field_name=name, + x0=r.x0, + y0=r.y0, + x1=r.x1, + y1=r.y1, + data_type="string", # adjust per-field if needed + ) + for name, r in coords.items() + ] + return PDFTemplateSettings(template_name=template_name, fields=fields) + + +# ---------- Main API ---------- +def create_invoice_and_build_template( + template_name: str = "generated_invoice", +) -> tuple[Path, PDFTemplateSettings]: + """Generate the sample PDF and build PDFTemplateSettings.""" + _ensure_parent(FILE_PATH) + + # ----- Generate PDF ----- + c = canvas.Canvas(str(FILE_PATH), pagesize=letter) + width, _ = letter + + # Header & addresses + c.setFont("Helvetica-Bold", 16) + c.drawString(0.5 * inch, 10 * inch, "Flare AI Systems") + c.setFont("Helvetica", 12) + c.drawString(0.5 * inch, 9.8 * inch, "Wuse II, Abuja, FCT, Nigeria") + c.setFont("Helvetica-Bold", 24) + c.drawRightString(width - 0.5 * inch, 10 * inch, "INVOICE") + c.setFont("Helvetica-Bold", 
12) + c.drawString(0.5 * inch, 9.0 * inch, "BILL TO:") + c.setFont("Helvetica", 12) + c.drawString(0.5 * inch, 8.8 * inch, "Customer Corp") + c.drawString(0.5 * inch, 8.6 * inch, "123 Innovation Drive, Maitama, Abuja") + + # Invoice details + c.setFont("Helvetica-Bold", 12) + c.drawString(5.0 * inch, 9.25 * inch, "Invoice ID:") + c.drawString(5.0 * inch, 9.0 * inch, "Issue Date:") + c.setFont("Helvetica", 12) + c.drawString(6.0 * inch, 9.25 * inch, INVOICE_ID) + c.drawString(6.0 * inch, 9.0 * inch, ISSUE_DATE) + + # Placeholder table + c.line(0.5 * inch, 8.0 * inch, width - 0.5 * inch, 8.0 * inch) + + # Total + c.setFont("Helvetica-Bold", 14) + c.drawString(5.0 * inch, 4.0 * inch, "Total Due:") + c.setFont("Helvetica-Bold", 14) + c.drawRightString(width - 0.7 * inch, 4.0 * inch, AMOUNT_DUE) + + c.save() + print(f"✅ Created {FILE_PATH}") + + # ----- Discover coordinates & build template ----- + with fitz.open(str(FILE_PATH)) as doc: + page = doc[0] + page_right = page.rect.x1 + + coords: dict[str, RectI] = { + "invoice_id": _round_rect( + _value_rect_same_line_after_label(page, "Invoice ID:") + ), + "issue_date": _round_rect( + _value_rect_same_line_after_label(page, "Issue Date:") + ), + "amount_due": _round_rect( + _value_rect_right_of_label( + page, "Total Due:", right_x=page_right - 0.7 * inch + ) + ), + } + + template = _coords_to_template(template_name, coords) + print(f"✅ Built PDF template: {template.template_name}") + return FILE_PATH, template + + +if __name__ == "__main__": + _, t = create_invoice_and_build_template() + print(t.model_dump()) diff --git a/scripts/ingest_pdf.py b/scripts/ingest_pdf.py new file mode 100644 index 00000000..bdeca23a --- /dev/null +++ b/scripts/ingest_pdf.py @@ -0,0 +1,166 @@ +"""PDF ingestion script that extracts data from PDFs using AI agents.""" + +from __future__ import annotations + +import asyncio +import json +import os +import re +from pathlib import Path +from typing import Any +from unittest.mock import AsyncMock, 
mock_open, patch + +# Import from local data directory +from data.create_sample_invoice import create_invoice_and_build_template +from google.adk.agents import Agent +from google.adk.runners import Runner +from google.adk.sessions import InMemorySessionService +from google.genai import types + +from flare_ai_kit import FlareAIKit +from flare_ai_kit.agent.pdf_tools import read_pdf_text_tool +from flare_ai_kit.config import AppSettings +from flare_ai_kit.ingestion.settings import ( + IngestionSettings, + OnchainContractSettings, + PDFIngestionSettings, + PDFTemplateSettings, +) + +MOCK_TX_HASH = "0x1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef" + + +def _prompt(pdf: Path, template: PDFTemplateSettings, max_pages: int | None) -> str: + """Build the prompt from the template.""" + return ( + "Parse this PDF using tools and return ONLY JSON per the template.\n" + f"PDF_PATH: {pdf}\nMAX_PAGES: {max_pages or 'ALL'}\n\n" + "TEMPLATE_JSON:\n```json\n" + json.dumps(template.model_dump()) + "\n```\n\n" + "- Call read_pdf_text(file_path=PDF_PATH, max_pages=MAX_PAGES).\n" + "- Extract each field in TEMPLATE_JSON.fields.\n" + "- Reply with a single JSON object (no markdown)." 
    )


def _json_from(text: str) -> dict[str, Any]:
    """Extract JSON from agent return text.

    Tries, in order: the raw text as JSON, a ```json fenced block, then the
    first {...} blob found anywhere in the text.

    Raises:
        RuntimeError: If no parseable JSON object can be located.
    """
    try:
        return json.loads(text)
    except json.JSONDecodeError as e:
        # Models sometimes wrap the JSON in a markdown fence despite instructions.
        fence = re.search(
            r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL | re.IGNORECASE
        )
        if fence:
            return json.loads(fence.group(1))
        # Last resort: greedy grab of the outermost brace span.
        blob = re.search(r"\{.*\}", text, re.DOTALL)
        if blob:
            return json.loads(blob.group(0))
        msg = f"Agent response is not valid JSON:\n{text}"
        raise RuntimeError(msg) from e


async def parse_pdf_to_template_json(
    agent: Agent,
    pdf: str | Path,
    template: PDFTemplateSettings,
    max_pages: int | None = None,
) -> dict[str, Any]:
    """Setup in-memory ADK agent, give it the PDF, template and prompt."""
    pdf = Path(pdf)
    svc = InMemorySessionService()
    await svc.create_session(app_name="app", user_id="u", session_id="s")
    runner = Runner(agent=agent, app_name="app", session_service=svc)

    msg = types.Content(
        role="user", parts=[types.Part(text=_prompt(pdf, template, max_pages))]
    )
    final_text = None
    print(f"Calling {agent.name} using model: {agent.model}")
    # Stream events; stop at the first final response (intermediate tool-call
    # events are skipped by the is_final_response() check).
    async for ev in runner.run_async(user_id="u", session_id="s", new_message=msg):
        if ev.is_final_response() and ev.content and ev.content.parts:
            final_text = ev.content.parts[0].text
            break
    if not final_text:
        msg = "Agent produced no response."
+ raise RuntimeError(msg) + return _json_from(final_text) + + +async def main() -> None: + """Main function to demonstrate PDF ingestion and processing.""" + # Create PDF and save it + pdf_path, template = create_invoice_and_build_template("generated_invoice") + + # Add template to global settings + app_settings = AppSettings( + log_level="INFO", + ingestion=IngestionSettings( + pdf_ingestion=PDFIngestionSettings( + templates=[template], + use_ocr=False, + contract_settings=OnchainContractSettings( + contract_address="0x0000000000000000000000000000000000000000", + abi_name="OnchainDataRegistry", + function_name="registerDocument", + ), + ) + ), + ) + + # Inject Gemini API Key + if app_settings.agent and app_settings.agent.gemini_api_key: + api_key = app_settings.agent.gemini_api_key.get_secret_value() + os.environ["GOOGLE_API_KEY"] = api_key + + # Create ADK agent with tool access. + pdf_agent_instruction = ( + "You are a PDF extraction agent. " + "Independently read PDFs using tools and return ONLY JSON matching this " + "schema:\n" + "{\n" + ' "template_name": string,\n' + ' "fields": [ {"field_name": string, "value": string|null}, ... ]\n' + "}\n" + "- Always call read_pdf_text with the provided file path.\n" + "- Use ONLY the template JSON (field order and names) provided by the " + "user to decide what to extract.\n" + "- If a field is not found, set its value to null.\n" + "- Do not include prose or explanations. Reply with a single JSON object only." 
+ ) + + # Construct the Agent instance using the imported tool and settings + pdf_agent = Agent( + name="flare_pdf_agent", + model=app_settings.agent.gemini_model, + tools=[read_pdf_text_tool], + instruction=pdf_agent_instruction, + generate_content_config=types.GenerateContentConfig( + temperature=0.0, top_k=1, top_p=0.3, candidate_count=1 + ), + ) + + # Mock onchain contract posting + with ( + patch( + "flare_ai_kit.onchain.contract_poster.ContractPoster.post_data", + new_callable=AsyncMock, + return_value=MOCK_TX_HASH, + ) as mock_post, + patch("flare_ai_kit.onchain.contract_poster.open", mock_open(read_data="[]")), + ): + kit = FlareAIKit(config=app_settings) + tx_hash = await kit.pdf_processor.ingest_and_post( + file_path=str(pdf_path), template_name=template.template_name + ) + print("✅ on-chain tx:", tx_hash) + print("📄 extracted:", mock_post.call_args[0][0]) + + # Agent PDF parsing + structured = await parse_pdf_to_template_json( + pdf_agent, pdf_path, template, max_pages=1 + ) + print("🧩 agent JSON:", json.dumps(structured, indent=2)) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/uv.lock b/uv.lock index 7dbe07f1..24429009 100644 --- a/uv.lock +++ b/uv.lock @@ -827,11 +827,8 @@ dependencies = [ { name = "google-adk" }, { name = "google-genai" }, { name = "httpx" }, - { name = "pillow" }, { name = "pydantic" }, { name = "pydantic-ai" }, - { name = "pymupdf" }, - { name = "pytesseract" }, { name = "structlog" }, { name = "tenacity" }, { name = "web3" }, @@ -846,6 +843,11 @@ ingestion = [ { name = "pymupdf" }, { name = "pytesseract" }, ] +pdf = [ + { name = "pillow" }, + { name = "pymupdf" }, + { name = "pytesseract" }, +] rag = [ { name = "dulwich" }, { name = "qdrant-client" }, @@ -895,17 +897,17 @@ requires-dist = [ { name = "google-genai", specifier = ">=1.8.0" }, { name = "httpx", specifier = ">=0.28.1" }, { name = "httpx", marker = "extra == 'wallet'", specifier = ">=0.28.1" }, - { name = "pillow", specifier = ">=11.3.0" }, { name = 
"pillow", marker = "extra == 'ingestion'", specifier = ">=11.3.0" }, + { name = "pillow", marker = "extra == 'pdf'", specifier = ">=11.3.0" }, { name = "pydantic", specifier = ">=2.11.1" }, { name = "pydantic-ai", specifier = ">=0.0.46" }, { name = "pyjwt", marker = "extra == 'tee'", specifier = ">=2.10.1" }, { name = "pyjwt", marker = "extra == 'wallet'", specifier = ">=2.10.1" }, - { name = "pymupdf", specifier = ">=1.26.1" }, { name = "pymupdf", marker = "extra == 'ingestion'", specifier = ">=1.26.1" }, + { name = "pymupdf", marker = "extra == 'pdf'", specifier = ">=1.26.1" }, { name = "pyopenssl", marker = "extra == 'tee'", specifier = ">=25.0.0" }, - { name = "pytesseract", specifier = ">=0.3.13" }, { name = "pytesseract", marker = "extra == 'ingestion'", specifier = ">=0.3.13" }, + { name = "pytesseract", marker = "extra == 'pdf'", specifier = ">=0.3.13" }, { name = "python-telegram-bot", marker = "extra == 'social'", specifier = ">=22.0" }, { name = "qdrant-client", marker = "extra == 'rag'", specifier = ">=1.13.3" }, { name = "slack-sdk", marker = "extra == 'social'", specifier = ">=3.26.2" }, @@ -914,7 +916,7 @@ requires-dist = [ { name = "tweepy", marker = "extra == 'social'", specifier = ">=4.15.0" }, { name = "web3", specifier = ">=7.10.0" }, ] -provides-extras = ["rag", "social", "tee", "a2a", "wallet", "ingestion"] +provides-extras = ["ftso", "da", "fassets", "pdf", "rag", "a2a", "social", "tee", "wallet", "ingestion"] [package.metadata.requires-dev] dev = [