From 68700941147756ec9cd49536a8696e9be2f95764 Mon Sep 17 00:00:00 2001
From: tharun634
Date: Sun, 5 Oct 2025 18:38:03 +0530
Subject: [PATCH 1/2] fix: mypy type errors in the examples and clean up

---
 examples/amazon_s3_embedding/main.py       |  2 +-
 examples/azure_blob_embedding/main.py      |  2 +-
 examples/code_embedding/main.py            |  4 +-
 examples/custom_output_files/main.py       |  2 +-
 examples/face_recognition/main.py          | 22 +++----
 examples/fastapi_server_docker/main.py     | 19 +++---
 examples/gdrive_text_embedding/main.py     |  9 +--
 examples/image_search/colpali_main.py      | 57 ++++++++++++------
 examples/image_search/main.py              | 69 ++++++++++++----------
 examples/manuals_llm_extraction/main.py    | 14 ++---
 examples/multi_format_indexing/main.py     |  4 +-
 examples/paper_metadata/main.py            | 12 ++--
 examples/patient_intake_extraction/main.py | 10 ++--
 examples/pdf_embedding/main.py             | 31 ++++++----
 examples/postgres_source/main.py           | 10 ++--
 examples/text_embedding/main.py            |  7 ++-
 examples/text_embedding_qdrant/main.py     |  2 +-
 python/cocoindex/functions/__init__.py     |  2 +
 18 files changed, 158 insertions(+), 120 deletions(-)

diff --git a/examples/amazon_s3_embedding/main.py b/examples/amazon_s3_embedding/main.py
index 4418d0da..8d94cea0 100644
--- a/examples/amazon_s3_embedding/main.py
+++ b/examples/amazon_s3_embedding/main.py
@@ -1,5 +1,5 @@
 from dotenv import load_dotenv
-from psycopg_pool import ConnectionPool
+from psycopg_pool import ConnectionPool  # type: ignore[import-not-found]
 import cocoindex
 import os
 from typing import Any
diff --git a/examples/azure_blob_embedding/main.py b/examples/azure_blob_embedding/main.py
index a38c47ef..473fafa5 100644
--- a/examples/azure_blob_embedding/main.py
+++ b/examples/azure_blob_embedding/main.py
@@ -1,5 +1,5 @@
 from dotenv import load_dotenv
-from psycopg_pool import ConnectionPool
+from psycopg_pool import ConnectionPool  # type: ignore[import-not-found]
 import cocoindex
 import os
 from typing import Any
diff --git a/examples/code_embedding/main.py b/examples/code_embedding/main.py
index 053eacf9..7c319488 100644
--- a/examples/code_embedding/main.py
+++ b/examples/code_embedding/main.py
@@ -1,6 +1,6 @@
 from dotenv import load_dotenv
-from psycopg_pool import ConnectionPool
-from pgvector.psycopg import register_vector
+from psycopg_pool import ConnectionPool  # type: ignore[import-not-found]
+from pgvector.psycopg import register_vector  # type: ignore[import-not-found]
 from typing import Any
 import functools
 import cocoindex
diff --git a/examples/custom_output_files/main.py b/examples/custom_output_files/main.py
index 5bbfa83d..3efac7df 100644
--- a/examples/custom_output_files/main.py
+++ b/examples/custom_output_files/main.py
@@ -96,7 +96,7 @@ def mutate(
 
 @cocoindex.op.function()
 def markdown_to_html(text: str) -> str:
-    return _markdown_it.render(text)
+    return str(_markdown_it.render(text))
 
 
 @cocoindex.flow_def(name="CustomOutputFiles")
diff --git a/examples/face_recognition/main.py b/examples/face_recognition/main.py
index cd05c705..4e3cac38 100644
--- a/examples/face_recognition/main.py
+++ b/examples/face_recognition/main.py
@@ -3,8 +3,9 @@
 import datetime
 import io
 import os
+from typing import cast
 
-import face_recognition
+import face_recognition  # type: ignore[import-not-found]
 import numpy as np
 from PIL import Image
 
@@ -52,8 +53,9 @@ def extract_faces(content: bytes) -> list[FaceBase]:
         ratio = 1.0
         img = orig_img
 
-    # Extract face locations.
-    locs = face_recognition.face_locations(np.array(img), model="cnn")
+    locs: list[tuple[int, int, int, int]] = face_recognition.face_locations(  # type: ignore[attr-defined]
+        np.array(img), model="cnn"
+    )
 
     faces: list[FaceBase] = []
     for min_y, max_x, max_y, min_x in locs:
@@ -63,8 +65,6 @@ def extract_faces(content: bytes) -> list[FaceBase]:
             max_x=int(max_x * ratio),
             max_y=int(max_y * ratio),
         )
-
-        # Crop the face and save it as a PNG.
         buf = io.BytesIO()
         orig_img.crop((rect.min_x, rect.min_y, rect.max_x, rect.max_y)).save(
             buf, format="PNG"
         )
@@ -76,16 +76,16 @@ def extract_faces(content: bytes) -> list[FaceBase]:
 
 
 @cocoindex.op.function(cache=True, behavior_version=1, gpu=True)
-def extract_face_embedding(
-    face: bytes,
-) -> cocoindex.Vector[cocoindex.Float32]:
+def extract_face_embedding(face: bytes) -> cocoindex.Vector[cocoindex.Float32]:
     """Extract the embedding of a face."""
     img = Image.open(io.BytesIO(face)).convert("RGB")
-    embedding = face_recognition.face_encodings(
+    encoding: np.ndarray = face_recognition.face_encodings(  # type: ignore[attr-defined]
         np.array(img),
         known_face_locations=[(0, img.width - 1, img.height - 1, 0)],
-    )[0]
-    return embedding
+    )[
+        0
+    ]
+    return cast(cocoindex.Vector[cocoindex.Float32], encoding.astype(np.float32))
 
 
 @cocoindex.flow_def(name="FaceRecognition")
diff --git a/examples/fastapi_server_docker/main.py b/examples/fastapi_server_docker/main.py
index 752e2436..b48693e0 100644
--- a/examples/fastapi_server_docker/main.py
+++ b/examples/fastapi_server_docker/main.py
@@ -1,11 +1,12 @@
 import cocoindex
-import uvicorn
+import uvicorn  # type: ignore[import-not-found]
 from dotenv import load_dotenv
-from fastapi import FastAPI, Query
-from fastapi import Request
-from psycopg_pool import ConnectionPool
+from fastapi import FastAPI, Query  # type: ignore[import-not-found]
+from fastapi import Request  # type: ignore[import-not-found]
+from psycopg_pool import ConnectionPool  # type: ignore[import-not-found]
 from contextlib import asynccontextmanager
 import os
+from typing import Any, AsyncIterator
 
 
 @cocoindex.transform_flow()
@@ -26,7 +27,7 @@ def text_to_embedding(
 @cocoindex.flow_def(name="MarkdownEmbeddingFastApiExample")
 def markdown_embedding_flow(
     flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
-):
+) -> None:
     """
     Define an example flow that embeds markdown files into a vector database.
     """
@@ -65,7 +66,7 @@ def markdown_embedding_flow(
     )
 
 
-def search(pool: ConnectionPool, query: str, top_k: int = 5):
+def search(pool: ConnectionPool, query: str, top_k: int = 5) -> list[dict[str, Any]]:
     # Get the table name, for the export target in the text_embedding_flow above.
     table_name = cocoindex.utils.get_target_default_name(
         markdown_embedding_flow, "doc_embeddings"
     )
@@ -89,7 +90,7 @@ def search(pool: ConnectionPool, query: str, top_k: int = 5):
 
 
 @asynccontextmanager
-def lifespan(app: FastAPI):
+async def lifespan(app: FastAPI) -> AsyncIterator[None]:
     load_dotenv()
     cocoindex.init()
     pool = ConnectionPool(os.getenv("COCOINDEX_DATABASE_URL"))
@@ -103,12 +104,12 @@ def lifespan(app: FastAPI):
 fastapi_app = FastAPI(lifespan=lifespan)
 
 
-@fastapi_app.get("/search")
+@fastapi_app.get("/search")  # type: ignore[misc]
 def search_endpoint(
     request: Request,
     q: str = Query(..., description="Search query"),
     limit: int = Query(5, description="Number of results"),
-):
+) -> dict[str, Any]:
     pool = request.app.state.pool
     results = search(pool, q, limit)
     return {"results": results}
diff --git a/examples/gdrive_text_embedding/main.py b/examples/gdrive_text_embedding/main.py
index c9b7b630..62f3277b 100644
--- a/examples/gdrive_text_embedding/main.py
+++ b/examples/gdrive_text_embedding/main.py
@@ -1,8 +1,9 @@
 from dotenv import load_dotenv
-from psycopg_pool import ConnectionPool
+from psycopg_pool import ConnectionPool  # type: ignore[import-not-found]
 import cocoindex
 import datetime
 import os
+from typing import Any
 
 
 @cocoindex.transform_flow()
@@ -23,7 +24,7 @@ def text_to_embedding(
 @cocoindex.flow_def(name="GoogleDriveTextEmbedding")
 def gdrive_text_embedding_flow(
     flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
-):
+) -> None:
     """
     Define an example flow that embeds text into a vector database.
     """
@@ -71,7 +72,7 @@ def gdrive_text_embedding_flow(
     )
 
 
-def search(pool: ConnectionPool, query: str, top_k: int = 5):
+def search(pool: ConnectionPool, query: str, top_k: int = 5) -> list[dict[str, Any]]:
     # Get the table name, for the export target in the gdrive_text_embedding_flow above.
     table_name = cocoindex.utils.get_target_default_name(
         gdrive_text_embedding_flow, "doc_embeddings"
     )
@@ -94,7 +95,7 @@ def search(pool: ConnectionPool, query: str, top_k: int = 5):
     ]
 
 
-def _main():
+def _main() -> None:
     # Initialize the database connection pool.
     pool = ConnectionPool(os.getenv("COCOINDEX_DATABASE_URL"))
     # Run queries in a loop to demonstrate the query capabilities.
diff --git a/examples/image_search/colpali_main.py b/examples/image_search/colpali_main.py
index feec3fab..940b27bd 100644
--- a/examples/image_search/colpali_main.py
+++ b/examples/image_search/colpali_main.py
@@ -1,14 +1,15 @@
 import datetime
 import os
 from contextlib import asynccontextmanager
-from typing import Any
+from typing import AsyncIterator, List, Optional
+from dataclasses import dataclass
 
 import cocoindex
 from dotenv import load_dotenv
-from fastapi import FastAPI, Query
-from fastapi.middleware.cors import CORSMiddleware
-from fastapi.staticfiles import StaticFiles
-from qdrant_client import QdrantClient
+from fastapi import FastAPI, Query  # type: ignore[import-not-found]
+from fastapi.middleware.cors import CORSMiddleware  # type: ignore[import-not-found]
+from fastapi.staticfiles import StaticFiles  # type: ignore[import-not-found]
+from qdrant_client import QdrantClient  # type: ignore[import-not-found]
 
 
 # --- Config ---
@@ -17,7 +18,7 @@
 QDRANT_URL = os.getenv("QDRANT_URL", "localhost:6334")
 PREFER_GRPC = os.getenv("QDRANT_PREFER_GRPC", "true").lower() == "true"
 
-# Use HTTP
+# Use HTTP (uncomment if needed)
 # QDRANT_URL = os.getenv("QDRANT_URL", "localhost:6333")
 # PREFER_GRPC = os.getenv("QDRANT_PREFER_GRPC", "false").lower() == "true"
 
@@ -27,6 +28,9 @@
 print(f"📐 Using ColPali model {COLPALI_MODEL_NAME}")
 
 
+# --- Embedding helpers ---
+
+
 @cocoindex.transform_flow()
 def text_to_colpali_embedding(
     text: cocoindex.DataSlice[str],
@@ -70,8 +74,11 @@ def image_object_embedding_flow(
     )
 
 
+# --- Lifespan context ---
+
+
 @asynccontextmanager
-async def lifespan(app: FastAPI) -> None:
+async def lifespan(app: FastAPI) -> AsyncIterator[None]:
     load_dotenv()
     cocoindex.init()
     image_object_embedding_flow.setup(report_to_stdout=True)
@@ -85,6 +92,21 @@ async def lifespan(app: FastAPI) -> None:
 
     yield
 
 
+# --- Response Dataclasses ---
+
+
+@dataclass
+class SearchResult:
+    filename: str
+    score: float
+    caption: Optional[str] = None
+
+
+@dataclass
+class SearchResponse:
+    results: List[SearchResult]
+
+
 # --- FastAPI app for web API ---
 app = FastAPI(lifespan=lifespan)
@@ -95,16 +117,17 @@ async def lifespan(app: FastAPI) -> None:
     allow_methods=["*"],
     allow_headers=["*"],
 )
+
 # Serve images from the 'img' directory at /img
 app.mount("/img", StaticFiles(directory="img"), name="img")
 
 
 # --- Search API ---
-@app.get("/search")
+@app.get("/search", response_model=SearchResponse)  # type: ignore[misc]
 def search(
     q: str = Query(..., description="Search query"),
     limit: int = Query(5, description="Number of results"),
-) -> Any:
+) -> SearchResponse:
     # Get the multi-vector embedding for the query
     query_embedding = text_to_colpali_embedding.eval(q)
     print(
@@ -122,13 +145,13 @@ def search(
 
     print(f"📈 Found {len(search_results.points)} results with MaxSim scoring")
 
-    return {
-        "results": [
-            {
-                "filename": result.payload["filename"],
-                "score": result.score,
-                "caption": result.payload.get("caption"),
-            }
+    return SearchResponse(
+        results=[
+            SearchResult(
+                filename=result.payload["filename"],
+                score=result.score,
+                caption=result.payload.get("caption"),
+            )
             for result in search_results.points
         ]
-    }
+    )
diff --git a/examples/image_search/main.py b/examples/image_search/main.py
index 2fd12c4a..3d57421c 100644
--- a/examples/image_search/main.py
+++ b/examples/image_search/main.py
@@ -3,23 +3,24 @@
 import io
 import os
 from contextlib import asynccontextmanager
-from typing import Any, Literal
+from typing import Literal, cast, AsyncIterator, Final
 
 import cocoindex
 import torch
 from dotenv import load_dotenv
-from fastapi import FastAPI, Query
-from fastapi.middleware.cors import CORSMiddleware
-from fastapi.staticfiles import StaticFiles
+from fastapi import FastAPI, Query  # type: ignore[import-not-found]
+from fastapi.middleware.cors import CORSMiddleware  # type: ignore[import-not-found]
+from fastapi.staticfiles import StaticFiles  # type: ignore[import-not-found]
+from pydantic import BaseModel
 from PIL import Image
-from qdrant_client import QdrantClient
+from qdrant_client import QdrantClient  # type: ignore[import-not-found]
 from transformers import CLIPModel, CLIPProcessor
 
 OLLAMA_URL = os.getenv("OLLAMA_URL", "http://localhost:11434/")
 QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6334/")
 QDRANT_COLLECTION = "ImageSearch"
 CLIP_MODEL_NAME = "openai/clip-vit-large-patch14"
-CLIP_MODEL_DIMENSION = 768
+CLIP_MODEL_DIMENSION: Final = 768
 
 
 @functools.cache
@@ -37,13 +38,13 @@ def embed_query(text: str) -> list[float]:
     inputs = processor(text=[text], return_tensors="pt", padding=True)
     with torch.no_grad():
         features = model.get_text_features(**inputs)
-    return features[0].tolist()
+    return cast(list[float], features[0].tolist())
 
 
 @cocoindex.op.function(cache=True, behavior_version=1, gpu=True)
 def embed_image(
     img_bytes: bytes,
-) -> cocoindex.Vector[cocoindex.Float32, Literal[CLIP_MODEL_DIMENSION]]:
+) -> cocoindex.Vector[cocoindex.Float32, Literal[768]]:
     """
     Convert image to embedding using CLIP model.
     """
@@ -52,7 +53,7 @@ def embed_image(
     inputs = processor(images=image, return_tensors="pt")
     with torch.no_grad():
         features = model.get_image_features(**inputs)
-    return features[0].tolist()
+    return cast(list[float], features[0].tolist())
 
 
 # CocoIndex flow: Ingest images, extract captions, embed, export to Qdrant
@@ -64,15 +65,12 @@ def image_object_embedding_flow(
         cocoindex.sources.LocalFile(
             path="img", included_patterns=["*.jpg", "*.jpeg", "*.png"], binary=True
         ),
-        refresh_interval=datetime.timedelta(
-            minutes=1
-        ),  # Poll for changes every 1 minute
+        refresh_interval=datetime.timedelta(minutes=1),
     )
     img_embeddings = data_scope.add_collector()
     with data_scope["images"].row() as img:
         ollama_model_name = os.getenv("OLLAMA_MODEL")
         if ollama_model_name is not None:
-            # If an Ollama model is specified, generate an image caption
             img["caption"] = flow_builder.transform(
                 cocoindex.functions.ExtractByLlm(
                     llm_spec=cocoindex.llm.LlmSpec(
@@ -112,18 +110,19 @@ def image_object_embedding_flow(
 
 
 @asynccontextmanager
-async def lifespan(app: FastAPI) -> None:
+async def lifespan(app: FastAPI) -> AsyncIterator[None]:
     load_dotenv()
     cocoindex.init()
     image_object_embedding_flow.setup(report_to_stdout=True)
 
     app.state.qdrant_client = QdrantClient(url=QDRANT_URL, prefer_grpc=True)
-
-    # Start updater
     app.state.live_updater = cocoindex.FlowLiveUpdater(image_object_embedding_flow)
     app.state.live_updater.start()
 
-    yield
+    try:
+        yield None
+    finally:
+        app.state.live_updater.stop()
 
 
 # --- FastAPI app for web API ---
@@ -136,20 +135,28 @@ async def lifespan(app: FastAPI) -> None:
     allow_methods=["*"],
     allow_headers=["*"],
 )
-# Serve images from the 'img' directory at /img
 app.mount("/img", StaticFiles(directory="img"), name="img")
 
 
+# --- Typed response models ---
+class SearchResult(BaseModel):
+    filename: str
+    score: float
+    caption: str | None = None
+
+
+class SearchResponse(BaseModel):
+    results: list[SearchResult]
+
+
 # --- Search API ---
-@app.get("/search")
+@app.get("/search", response_model=SearchResponse)  # type: ignore[misc]
 def search(
     q: str = Query(..., description="Search query"),
     limit: int = Query(5, description="Number of results"),
-) -> Any:
-    # Get the embedding for the query
+) -> SearchResponse:
     query_embedding = embed_query(q)
 
-    # Search in Qdrant
     search_results = app.state.qdrant_client.search(
         collection_name=QDRANT_COLLECTION,
         query_vector=("embedding", query_embedding),
@@ -157,15 +164,13 @@ def search(
         with_payload=True,
     )
 
-    return {
-        "results": [
-            {
-                "filename": result.payload["filename"],
-                "score": result.score,
-                "caption": result.payload.get(
-                    "caption"
-                ),  # Include caption if available
-            }
+    return SearchResponse(
+        results=[
+            SearchResult(
+                filename=result.payload["filename"],
+                score=result.score,
+                caption=result.payload.get("caption"),
+            )
             for result in search_results
         ]
-    }
+    )
diff --git a/examples/manuals_llm_extraction/main.py b/examples/manuals_llm_extraction/main.py
index e35a7700..297b9b1a 100644
--- a/examples/manuals_llm_extraction/main.py
+++ b/examples/manuals_llm_extraction/main.py
@@ -1,10 +1,10 @@
 import tempfile
 import dataclasses
 
-from marker.converters.pdf import PdfConverter
-from marker.models import create_model_dict
-from marker.output import text_from_rendered
-from marker.config.parser import ConfigParser
+from marker.converters.pdf import PdfConverter  # type: ignore[import-not-found]
+from marker.models import create_model_dict  # type: ignore[import-not-found]
+from marker.output import text_from_rendered  # type: ignore[import-not-found]
+from marker.config.parser import ConfigParser  # type: ignore[import-not-found]
 
 import cocoindex
 
@@ -20,7 +20,7 @@ class PdfToMarkdownExecutor:
     spec: PdfToMarkdown
     _converter: PdfConverter
 
-    def prepare(self):
+    def prepare(self) -> None:
         config_parser = ConfigParser({})
         self._converter = PdfConverter(
             create_model_dict(), config=config_parser.generate_config_dict()
@@ -31,7 +31,7 @@ def __call__(self, content: bytes) -> str:
             temp_file.write(content)
             temp_file.flush()
             text, _, _ = text_from_rendered(self._converter(temp_file.name))
-            return text
+            return str(text)
 
 
 @dataclasses.dataclass
@@ -90,7 +90,7 @@ def summarize_module(module_info: ModuleInfo) -> ModuleSummary:
 @cocoindex.flow_def(name="ManualExtraction")
 def manual_extraction_flow(
     flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
-):
+) -> None:
     """
     Define an example flow that extracts manual information from a Markdown.
""" diff --git a/examples/multi_format_indexing/main.py b/examples/multi_format_indexing/main.py index aab794e1..ad10cf7b 100644 --- a/examples/multi_format_indexing/main.py +++ b/examples/multi_format_indexing/main.py @@ -4,10 +4,10 @@ from dotenv import load_dotenv from dataclasses import dataclass -from pdf2image import convert_from_bytes +from pdf2image import convert_from_bytes # type: ignore[import-not-found] from io import BytesIO -from qdrant_client import QdrantClient +from qdrant_client import QdrantClient # type: ignore[import-not-found] QDRANT_GRPC_URL = "http://localhost:6334" QDRANT_COLLECTION = "MultiFormatIndexings" diff --git a/examples/paper_metadata/main.py b/examples/paper_metadata/main.py index 195f472a..0c42bca6 100644 --- a/examples/paper_metadata/main.py +++ b/examples/paper_metadata/main.py @@ -4,12 +4,12 @@ import dataclasses import datetime -from marker.config.parser import ConfigParser -from marker.converters.pdf import PdfConverter -from marker.models import create_model_dict -from marker.output import text_from_rendered +from marker.config.parser import ConfigParser # type: ignore[import-not-found] +from marker.converters.pdf import PdfConverter # type: ignore[import-not-found] +from marker.models import create_model_dict # type: ignore[import-not-found] +from marker.output import text_from_rendered # type: ignore[import-not-found] from functools import cache -from pypdf import PdfReader, PdfWriter +from pypdf import PdfReader, PdfWriter # type: ignore[import-not-found] @cache @@ -67,7 +67,7 @@ def pdf_to_markdown(content: bytes) -> str: temp_file.write(content) temp_file.flush() text, _, _ = text_from_rendered(get_marker_converter()(temp_file.name)) - return text + return str(text) @cocoindex.flow_def(name="PaperMetadata") diff --git a/examples/patient_intake_extraction/main.py b/examples/patient_intake_extraction/main.py index fd7ec21b..968fadf5 100644 --- a/examples/patient_intake_extraction/main.py +++ b/examples/patient_intake_extraction/main.py @@ -3,8 +3,8 @@ import dataclasses import os -from markitdown import MarkItDown -from openai import OpenAI +from markitdown import MarkItDown # type: ignore[import-not-found] +from openai import OpenAI # type: ignore[import-not-found] import cocoindex @@ -97,7 +97,7 @@ class ToMarkdownExecutor: spec: ToMarkdown _converter: MarkItDown - def prepare(self): + def prepare(self) -> None: client = OpenAI() self._converter = MarkItDown(llm_client=client, llm_model="gpt-4o") @@ -106,14 +106,14 @@ def __call__(self, content: bytes, filename: str) -> str: with tempfile.NamedTemporaryFile(delete=True, suffix=suffix) as temp_file: temp_file.write(content) temp_file.flush() - text = self._converter.convert(temp_file.name).text_content + text: str = self._converter.convert(temp_file.name).text_content return text @cocoindex.flow_def(name="PatientIntakeExtraction") def patient_intake_extraction_flow( flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope -): +) -> None: """ Define a flow that extracts patient information from intake forms. 
""" diff --git a/examples/pdf_embedding/main.py b/examples/pdf_embedding/main.py index 4bbe4564..b0fa062f 100644 --- a/examples/pdf_embedding/main.py +++ b/examples/pdf_embedding/main.py @@ -3,11 +3,11 @@ import tempfile from dotenv import load_dotenv -from marker.config.parser import ConfigParser -from marker.converters.pdf import PdfConverter -from marker.models import create_model_dict -from marker.output import text_from_rendered -from psycopg_pool import ConnectionPool +from marker.config.parser import ConfigParser # type: ignore[import-not-found] +from marker.converters.pdf import PdfConverter # type: ignore[import-not-found] +from marker.models import create_model_dict # type: ignore[import-not-found] +from marker.output import text_from_rendered # type: ignore[import-not-found] +from psycopg_pool import ConnectionPool # type: ignore[import-not-found] from jinja2 import Template @@ -22,7 +22,7 @@ class PdfToMarkdownExecutor: spec: PdfToMarkdown _converter: PdfConverter - def prepare(self): + def prepare(self) -> None: config_parser = ConfigParser({}) self._converter = PdfConverter( create_model_dict(), config=config_parser.generate_config_dict() @@ -33,7 +33,7 @@ def __call__(self, content: bytes) -> str: temp_file.write(content) temp_file.flush() text, _, _ = text_from_rendered(self._converter(temp_file.name)) - return text + return str(text) @cocoindex.transform_flow() @@ -54,7 +54,7 @@ def text_to_embedding( @cocoindex.flow_def(name="PdfEmbedding") def pdf_embedding_flow( flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope -): +) -> None: """ Define an example flow that embeds files into a vector database. """ @@ -96,7 +96,9 @@ def pdf_embedding_flow( ) -def search(pool: ConnectionPool, query: str, top_k: int = 5): +def search( + pool: ConnectionPool, query: str, top_k: int = 5 +) -> list[dict[str, str | float]]: # Get the table name, for the export target in the pdf_embedding_flow above. table_name = cocoindex.utils.get_target_default_name( pdf_embedding_flow, "pdf_embeddings" @@ -113,24 +115,27 @@ def search(pool: ConnectionPool, query: str, top_k: int = 5): """, (query_vector, top_k), ) - return [ + results: list[dict[str, str | float]] = [ {"filename": row[0], "text": row[1], "score": 1.0 - row[2]} for row in cur.fetchall() ] + return results # Define the search results template using Jinja2 -SEARCH_RESULTS_TEMPLATE = Template(""" +SEARCH_RESULTS_TEMPLATE = Template( + """ Search results: {% for result in results %} [{{ "%.3f"|format(result.score) }}] {{ result.filename }} {{ result.text }} --- {% endfor %} -""") +""" +) -def _main(): +def _main() -> None: # Initialize the database connection pool. pool = ConnectionPool(os.getenv("COCOINDEX_DATABASE_URL")) # Run queries in a loop to demonstrate the query capabilities. 
diff --git a/examples/postgres_source/main.py b/examples/postgres_source/main.py
index deef6172..84ccd546 100644
--- a/examples/postgres_source/main.py
+++ b/examples/postgres_source/main.py
@@ -1,11 +1,10 @@
 from typing import Any
 import os
-import datetime
 
 from dotenv import load_dotenv
-from psycopg_pool import ConnectionPool
-from pgvector.psycopg import register_vector  # type: ignore[import-untyped]
-from psycopg.rows import dict_row
+from psycopg_pool import ConnectionPool  # type: ignore[import-not-found]
+from pgvector.psycopg import register_vector  # type: ignore[import-not-found]
+from psycopg.rows import dict_row  # type: ignore[import-not-found]
 from numpy.typing import NDArray
 import numpy as np
 
@@ -134,7 +133,8 @@ def search(pool: ConnectionPool, query: str, top_k: int = 5) -> list[dict[str, A
             """,
             (query_vector, top_k),
         )
-        return cur.fetchall()
+        rows: list[dict[str, Any]] = cur.fetchall()
+        return rows
 
 
 def _main() -> None:
diff --git a/examples/text_embedding/main.py b/examples/text_embedding/main.py
index 9a4f18a3..4affac3f 100644
--- a/examples/text_embedding/main.py
+++ b/examples/text_embedding/main.py
@@ -1,6 +1,6 @@
 from dotenv import load_dotenv
-from psycopg_pool import ConnectionPool
-from pgvector.psycopg import register_vector
+from psycopg_pool import ConnectionPool  # type: ignore[import-not-found]
+from pgvector.psycopg import register_vector  # type: ignore[import-not-found]
 from typing import Any
 import cocoindex
 import os
@@ -16,7 +16,8 @@ def text_to_embedding(
 ) -> cocoindex.DataSlice[NDArray[np.float32]]:
     """
     Embed the text using a SentenceTransformer model.
-    This is a shared logic between indexing and querying, so extract it as a function."""
+    This is a shared logic between indexing and querying, so extract it as a function.
+    """
     # You can also switch to remote embedding model:
     # return text.transform(
     #     cocoindex.functions.EmbedText(
diff --git a/examples/text_embedding_qdrant/main.py b/examples/text_embedding_qdrant/main.py
index 8f3f558a..33f2d5ed 100644
--- a/examples/text_embedding_qdrant/main.py
+++ b/examples/text_embedding_qdrant/main.py
@@ -1,6 +1,6 @@
 import functools
 from dotenv import load_dotenv
-from qdrant_client import QdrantClient
+from qdrant_client import QdrantClient  # type: ignore[import-not-found]
 import cocoindex
 
 # Define Qdrant connection constants
diff --git a/python/cocoindex/functions/__init__.py b/python/cocoindex/functions/__init__.py
index a54500d5..3b8792b4 100644
--- a/python/cocoindex/functions/__init__.py
+++ b/python/cocoindex/functions/__init__.py
@@ -11,6 +11,7 @@
     SplitBySeparators,
     EmbedText,
     ExtractByLlm,
+    CustomLanguageSpec
 )
 
 # Import SentenceTransformer embedding functionality
@@ -34,6 +35,7 @@
     "SplitBySeparators",
     "EmbedText",
     "ExtractByLlm",
+    "CustomLanguageSpec",
     # SentenceTransformer
     "SentenceTransformerEmbed",
     "SentenceTransformerEmbedExecutor",

From 7202b5304728629d491a1cb5f6c088b7a2f75563 Mon Sep 17 00:00:00 2001
From: Tharun K <53267275+tharun634@users.noreply.github.com>
Date: Sun, 5 Oct 2025 18:53:12 +0530
Subject: [PATCH 2/2] refactor: don't use pydantic

---
 examples/image_search/main.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/examples/image_search/main.py b/examples/image_search/main.py
index 3d57421c..91d566cc 100644
--- a/examples/image_search/main.py
+++ b/examples/image_search/main.py
@@ -3,7 +3,8 @@
 import io
 import os
 from contextlib import asynccontextmanager
-from typing import Literal, cast, AsyncIterator, Final
+from typing import Literal, cast, AsyncIterator, Final, Optional, List
+from dataclasses import dataclass
 
 import cocoindex
 import torch
@@ -11,7 +12,6 @@
 from fastapi import FastAPI, Query  # type: ignore[import-not-found]
 from fastapi.middleware.cors import CORSMiddleware  # type: ignore[import-not-found]
 from fastapi.staticfiles import StaticFiles  # type: ignore[import-not-found]
-from pydantic import BaseModel
 from PIL import Image
 from qdrant_client import QdrantClient  # type: ignore[import-not-found]
 from transformers import CLIPModel, CLIPProcessor
@@ -138,15 +138,17 @@ async def lifespan(app: FastAPI) -> AsyncIterator[None]:
 app.mount("/img", StaticFiles(directory="img"), name="img")
 
 
-# --- Typed response models ---
-class SearchResult(BaseModel):
+# --- Response Dataclasses ---
+@dataclass
+class SearchResult:
     filename: str
     score: float
-    caption: str | None = None
+    caption: Optional[str] = None
 
 
-class SearchResponse(BaseModel):
-    results: list[SearchResult]
+@dataclass
+class SearchResponse:
+    results: List[SearchResult]
 
 
 # --- Search API ---