From 68700941147756ec9cd49536a8696e9be2f95764 Mon Sep 17 00:00:00 2001
From: tharun634
Date: Sun, 5 Oct 2025 18:38:03 +0530
Subject: [PATCH 1/2] fix: mypy type errors in the examples and clean up

---
 examples/amazon_s3_embedding/main.py       |  2 +-
 examples/azure_blob_embedding/main.py      |  2 +-
 examples/code_embedding/main.py            |  4 +-
 examples/custom_output_files/main.py       |  2 +-
 examples/face_recognition/main.py          | 22 +++----
 examples/fastapi_server_docker/main.py     | 19 +++---
 examples/gdrive_text_embedding/main.py     |  9 +--
 examples/image_search/colpali_main.py      | 57 ++++++++++++------
 examples/image_search/main.py              | 69 ++++++++++++----------
 examples/manuals_llm_extraction/main.py    | 14 ++---
 examples/multi_format_indexing/main.py     |  4 +-
 examples/paper_metadata/main.py            | 12 ++--
 examples/patient_intake_extraction/main.py | 10 ++--
 examples/pdf_embedding/main.py             | 31 ++++++----
 examples/postgres_source/main.py           | 10 ++--
 examples/text_embedding/main.py            |  7 ++-
 examples/text_embedding_qdrant/main.py     |  2 +-
 python/cocoindex/functions/__init__.py     |  2 +
 18 files changed, 158 insertions(+), 120 deletions(-)

diff --git a/examples/amazon_s3_embedding/main.py b/examples/amazon_s3_embedding/main.py
index 4418d0da..8d94cea0 100644
--- a/examples/amazon_s3_embedding/main.py
+++ b/examples/amazon_s3_embedding/main.py
@@ -1,5 +1,5 @@
 from dotenv import load_dotenv
-from psycopg_pool import ConnectionPool
+from psycopg_pool import ConnectionPool  # type: ignore[import-not-found]
 import cocoindex
 import os
 from typing import Any
diff --git a/examples/azure_blob_embedding/main.py b/examples/azure_blob_embedding/main.py
index a38c47ef..473fafa5 100644
--- a/examples/azure_blob_embedding/main.py
+++ b/examples/azure_blob_embedding/main.py
@@ -1,5 +1,5 @@
 from dotenv import load_dotenv
-from psycopg_pool import ConnectionPool
+from psycopg_pool import ConnectionPool  # type: ignore[import-not-found]
 import cocoindex
 import os
 from typing import Any
diff --git a/examples/code_embedding/main.py b/examples/code_embedding/main.py
index 053eacf9..7c319488 100644
--- a/examples/code_embedding/main.py
+++ b/examples/code_embedding/main.py
@@ -1,6 +1,6 @@
 from dotenv import load_dotenv
-from psycopg_pool import ConnectionPool
-from pgvector.psycopg import register_vector
+from psycopg_pool import ConnectionPool  # type: ignore[import-not-found]
+from pgvector.psycopg import register_vector  # type: ignore[import-not-found]
 from typing import Any
 import functools
 import cocoindex
diff --git a/examples/custom_output_files/main.py b/examples/custom_output_files/main.py
index 5bbfa83d..3efac7df 100644
--- a/examples/custom_output_files/main.py
+++ b/examples/custom_output_files/main.py
@@ -96,7 +96,7 @@ def mutate(
 
 @cocoindex.op.function()
 def markdown_to_html(text: str) -> str:
-    return _markdown_it.render(text)
+    return str(_markdown_it.render(text))
 
 
 @cocoindex.flow_def(name="CustomOutputFiles")
diff --git a/examples/face_recognition/main.py b/examples/face_recognition/main.py
index cd05c705..4e3cac38 100644
--- a/examples/face_recognition/main.py
+++ b/examples/face_recognition/main.py
@@ -3,8 +3,9 @@
 import datetime
 import io
 import os
+from typing import cast
 
-import face_recognition
+import face_recognition  # type: ignore[import-not-found]
 import numpy as np
 from PIL import Image
 
@@ -52,8 +53,9 @@ def extract_faces(content: bytes) -> list[FaceBase]:
         ratio = 1.0
         img = orig_img
 
-    # Extract face locations.
-    locs = face_recognition.face_locations(np.array(img), model="cnn")
+    locs: list[tuple[int, int, int, int]] = face_recognition.face_locations(  # type: ignore[attr-defined]
+        np.array(img), model="cnn"
+    )
 
     faces: list[FaceBase] = []
     for min_y, max_x, max_y, min_x in locs:
@@ -63,8 +65,6 @@ def extract_faces(content: bytes) -> list[FaceBase]:
             max_x=int(max_x * ratio),
             max_y=int(max_y * ratio),
         )
-
-        # Crop the face and save it as a PNG.
         buf = io.BytesIO()
         orig_img.crop((rect.min_x, rect.min_y, rect.max_x, rect.max_y)).save(
             buf, format="PNG"
         )
@@ -76,16 +76,16 @@ def extract_faces(content: bytes) -> list[FaceBase]:
 
 
 @cocoindex.op.function(cache=True, behavior_version=1, gpu=True)
-def extract_face_embedding(
-    face: bytes,
-) -> cocoindex.Vector[cocoindex.Float32]:
+def extract_face_embedding(face: bytes) -> cocoindex.Vector[cocoindex.Float32]:
     """Extract the embedding of a face."""
     img = Image.open(io.BytesIO(face)).convert("RGB")
-    embedding = face_recognition.face_encodings(
+    encoding: np.ndarray = face_recognition.face_encodings(  # type: ignore[attr-defined]
         np.array(img),
         known_face_locations=[(0, img.width - 1, img.height - 1, 0)],
-    )[0]
-    return embedding
+    )[
+        0
+    ]
+    return cast(cocoindex.Vector[cocoindex.Float32], encoding.astype(np.float32))
 
 
 @cocoindex.flow_def(name="FaceRecognition")
diff --git a/examples/fastapi_server_docker/main.py b/examples/fastapi_server_docker/main.py
index 752e2436..b48693e0 100644
--- a/examples/fastapi_server_docker/main.py
+++ b/examples/fastapi_server_docker/main.py
@@ -1,11 +1,12 @@
 import cocoindex
-import uvicorn
+import uvicorn  # type: ignore[import-not-found]
 from dotenv import load_dotenv
-from fastapi import FastAPI, Query
-from fastapi import Request
-from psycopg_pool import ConnectionPool
+from fastapi import FastAPI, Query  # type: ignore[import-not-found]
+from fastapi import Request  # type: ignore[import-not-found]
+from psycopg_pool import ConnectionPool  # type: ignore[import-not-found]
 from contextlib import asynccontextmanager
 import os
+from typing import Any, AsyncIterator
 
 
 @cocoindex.transform_flow()
@@ -26,7 +27,7 @@ def text_to_embedding(
 @cocoindex.flow_def(name="MarkdownEmbeddingFastApiExample")
 def markdown_embedding_flow(
     flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
-):
+) -> None:
     """
     Define an example flow that embeds markdown files into a vector database.
     """
@@ -65,7 +66,7 @@ def markdown_embedding_flow(
     )
 
 
-def search(pool: ConnectionPool, query: str, top_k: int = 5):
+def search(pool: ConnectionPool, query: str, top_k: int = 5) -> list[dict[str, Any]]:
     # Get the table name, for the export target in the text_embedding_flow above.
     table_name = cocoindex.utils.get_target_default_name(
         markdown_embedding_flow, "doc_embeddings"
     )
@@ -89,7 +90,7 @@ def search(pool: ConnectionPool, query: str, top_k: int = 5):
 
 
 @asynccontextmanager
-def lifespan(app: FastAPI):
+async def lifespan(app: FastAPI) -> AsyncIterator[None]:
     load_dotenv()
     cocoindex.init()
     pool = ConnectionPool(os.getenv("COCOINDEX_DATABASE_URL"))
@@ -103,12 +104,12 @@ def lifespan(app: FastAPI):
 fastapi_app = FastAPI(lifespan=lifespan)
 
 
-@fastapi_app.get("/search")
+@fastapi_app.get("/search")  # type: ignore[misc]
 def search_endpoint(
     request: Request,
     q: str = Query(..., description="Search query"),
     limit: int = Query(5, description="Number of results"),
-):
+) -> dict[str, Any]:
     pool = request.app.state.pool
     results = search(pool, q, limit)
     return {"results": results}
diff --git a/examples/gdrive_text_embedding/main.py b/examples/gdrive_text_embedding/main.py
index c9b7b630..62f3277b 100644
--- a/examples/gdrive_text_embedding/main.py
+++ b/examples/gdrive_text_embedding/main.py
@@ -1,8 +1,9 @@
 from dotenv import load_dotenv
-from psycopg_pool import ConnectionPool
+from psycopg_pool import ConnectionPool  # type: ignore[import-not-found]
 import cocoindex
 import datetime
 import os
+from typing import Any
 
 
 @cocoindex.transform_flow()
@@ -23,7 +24,7 @@ def text_to_embedding(
 @cocoindex.flow_def(name="GoogleDriveTextEmbedding")
 def gdrive_text_embedding_flow(
     flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
-):
+) -> None:
     """
     Define an example flow that embeds text into a vector database.
     """
@@ -71,7 +72,7 @@ def gdrive_text_embedding_flow(
     )
 
 
-def search(pool: ConnectionPool, query: str, top_k: int = 5):
+def search(pool: ConnectionPool, query: str, top_k: int = 5) -> list[dict[str, Any]]:
     # Get the table name, for the export target in the gdrive_text_embedding_flow above.
     table_name = cocoindex.utils.get_target_default_name(
         gdrive_text_embedding_flow, "doc_embeddings"
     )
@@ -94,7 +95,7 @@ def search(pool: ConnectionPool, query: str, top_k: int = 5):
     ]
 
 
-def _main():
+def _main() -> None:
     # Initialize the database connection pool.
     pool = ConnectionPool(os.getenv("COCOINDEX_DATABASE_URL"))
     # Run queries in a loop to demonstrate the query capabilities.
diff --git a/examples/image_search/colpali_main.py b/examples/image_search/colpali_main.py
index feec3fab..940b27bd 100644
--- a/examples/image_search/colpali_main.py
+++ b/examples/image_search/colpali_main.py
@@ -1,14 +1,15 @@
 import datetime
 import os
 from contextlib import asynccontextmanager
-from typing import Any
+from typing import AsyncIterator, List, Optional
+from dataclasses import dataclass
 
 import cocoindex
 from dotenv import load_dotenv
-from fastapi import FastAPI, Query
-from fastapi.middleware.cors import CORSMiddleware
-from fastapi.staticfiles import StaticFiles
-from qdrant_client import QdrantClient
+from fastapi import FastAPI, Query  # type: ignore[import-not-found]
+from fastapi.middleware.cors import CORSMiddleware  # type: ignore[import-not-found]
+from fastapi.staticfiles import StaticFiles  # type: ignore[import-not-found]
+from qdrant_client import QdrantClient  # type: ignore[import-not-found]
 
 
 # --- Config ---
@@ -17,7 +18,7 @@
 QDRANT_URL = os.getenv("QDRANT_URL", "localhost:6334")
 PREFER_GRPC = os.getenv("QDRANT_PREFER_GRPC", "true").lower() == "true"
 
-# Use HTTP
+# Use HTTP (uncomment if needed)
 # QDRANT_URL = os.getenv("QDRANT_URL", "localhost:6333")
 # PREFER_GRPC = os.getenv("QDRANT_PREFER_GRPC", "false").lower() == "true"
 
@@ -27,6 +28,9 @@
 print(f"📐 Using ColPali model {COLPALI_MODEL_NAME}")
 
 
+# --- Embedding helpers ---
+
+
 @cocoindex.transform_flow()
 def text_to_colpali_embedding(
     text: cocoindex.DataSlice[str],
@@ -70,8 +74,11 @@ def image_object_embedding_flow(
     )
 
 
+# --- Lifespan context ---
+
+
 @asynccontextmanager
-async def lifespan(app: FastAPI) -> None:
+async def lifespan(app: FastAPI) -> AsyncIterator[None]:
     load_dotenv()
     cocoindex.init()
     image_object_embedding_flow.setup(report_to_stdout=True)
@@ -85,6 +92,21 @@ async def lifespan(app: FastAPI) -> None:
 
     yield
 
 
+# --- Response Dataclasses ---
+
+
+@dataclass
+class SearchResult:
+    filename: str
+    score: float
+    caption: Optional[str] = None
+
+
+@dataclass
+class SearchResponse:
+    results: List[SearchResult]
+
+
 # --- FastAPI app for web API ---
 app = FastAPI(lifespan=lifespan)
@@ -95,16 +117,17 @@ async def lifespan(app: FastAPI) -> None:
     allow_methods=["*"],
     allow_headers=["*"],
 )
+
 # Serve images from the 'img' directory at /img
 app.mount("/img", StaticFiles(directory="img"), name="img")
 
 
 # --- Search API ---
-@app.get("/search")
+@app.get("/search", response_model=SearchResponse)  # type: ignore[misc]
 def search(
     q: str = Query(..., description="Search query"),
     limit: int = Query(5, description="Number of results"),
-) -> Any:
+) -> SearchResponse:
     # Get the multi-vector embedding for the query
     query_embedding = text_to_colpali_embedding.eval(q)
     print(
@@ -122,13 +145,13 @@ def search(
 
     print(f"📈 Found {len(search_results.points)} results with MaxSim scoring")
 
-    return {
-        "results": [
-            {
-                "filename": result.payload["filename"],
-                "score": result.score,
-                "caption": result.payload.get("caption"),
-            }
+    return SearchResponse(
+        results=[
+            SearchResult(
+                filename=result.payload["filename"],
+                score=result.score,
+                caption=result.payload.get("caption"),
+            )
             for result in search_results.points
         ]
-    }
+    )
diff --git a/examples/image_search/main.py b/examples/image_search/main.py
index 2fd12c4a..3d57421c 100644
--- a/examples/image_search/main.py
+++ b/examples/image_search/main.py
@@ -3,23 +3,24 @@
 import io
 import os
 from contextlib import asynccontextmanager
-from typing import Any, Literal
+from typing import Literal, cast, AsyncIterator, Final
 
 import cocoindex
 import torch
 from dotenv import load_dotenv
-from fastapi import FastAPI, Query
-from fastapi.middleware.cors import CORSMiddleware
-from fastapi.staticfiles import StaticFiles
+from fastapi import FastAPI, Query  # type: ignore[import-not-found]
+from fastapi.middleware.cors import CORSMiddleware  # type: ignore[import-not-found]
+from fastapi.staticfiles import StaticFiles  # type: ignore[import-not-found]
+from pydantic import BaseModel
 from PIL import Image
-from qdrant_client import QdrantClient
+from qdrant_client import QdrantClient  # type: ignore[import-not-found]
 from transformers import CLIPModel, CLIPProcessor
 
 OLLAMA_URL = os.getenv("OLLAMA_URL", "http://localhost:11434/")
 QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6334/")
 QDRANT_COLLECTION = "ImageSearch"
 CLIP_MODEL_NAME = "openai/clip-vit-large-patch14"
-CLIP_MODEL_DIMENSION = 768
+CLIP_MODEL_DIMENSION: Final = 768
 
 
 @functools.cache
@@ -37,13 +38,13 @@ def embed_query(text: str) -> list[float]:
     inputs = processor(text=[text], return_tensors="pt", padding=True)
     with torch.no_grad():
         features = model.get_text_features(**inputs)
-    return features[0].tolist()
+    return cast(list[float], features[0].tolist())
 
 
 @cocoindex.op.function(cache=True, behavior_version=1, gpu=True)
 def embed_image(
     img_bytes: bytes,
-) -> cocoindex.Vector[cocoindex.Float32, Literal[CLIP_MODEL_DIMENSION]]:
+) -> cocoindex.Vector[cocoindex.Float32, Literal[768]]:
     """
     Convert image to embedding using CLIP model.
     """
@@ -52,7 +53,7 @@ def embed_image(
     inputs = processor(images=image, return_tensors="pt")
     with torch.no_grad():
         features = model.get_image_features(**inputs)
-    return features[0].tolist()
+    return cast(list[float], features[0].tolist())
 
 
 # CocoIndex flow: Ingest images, extract captions, embed, export to Qdrant
@@ -64,15 +65,12 @@ def image_object_embedding_flow(
         cocoindex.sources.LocalFile(
             path="img", included_patterns=["*.jpg", "*.jpeg", "*.png"], binary=True
         ),
-        refresh_interval=datetime.timedelta(
-            minutes=1
-        ),  # Poll for changes every 1 minute
+        refresh_interval=datetime.timedelta(minutes=1),
     )
     img_embeddings = data_scope.add_collector()
     with data_scope["images"].row() as img:
         ollama_model_name = os.getenv("OLLAMA_MODEL")
         if ollama_model_name is not None:
-            # If an Ollama model is specified, generate an image caption
             img["caption"] = flow_builder.transform(
                 cocoindex.functions.ExtractByLlm(
                     llm_spec=cocoindex.llm.LlmSpec(
@@ -112,18 +110,19 @@ def image_object_embedding_flow(
 
 
 @asynccontextmanager
-async def lifespan(app: FastAPI) -> None:
+async def lifespan(app: FastAPI) -> AsyncIterator[None]:
     load_dotenv()
     cocoindex.init()
     image_object_embedding_flow.setup(report_to_stdout=True)
 
     app.state.qdrant_client = QdrantClient(url=QDRANT_URL, prefer_grpc=True)
-
-    # Start updater
     app.state.live_updater = cocoindex.FlowLiveUpdater(image_object_embedding_flow)
     app.state.live_updater.start()
 
-    yield
+    try:
+        yield None
+    finally:
+        app.state.live_updater.stop()
 
 
 # --- FastAPI app for web API ---
@@ -136,20 +135,28 @@ async def lifespan(app: FastAPI) -> None:
     allow_methods=["*"],
     allow_headers=["*"],
 )
-# Serve images from the 'img' directory at /img
 app.mount("/img", StaticFiles(directory="img"), name="img")
 
 
+# --- Typed response models ---
+class SearchResult(BaseModel):
+    filename: str
+    score: float
+    caption: str | None = None
+
+
+class SearchResponse(BaseModel):
+    results: list[SearchResult]
+
+
 # --- Search API ---
-@app.get("/search")
+@app.get("/search", response_model=SearchResponse)  # type: ignore[misc]
 def search(
     q: str = Query(..., description="Search query"),
     limit: int = Query(5, description="Number of results"),
-) -> Any:
-    # Get the embedding for the query
+) -> SearchResponse:
     query_embedding = embed_query(q)
 
-    # Search in Qdrant
     search_results = app.state.qdrant_client.search(
         collection_name=QDRANT_COLLECTION,
         query_vector=("embedding", query_embedding),
@@ -157,15 +164,13 @@ def search(
         with_payload=True,
     )
 
-    return {
-        "results": [
-            {
-                "filename": result.payload["filename"],
-                "score": result.score,
-                "caption": result.payload.get(
-                    "caption"
-                ),  # Include caption if available
-            }
+    return SearchResponse(
+        results=[
+            SearchResult(
+                filename=result.payload["filename"],
+                score=result.score,
+                caption=result.payload.get("caption"),
+            )
             for result in search_results
         ]
-    }
+    )
diff --git a/examples/manuals_llm_extraction/main.py b/examples/manuals_llm_extraction/main.py
index e35a7700..297b9b1a 100644
--- a/examples/manuals_llm_extraction/main.py
+++ b/examples/manuals_llm_extraction/main.py
@@ -1,10 +1,10 @@
 import tempfile
 import dataclasses
 
-from marker.converters.pdf import PdfConverter
-from marker.models import create_model_dict
-from marker.output import text_from_rendered
-from marker.config.parser import ConfigParser
+from marker.converters.pdf import PdfConverter  # type: ignore[import-not-found]
+from marker.models import create_model_dict  # type: ignore[import-not-found]
+from marker.output import text_from_rendered  # type: ignore[import-not-found]
+from marker.config.parser import ConfigParser  # type: ignore[import-not-found]
 
 import cocoindex
 
@@ -20,7 +20,7 @@ class PdfToMarkdownExecutor:
     spec: PdfToMarkdown
     _converter: PdfConverter
 
-    def prepare(self):
+    def prepare(self) -> None:
         config_parser = ConfigParser({})
         self._converter = PdfConverter(
             create_model_dict(), config=config_parser.generate_config_dict()
@@ -31,7 +31,7 @@ def __call__(self, content: bytes) -> str:
             temp_file.write(content)
             temp_file.flush()
             text, _, _ = text_from_rendered(self._converter(temp_file.name))
-            return text
+            return str(text)
 
 
 @dataclasses.dataclass
@@ -90,7 +90,7 @@ def summarize_module(module_info: ModuleInfo) -> ModuleSummary:
 @cocoindex.flow_def(name="ManualExtraction")
 def manual_extraction_flow(
     flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
-):
+) -> None:
     """
     Define an example flow that extracts manual information from a Markdown.
""" diff --git a/examples/multi_format_indexing/main.py b/examples/multi_format_indexing/main.py index aab794e1..ad10cf7b 100644 --- a/examples/multi_format_indexing/main.py +++ b/examples/multi_format_indexing/main.py @@ -4,10 +4,10 @@ from dotenv import load_dotenv from dataclasses import dataclass -from pdf2image import convert_from_bytes +from pdf2image import convert_from_bytes # type: ignore[import-not-found] from io import BytesIO -from qdrant_client import QdrantClient +from qdrant_client import QdrantClient # type: ignore[import-not-found] QDRANT_GRPC_URL = "http://localhost:6334" QDRANT_COLLECTION = "MultiFormatIndexings" diff --git a/examples/paper_metadata/main.py b/examples/paper_metadata/main.py index 195f472a..0c42bca6 100644 --- a/examples/paper_metadata/main.py +++ b/examples/paper_metadata/main.py @@ -4,12 +4,12 @@ import dataclasses import datetime -from marker.config.parser import ConfigParser -from marker.converters.pdf import PdfConverter -from marker.models import create_model_dict -from marker.output import text_from_rendered +from marker.config.parser import ConfigParser # type: ignore[import-not-found] +from marker.converters.pdf import PdfConverter # type: ignore[import-not-found] +from marker.models import create_model_dict # type: ignore[import-not-found] +from marker.output import text_from_rendered # type: ignore[import-not-found] from functools import cache -from pypdf import PdfReader, PdfWriter +from pypdf import PdfReader, PdfWriter # type: ignore[import-not-found] @cache @@ -67,7 +67,7 @@ def pdf_to_markdown(content: bytes) -> str: temp_file.write(content) temp_file.flush() text, _, _ = text_from_rendered(get_marker_converter()(temp_file.name)) - return text + return str(text) @cocoindex.flow_def(name="PaperMetadata") diff --git a/examples/patient_intake_extraction/main.py b/examples/patient_intake_extraction/main.py index fd7ec21b..968fadf5 100644 --- a/examples/patient_intake_extraction/main.py +++ b/examples/patient_intake_extraction/main.py @@ -3,8 +3,8 @@ import dataclasses import os -from markitdown import MarkItDown -from openai import OpenAI +from markitdown import MarkItDown # type: ignore[import-not-found] +from openai import OpenAI # type: ignore[import-not-found] import cocoindex @@ -97,7 +97,7 @@ class ToMarkdownExecutor: spec: ToMarkdown _converter: MarkItDown - def prepare(self): + def prepare(self) -> None: client = OpenAI() self._converter = MarkItDown(llm_client=client, llm_model="gpt-4o") @@ -106,14 +106,14 @@ def __call__(self, content: bytes, filename: str) -> str: with tempfile.NamedTemporaryFile(delete=True, suffix=suffix) as temp_file: temp_file.write(content) temp_file.flush() - text = self._converter.convert(temp_file.name).text_content + text: str = self._converter.convert(temp_file.name).text_content return text @cocoindex.flow_def(name="PatientIntakeExtraction") def patient_intake_extraction_flow( flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope -): +) -> None: """ Define a flow that extracts patient information from intake forms. 
""" diff --git a/examples/pdf_embedding/main.py b/examples/pdf_embedding/main.py index 4bbe4564..b0fa062f 100644 --- a/examples/pdf_embedding/main.py +++ b/examples/pdf_embedding/main.py @@ -3,11 +3,11 @@ import tempfile from dotenv import load_dotenv -from marker.config.parser import ConfigParser -from marker.converters.pdf import PdfConverter -from marker.models import create_model_dict -from marker.output import text_from_rendered -from psycopg_pool import ConnectionPool +from marker.config.parser import ConfigParser # type: ignore[import-not-found] +from marker.converters.pdf import PdfConverter # type: ignore[import-not-found] +from marker.models import create_model_dict # type: ignore[import-not-found] +from marker.output import text_from_rendered # type: ignore[import-not-found] +from psycopg_pool import ConnectionPool # type: ignore[import-not-found] from jinja2 import Template @@ -22,7 +22,7 @@ class PdfToMarkdownExecutor: spec: PdfToMarkdown _converter: PdfConverter - def prepare(self): + def prepare(self) -> None: config_parser = ConfigParser({}) self._converter = PdfConverter( create_model_dict(), config=config_parser.generate_config_dict() @@ -33,7 +33,7 @@ def __call__(self, content: bytes) -> str: temp_file.write(content) temp_file.flush() text, _, _ = text_from_rendered(self._converter(temp_file.name)) - return text + return str(text) @cocoindex.transform_flow() @@ -54,7 +54,7 @@ def text_to_embedding( @cocoindex.flow_def(name="PdfEmbedding") def pdf_embedding_flow( flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope -): +) -> None: """ Define an example flow that embeds files into a vector database. """ @@ -96,7 +96,9 @@ def pdf_embedding_flow( ) -def search(pool: ConnectionPool, query: str, top_k: int = 5): +def search( + pool: ConnectionPool, query: str, top_k: int = 5 +) -> list[dict[str, str | float]]: # Get the table name, for the export target in the pdf_embedding_flow above. table_name = cocoindex.utils.get_target_default_name( pdf_embedding_flow, "pdf_embeddings" @@ -113,24 +115,27 @@ def search(pool: ConnectionPool, query: str, top_k: int = 5): """, (query_vector, top_k), ) - return [ + results: list[dict[str, str | float]] = [ {"filename": row[0], "text": row[1], "score": 1.0 - row[2]} for row in cur.fetchall() ] + return results # Define the search results template using Jinja2 -SEARCH_RESULTS_TEMPLATE = Template(""" +SEARCH_RESULTS_TEMPLATE = Template( + """ Search results: {% for result in results %} [{{ "%.3f"|format(result.score) }}] {{ result.filename }} {{ result.text }} --- {% endfor %} -""") +""" +) -def _main(): +def _main() -> None: # Initialize the database connection pool. pool = ConnectionPool(os.getenv("COCOINDEX_DATABASE_URL")) # Run queries in a loop to demonstrate the query capabilities. 
diff --git a/examples/postgres_source/main.py b/examples/postgres_source/main.py
index deef6172..84ccd546 100644
--- a/examples/postgres_source/main.py
+++ b/examples/postgres_source/main.py
@@ -1,11 +1,10 @@
 from typing import Any
 import os
-import datetime
 
 from dotenv import load_dotenv
-from psycopg_pool import ConnectionPool
-from pgvector.psycopg import register_vector  # type: ignore[import-untyped]
-from psycopg.rows import dict_row
+from psycopg_pool import ConnectionPool  # type: ignore[import-not-found]
+from pgvector.psycopg import register_vector  # type: ignore[import-not-found]
+from psycopg.rows import dict_row  # type: ignore[import-not-found]
 from numpy.typing import NDArray
 import numpy as np
 
@@ -134,7 +133,8 @@ def search(pool: ConnectionPool, query: str, top_k: int = 5) -> list[dict[str, A
             """,
             (query_vector, top_k),
         )
-        return cur.fetchall()
+        rows: list[dict[str, Any]] = cur.fetchall()
+        return rows
 
 
 def _main() -> None:
diff --git a/examples/text_embedding/main.py b/examples/text_embedding/main.py
index 9a4f18a3..4affac3f 100644
--- a/examples/text_embedding/main.py
+++ b/examples/text_embedding/main.py
@@ -1,6 +1,6 @@
 from dotenv import load_dotenv
-from psycopg_pool import ConnectionPool
-from pgvector.psycopg import register_vector
+from psycopg_pool import ConnectionPool  # type: ignore[import-not-found]
+from pgvector.psycopg import register_vector  # type: ignore[import-not-found]
 from typing import Any
 import cocoindex
 import os
@@ -16,7 +16,8 @@ def text_to_embedding(
 ) -> cocoindex.DataSlice[NDArray[np.float32]]:
     """
     Embed the text using a SentenceTransformer model.
-    This is a shared logic between indexing and querying, so extract it as a function."""
+    This is a shared logic between indexing and querying, so extract it as a function.
+    """
     # You can also switch to remote embedding model:
     # return text.transform(
     #     cocoindex.functions.EmbedText(
diff --git a/examples/text_embedding_qdrant/main.py b/examples/text_embedding_qdrant/main.py
index 8f3f558a..33f2d5ed 100644
--- a/examples/text_embedding_qdrant/main.py
+++ b/examples/text_embedding_qdrant/main.py
@@ -1,6 +1,6 @@
 import functools
 from dotenv import load_dotenv
-from qdrant_client import QdrantClient
+from qdrant_client import QdrantClient  # type: ignore[import-not-found]
 import cocoindex
 
 # Define Qdrant connection constants
diff --git a/python/cocoindex/functions/__init__.py b/python/cocoindex/functions/__init__.py
index a54500d5..3b8792b4 100644
--- a/python/cocoindex/functions/__init__.py
+++ b/python/cocoindex/functions/__init__.py
@@ -11,6 +11,7 @@
     SplitBySeparators,
     EmbedText,
     ExtractByLlm,
+    CustomLanguageSpec
 )
 
 # Import SentenceTransformer embedding functionality
@@ -34,6 +35,7 @@
     "SplitBySeparators",
     "EmbedText",
     "ExtractByLlm",
+    "CustomLanguageSpec",
     # SentenceTransformer
     "SentenceTransformerEmbed",
     "SentenceTransformerEmbedExecutor",

From 7202b5304728629d491a1cb5f6c088b7a2f75563 Mon Sep 17 00:00:00 2001
From: Tharun K <53267275+tharun634@users.noreply.github.com>
Date: Sun, 5 Oct 2025 18:53:12 +0530
Subject: [PATCH 2/2] refactor: don't use pydantic

---
 examples/image_search/main.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/examples/image_search/main.py b/examples/image_search/main.py
index 3d57421c..91d566cc 100644
--- a/examples/image_search/main.py
+++ b/examples/image_search/main.py
@@ -3,7 +3,8 @@
 import io
 import os
 from contextlib import asynccontextmanager
-from typing import Literal, cast, AsyncIterator, Final
+from typing import Literal, cast, AsyncIterator, Final, Optional, List
+from dataclasses import dataclass
 
 import cocoindex
 import torch
@@ -11,7 +12,6 @@
 from fastapi import FastAPI, Query  # type: ignore[import-not-found]
 from fastapi.middleware.cors import CORSMiddleware  # type: ignore[import-not-found]
 from fastapi.staticfiles import StaticFiles  # type: ignore[import-not-found]
-from pydantic import BaseModel
 from PIL import Image
 from qdrant_client import QdrantClient  # type: ignore[import-not-found]
 from transformers import CLIPModel, CLIPProcessor
@@ -138,15 +138,17 @@ async def lifespan(app: FastAPI) -> AsyncIterator[None]:
 app.mount("/img", StaticFiles(directory="img"), name="img")
 
 
-# --- Typed response models ---
-class SearchResult(BaseModel):
+# --- Response Dataclasses ---
+@dataclass
+class SearchResult:
     filename: str
     score: float
-    caption: str | None = None
+    caption: Optional[str] = None
 
 
-class SearchResponse(BaseModel):
-    results: list[SearchResult]
+@dataclass
+class SearchResponse:
+    results: List[SearchResult]
 
 
 # --- Search API ---