Skip to content

Commit 69d9947

Browse files
authored
Inline multimodal (#2348)
1 parent 92e700c commit 69d9947

File tree

15 files changed

+345
-54
lines changed

15 files changed

+345
-54
lines changed

education-ai-suite/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,4 @@ monitoring/executionlogs/
2222
chroma_data/
2323
minio_data/
2424
extracted_images/
25+
logs/

education-ai-suite/smart-classroom/content_search/docs/dev_guide/file_ingest_and_retrieve/installation.md

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,13 @@ For full develop guide and API Reference, please see the [Dev Guide](docs/dev_gu
77

88
### Prerequisites
99

10-
- **Python 3.10** — only this version is verified on Windows: https://www.python.org/downloads/
11-
- **Rust compiler** — required by some dependencies: https://rust-lang.org/tools/install
12-
- **`multimodal_embedding_serving` wheel** — obtain from [this guide](https://github.com/open-edge-platform/edge-ai-libraries/blob/main/microservices/multimodal-embedding-serving/docs/user-guide/wheel-installation.md) (use verified commit `77b812f`). Place the `.whl` file in the `content_search/` folder before running `install.ps1`.
10+
- **Python 3.12** — verified on Windows: https://www.python.org/downloads/
1311

1412
### Install System Dependencies
1513

1614
The `install.ps1` will:
17-
- Creates the Python 3.10 venv
18-
- Installs `mobileclip`, `salesforce-lavis`, `requirements.txt`, and the `multimodal_embedding_serving` wheel
15+
- Creates the Python 3.12 venv
16+
- Installs `requirements_providers.txt`
1917
- Downloads and installs Tesseract OCR 5.5.0 and adds it to the user PATH
2018
- Downloads and extracts Poppler 25.12.0 and adds it to the user PATH
2119
- Installs MinIO into the content_search/providers/minio_wrapper folder

education-ai-suite/smart-classroom/content_search/install.ps1

Lines changed: 4 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@ $venvPython = Join-Path $PSScriptRoot "venv_content_search\Scripts\python.exe"
1616

1717
# --- Create venv ---
1818
if (-not (Test-Path $venvPython)) {
19-
Write-Host "Creating venv (Python 3.10 required)..."
20-
py -3.10 -m venv $venvDir
19+
Write-Host "Creating venv (Python 3.12 required)..."
20+
py -3.12 -m venv $venvDir
2121
} else {
2222
Write-Host "Venv already exists, skipping creation."
2323
}
@@ -26,26 +26,8 @@ if (-not (Test-Path $venvPython)) {
2626
Write-Host "Upgrading pip..."
2727
Invoke-Cmd $venvPython -m pip install --upgrade pip --quiet
2828

29-
Write-Host "Installing mobileclip..."
30-
Invoke-Cmd $venvPython -m pip install git+https://github.com/apple/ml-mobileclip.git@c16bfe5a4feb424762d6bdf5245539120a4ce9ef#egg=mobileclip --quiet
31-
32-
Write-Host "Installing salesforce-lavis..."
33-
Invoke-Cmd $venvPython -m pip install salesforce-lavis==1.0.2 --quiet
34-
35-
Write-Host "Installing requirements_310.txt..."
36-
Invoke-Cmd $venvPython -m pip install -r (Join-Path $PSScriptRoot "requirements_310.txt") --quiet
37-
38-
# --- Install multimodal_embedding_serving wheel ---
39-
$whl = Get-ChildItem -Path $PSScriptRoot -Filter "multimodal_embedding_serving*.whl" -ErrorAction SilentlyContinue | Select-Object -First 1
40-
if ($null -eq $whl) {
41-
$whl = Get-ChildItem -Path (Join-Path $PSScriptRoot "..") -Filter "multimodal_embedding_serving*.whl" -ErrorAction SilentlyContinue | Select-Object -First 1
42-
}
43-
if ($whl) {
44-
Write-Host "Installing multimodal_embedding_serving from $($whl.FullName) ..."
45-
Invoke-Cmd $venvPython -m pip install $whl.FullName --no-deps --quiet
46-
} else {
47-
Write-Warning "multimodal_embedding_serving wheel not found. Place multimodal_embedding_serving-0.1.1-py3-none-any.whl in content_search/ and re-run."
48-
}
29+
Write-Host "Installing requirements_providers.txt..."
30+
Invoke-Cmd $venvPython -m pip install -r (Join-Path $PSScriptRoot "requirements_providers.txt") --quiet
4931

5032
# --- Install Tesseract OCR ---
5133
$tesseractExe = "C:\Program Files\Tesseract-OCR\tesseract.exe"

education-ai-suite/smart-classroom/content_search/providers/chromadb_wrapper/chroma_client.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
# Copyright (C) 2026 Intel Corporation
22
# SPDX-License-Identifier: Apache-2.0
33

4+
import logging
45
import chromadb
56
from utils.config_loader import config
67

8+
logger = logging.getLogger(__name__)
9+
710
_chroma_cfg = config.content_search.chromadb
811

912

@@ -17,12 +20,12 @@ def load_collection(self, collection_name: str):
1720
self.collection = self.client.get_or_create_collection(name=collection_name)
1821
return self.collection
1922
except Exception as e:
20-
print(f"Failed to load collection {collection_name}: {e}")
23+
logger.error(f"Failed to load collection '{collection_name}' (is ChromaDB running?): {e}")
2124
return None
2225

2326
def create_collection(self, collection_name: str = "default"):
2427
if self.load_collection(collection_name):
25-
print(f"Collection {collection_name} already exists and is loaded.")
28+
logger.info(f"Collection '{collection_name}' already exists and is loaded.")
2629
return
2730

2831
self.collection = self.client.create_collection(name=collection_name)
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# Derived from: edge-ai-libraries/microservices/multimodal-embedding-serving/src/
2+
# Original package: multimodal-embedding-serving v0.1.1
3+
# Only CLIP-related functionality retained; OpenVINO export removed.
4+
5+
from .registry import get_model_handler
6+
from .wrapper import EmbeddingModel
7+
from .clip_handler import CLIPHandler
8+
9+
__all__ = ["get_model_handler", "EmbeddingModel", "CLIPHandler"]
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
# Derived from: edge-ai-libraries/microservices/multimodal-embedding-serving/src/models/base.py
2+
# Original package: multimodal-embedding-serving v0.1.1
3+
# Only CLIP-related functionality retained; OpenVINO export removed.
4+
#
5+
# Copyright (C) 2026 Intel Corporation
6+
# SPDX-License-Identifier: Apache-2.0
7+
8+
from abc import ABC, abstractmethod
9+
from typing import List, Union, Dict, Any
10+
from PIL import Image
11+
import numpy as np
12+
import torch
13+
14+
15+
class BaseEmbeddingModel(ABC):
    """Abstract base class for multimodal embedding models.

    Subclasses must implement ``load_model``, ``encode_text`` and
    ``encode_image``; the remaining methods are optional hooks with
    sensible defaults that concrete handlers may override.
    """

    def __init__(self, model_config: Dict[str, Any]):
        # Keep the raw config around so subclasses and wrappers can read it.
        self.model_config = model_config
        self.model = None
        self.tokenizer = None
        self.preprocess = None
        self.device = model_config.get("device", "cpu")
        # An explicit "modalities" list in the config wins; otherwise
        # advertise the text+image default.
        configured = model_config.get("modalities")
        self.supported_modalities = set(configured) if configured else {"text", "image"}

    @abstractmethod
    def load_model(self) -> None:
        """Load weights/tokenizer/preprocess; must be called before use."""

    @abstractmethod
    def encode_text(self, texts: Union[str, List[str]]) -> torch.Tensor:
        """Return embeddings for one string or a batch of strings."""

    @abstractmethod
    def encode_image(self, images: Union[Image.Image, List[Image.Image], torch.Tensor]) -> torch.Tensor:
        """Return embeddings for one image, a list of images, or a tensor."""

    # ------------------------------------------------------------------
    # Optional capability hooks
    # ------------------------------------------------------------------

    def supports_text(self) -> bool:
        """True when the model advertises the "text" modality."""
        return "text" in self.supported_modalities

    def supports_image(self) -> bool:
        """True when the model advertises the "image" modality."""
        return "image" in self.supported_modalities

    def supports_video(self) -> bool:
        # Per the code: image support implies (frame-level) video support.
        return "video" in self.supported_modalities or self.supports_image()

    def prepare_query(self, text: str) -> str:
        """Query-side rewriting hook; identity by default."""
        return text

    def prepare_documents(self, texts: List[str]) -> List[str]:
        """Document-side rewriting hook; identity by default."""
        return texts

    def get_embedding_dim(self) -> int:
        """Return the embedding dimensionality (512 unless overridden).

        Raises:
            RuntimeError: if ``load_model()`` has not been called yet.
        """
        if self.model is None:
            raise RuntimeError("Model not loaded. Call load_model() first.")
        return 512  # Default; subclasses should override
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
# Derived from: edge-ai-libraries/microservices/multimodal-embedding-serving/src/models/handlers/clip_handler.py
2+
# Original package: multimodal-embedding-serving v0.1.1
3+
# Only CLIP-related functionality retained; OpenVINO export removed.
4+
#
5+
# Copyright (C) 2026 Intel Corporation
6+
# SPDX-License-Identifier: Apache-2.0
7+
8+
import logging
9+
from typing import List, Union, Dict, Any, Optional
10+
11+
import torch
12+
import torch.nn.functional as F
13+
from PIL import Image
14+
import open_clip
15+
16+
from .base import BaseEmbeddingModel
17+
18+
logger = logging.getLogger(__name__)
19+
20+
21+
class CLIPHandler(BaseEmbeddingModel):
    """Handler for CLIP models using the open_clip library (PyTorch only)."""

    def __init__(self, model_config: Dict[str, Any]):
        super().__init__(model_config)
        self.model_name = model_config["model_name"]
        self.pretrained = model_config["pretrained"]
        # NOTE(review): device is stored but never used to move the model;
        # inference below runs wherever the loaded weights live — confirm.
        self.device = model_config.get("device", "CPU")
        self._embedding_dim: Optional[int] = None

    def load_model(self) -> None:
        """Load weights, preprocessing transforms and tokenizer; raises on failure."""
        try:
            self._embedding_dim = None  # force a fresh probe after (re)load
            logger.info(f"Loading CLIP model: {self.model_name} with pretrained: {self.pretrained}")

            self.model, _, self.preprocess = open_clip.create_model_and_transforms(
                self.model_name,
                pretrained=self.pretrained,
            )
            self.tokenizer = open_clip.get_tokenizer(self.model_name)
            self.model.eval()  # inference mode only
            logger.info(f"CLIP model {self.model_name} loaded successfully")
        except Exception as e:
            logger.error(f"Failed to load CLIP model {self.model_name}: {e}")
            raise

    def encode_text(self, texts: Union[str, List[str]]) -> torch.Tensor:
        """Return L2-normalised text embeddings, one row per input string."""
        batch = [texts] if isinstance(texts, str) else texts
        tokens = self.tokenizer(batch)
        with torch.no_grad():
            features = self.model.encode_text(tokens)
        return F.normalize(features, dim=-1)

    def encode_image(self, images: Union[Image.Image, List[Image.Image], torch.Tensor]) -> torch.Tensor:
        """Return L2-normalised image embeddings.

        Accepts a pre-built batch tensor, a single PIL image, or a list of
        PIL images (each preprocessed, then stacked into a batch).
        """
        if isinstance(images, torch.Tensor):
            batch = images
        elif isinstance(images, Image.Image):
            batch = self.preprocess(images).unsqueeze(0)
        else:  # list of PIL Images
            batch = torch.stack([self.preprocess(im) for im in images])
        with torch.no_grad():
            features = self.model.encode_image(batch)
        return F.normalize(features, dim=-1)

    def get_embedding_dim(self) -> int:
        """Probe (and cache) the embedding dimensionality via a dummy image.

        Raises:
            RuntimeError: when preprocess/model are not initialised yet.
        """
        if self._embedding_dim is not None:
            return self._embedding_dim

        if self.preprocess is None:
            raise RuntimeError("Preprocessing pipeline not initialized. Call load_model() first.")

        side = self._get_preprocess_image_size()
        probe = self.preprocess(Image.new("RGB", (side, side), color=0)).unsqueeze(0)

        if self.model is None:
            raise RuntimeError("Model not loaded. Call load_model() first.")

        # Match the model's device/dtype; fall back for parameter-less models.
        try:
            first_param = next(self.model.parameters())
            device, dtype = first_param.device, first_param.dtype
        except StopIteration:
            device, dtype = torch.device("cpu"), torch.float32

        with torch.no_grad():
            out = self.model.encode_image(probe.to(device=device, dtype=dtype))
        self._embedding_dim = int(out.shape[-1])

        return self._embedding_dim

    def _get_preprocess_image_size(self) -> int:
        """Best-effort read of the input resolution from the preprocess
        pipeline; returns 224 when it cannot be determined."""
        fallback = 224

        if self.preprocess is None:
            return fallback

        pipeline = getattr(self.preprocess, "transforms", None)
        if not pipeline:
            return fallback

        # First transform exposing a usable ``size`` attribute wins.
        for step in pipeline:
            size = getattr(step, "size", None)
            if isinstance(size, (tuple, list)) and len(size) > 0:
                return int(size[0])
            if isinstance(size, int):
                return int(size)

        return fallback
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
# Derived from: edge-ai-libraries/microservices/multimodal-embedding-serving/src/models/registry.py
2+
# + edge-ai-libraries/microservices/multimodal-embedding-serving/src/models/config.py
3+
# Original package: multimodal-embedding-serving v0.1.1
4+
# Only CLIP model configs retained; other handlers removed.
5+
#
6+
# Copyright (C) 2026 Intel Corporation
7+
# SPDX-License-Identifier: Apache-2.0
8+
9+
import logging
10+
import os
11+
from typing import Dict, Any
12+
13+
from .base import BaseEmbeddingModel
14+
from .clip_handler import CLIPHandler
15+
16+
logger = logging.getLogger(__name__)
17+
18+
# ── CLIP model configurations ────────────────────────────────────────
# Maps a short, user-facing model id to the open_clip architecture name
# and the pretrained weight tag used to fetch it.
CLIP_CONFIGS: Dict[str, Dict[str, Any]] = {
    "clip-vit-b-32": {
        "model_name": "ViT-B-32",
        "pretrained": "laion2b_s34b_b79k",
        "image_size": 224,
    },
    "clip-vit-b-16": {
        "model_name": "ViT-B-16",
        "pretrained": "openai",
        "image_size": 224,
    },
    "clip-vit-l-14": {
        "model_name": "ViT-L-14",
        "pretrained": "datacomp_xl_s13b_b90k",
        "image_size": 224,
    },
    "clip-vit-h-14": {
        "model_name": "ViT-H-14",
        "pretrained": "laion2b_s32b_b79k",
        "image_size": 224,
    },
}


def get_model_handler(
    model_id: str,
    device: str | None = None,
) -> BaseEmbeddingModel:
    """Create a CLIPHandler for the given *model_id*.

    Accepted formats:
        "CLIP/clip-vit-b-16"  (type/name)
        "clip-vit-b-16"       (name only)

    Raises:
        ValueError: when the (prefix-stripped) name is not in CLIP_CONFIGS.
    """
    # Strip an optional "CLIP/" style prefix; everything after the first
    # "/" is the model name, matching str.split("/", 1) semantics.
    _, sep, tail = model_id.partition("/")
    model_name = tail if sep else model_id

    if model_name not in CLIP_CONFIGS:
        raise ValueError(
            f"Model '{model_id}' not found. "
            f"Available: {', '.join(CLIP_CONFIGS)}"
        )

    # Copy so the per-call device override never mutates the shared table.
    config = CLIP_CONFIGS[model_name].copy()
    config["device"] = device or os.getenv("EMBEDDING_DEVICE", "CPU")

    logger.info(f"Creating CLIPHandler for {model_id} with config: {config}")
    return CLIPHandler(config)
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# Derived from: edge-ai-libraries/microservices/multimodal-embedding-serving/src/wrapper.py
2+
# Original package: multimodal-embedding-serving v0.1.1
3+
# Only CLIP-related functionality retained; URL/base64/video helpers removed
4+
# (callers use handler.encode_image() directly).
5+
#
6+
# Copyright (C) 2026 Intel Corporation
7+
# SPDX-License-Identifier: Apache-2.0
8+
9+
from typing import List
10+
11+
from .base import BaseEmbeddingModel
12+
13+
14+
class EmbeddingModel:
    """Application-level wrapper around a model handler.

    Exposes an embed_query/embed_documents interface on top of whatever
    BaseEmbeddingModel implementation it is given.
    """

    def __init__(self, model_handler: BaseEmbeddingModel):
        self.handler = model_handler
        # Mirror a few handler attributes for convenient read access.
        self.model_config = model_handler.model_config
        self.device = model_handler.device
        self.supported_modalities = set(model_handler.supported_modalities)

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string; returns one flat vector."""
        query = self.handler.prepare_query(text)
        vectors = self.handler.encode_text([query])
        return vectors[0].tolist()

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a batch of document strings; one vector per input."""
        docs = self.handler.prepare_documents(texts)
        return self.handler.encode_text(docs).tolist()

    def get_embedding_length(self) -> int:
        """Dimensionality of the vectors produced by the handler."""
        return self.handler.get_embedding_dim()

0 commit comments

Comments
 (0)