Skip to content

Commit 0884090

Browse files
authored
Content Search Refactor model initialization and fix ID mapping issues (open-edge-platform#2376)
1 parent 530e160 commit 0884090

File tree

10 files changed

+108
-54
lines changed

10 files changed

+108
-54
lines changed

education-ai-suite/smart-classroom/config.yaml

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -391,16 +391,15 @@ content_search:
391391
chromadb:
392392
host: "127.0.0.1"
393393
port: 9090
394-
data_dir: '.\chromadb_wrapper\chroma_data'
394+
data_dir: '.\providers\chromadb_wrapper\chroma_data'
395395

396396
minio:
397397
server: "127.0.0.1:9000"
398398
console_address: ":9001"
399399
root_user: "minioadmin"
400400
root_password: "minioadmin"
401401
bucket: "content-search"
402-
minio_exe: '.\minio_wrapper\minio.exe'
403-
data_dir: '.\minio_wrapper\minio_data'
402+
data_dir: '.\providers\minio_wrapper\minio_data'
404403

405404
vlm:
406405
model_name: "Qwen/Qwen2.5-VL-3B-Instruct"

education-ai-suite/smart-classroom/content_search/providers/file_ingest_and_retrieve/detector.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,19 +4,17 @@
44

55
import os
66
import subprocess
7+
from pathlib import Path
78

89
from PIL import Image
910
import numpy as np
1011
import openvino as ov
1112

1213
from providers.file_ingest_and_retrieve.yolox_utils import preproc, multiclass_nms, demo_postprocess
1314

14-
MODEL_DIR = "./models"
15-
16-
1715
class Detector:
1816
def __init__(self, device="CPU", conf=0.85, nms=0.45, input_size=(640, 640)):
19-
self.model_path = os.path.join(MODEL_DIR, "detection_model")
17+
self.model_path = Path(os.getcwd()).parent / "models" / "detection_model"
2018
self.model_file = os.path.join(self.model_path, "yolox_s.xml")
2119
self.download_model()
2220
self.device = device

education-ai-suite/smart-classroom/content_search/providers/file_ingest_and_retrieve/document_parser.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -61,8 +61,8 @@ def __init__(
6161
self,
6262
chunk_size: int = 250,
6363
chunk_overlap: int = 50,
64-
extract_images: bool = True,
65-
image_output_dir: str = "./extracted_images",
64+
extract_images: bool = False,
65+
image_output_dir: Optional[str] = None,
6666
ocr_languages: Optional[List[str]] = None,
6767
use_hi_res_strategy: bool = True,
6868
embed_model=None,
@@ -76,7 +76,7 @@ def __init__(
7676
Args:
7777
chunk_size: Maximum characters per chunk (default: 250). Used only when embed_model is None.
7878
chunk_overlap: Characters overlap between chunks (default: 50). Used only when embed_model is None.
79-
extract_images: Whether to extract images from PDFs (default: True)
79+
extract_images: Whether to extract images from PDFs (default: False)
8080
image_output_dir: Directory to save extracted images (default: './extracted_images')
8181
ocr_languages: List of OCR languages (default: ['eng', 'chi_sim', 'chi'])
8282
use_hi_res_strategy: Use high-resolution parsing (slower but more accurate)
@@ -92,7 +92,10 @@ def __init__(
9292
self.chunk_size = chunk_size
9393
self.chunk_overlap = chunk_overlap
9494
self.extract_images = extract_images
95-
self.image_output_dir = ensure_directory(image_output_dir)
95+
_default_img_dir = os.path.join(os.getcwd(), "logs", "extracted_images")
96+
self.image_output_dir = image_output_dir or _default_img_dir
97+
if extract_images:
98+
ensure_directory(self.image_output_dir)
9699
self.ocr_languages = ocr_languages or ["eng", "chi_sim", "chi"]
97100
self.use_hi_res_strategy = use_hi_res_strategy
98101
self.semantic_min_chunk_size = semantic_min_chunk_size
@@ -155,6 +158,7 @@ def parse_file(self, file_path: str) -> List[BaseNode]:
155158
)
156159

157160
if ext == ".docx":
161+
DocxParagraphPicturePartitioner.output_dir = self.image_output_dir
158162
register_picture_partitioner(DocxParagraphPicturePartitioner)
159163

160164
unstructured_kwargs = {

education-ai-suite/smart-classroom/content_search/providers/file_ingest_and_retrieve/indexer.py

Lines changed: 10 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,14 @@
88
from moviepy import VideoFileClip
99
from PIL import Image
1010

11-
from providers.file_ingest_and_retrieve.embedding import get_model_handler, EmbeddingModel
12-
from llama_index.embeddings.huggingface_openvino import OpenVINOEmbedding
13-
1411
from providers.chromadb_wrapper.chroma_client import ChromaClientWrapper
1512
from providers.file_ingest_and_retrieve.document_parser import DocumentParser
1613
from providers.file_ingest_and_retrieve.detector import Detector
1714
from providers.file_ingest_and_retrieve.utils import generate_unique_id, encode_image_to_base64
15+
from providers.file_ingest_and_retrieve.models import (
16+
get_visual_embedding_model,
17+
get_document_embedding_model,
18+
)
1819

1920
logger = logging.getLogger(__name__)
2021

@@ -26,15 +27,13 @@ def create_chroma_data(embedding, meta=None):
2627
return {"id": generate_unique_id(), "meta": meta, "vector": embedding}
2728

2829
class Indexer:
29-
def __init__(self, collection_name="content-search"):
30+
def __init__(self, collection_name="content-search", visual_embedding_model=None, document_embedding_model=None):
3031
self.client = ChromaClientWrapper()
3132
run_device = os.getenv("INGEST_DEVICE", "CPU")
3233
self.visual_collection_name = collection_name
33-
visual_model_name = os.getenv("VISUAL_EMBEDDING_MODEL", "CLIP/clip-vit-b-16")
34-
handler = get_model_handler(visual_model_name)
35-
handler.load_model()
3634

37-
self.visual_embedding_model = EmbeddingModel(handler)
35+
self.visual_embedding_model = visual_embedding_model or get_visual_embedding_model()
36+
3837
self.detector = Detector(device=run_device)
3938
self.visual_id_map = {}
4039
self.visual_db_inited = False
@@ -46,12 +45,7 @@ def __init__(self, collection_name="content-search"):
4645

4746
self.document_collection_name = f"{collection_name}_documents"
4847

49-
doc_model_path = os.getenv("DOC_EMBEDDING_MODEL", "BAAI/bge-small-en-v1.5")
50-
51-
self.document_embedding_model = OpenVINOEmbedding(
52-
model_id_or_path=doc_model_path,
53-
device=run_device,
54-
)
48+
self.document_embedding_model = document_embedding_model or get_document_embedding_model()
5549

5650
self.document_parser = DocumentParser(
5751
chunk_size=250,
@@ -304,7 +298,8 @@ def process_text(self, text: str, meta: dict) -> list:
304298
self.init_document_db_client(len(embedding))
305299

306300
node = create_chroma_data(embedding, meta_data)
307-
self._update_id_map(self.document_id_map, meta_data["file_path"], node["id"])
301+
file_path = meta_data.get("file_path", "__independent_text__")
302+
self._update_id_map(self.document_id_map, file_path, node["id"])
308303
return [node]
309304

310305
def ingest_text(self, text: str, meta: dict) -> dict:
education-ai-suite/smart-classroom/content_search/providers/file_ingest_and_retrieve/models.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
# Copyright (C) 2026 Intel Corporation
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
import logging
5+
import os
6+
from typing import Optional
7+
8+
logger = logging.getLogger(__name__)
9+
10+
# Global model cache to avoid duplicate loading
11+
_visual_embedding_model: Optional[object] = None
12+
_document_embedding_model: Optional[object] = None
13+
14+
15+
def get_visual_embedding_model():
16+
"""
17+
Lazy load and cache the visual embedding model (CLIP) once.
18+
19+
Returns:
20+
EmbeddingModel: Cached CLIP embedding model
21+
"""
22+
global _visual_embedding_model
23+
if _visual_embedding_model is None:
24+
from providers.file_ingest_and_retrieve.embedding import get_model_handler, EmbeddingModel
25+
26+
visual_model_name = os.getenv("VISUAL_EMBEDDING_MODEL", "CLIP/clip-vit-b-16")
27+
logger.info(f"Initializing visual embedding model: {visual_model_name}")
28+
29+
handler = get_model_handler(visual_model_name)
30+
handler.load_model()
31+
_visual_embedding_model = EmbeddingModel(handler)
32+
33+
logger.info("Visual embedding model initialized and cached")
34+
return _visual_embedding_model
35+
36+
37+
def get_document_embedding_model():
38+
"""
39+
Lazy load and cache the document embedding model (OpenVINOEmbedding) once.
40+
41+
Returns:
42+
OpenVINOEmbedding: Cached document embedding model
43+
"""
44+
global _document_embedding_model
45+
if _document_embedding_model is None:
46+
from llama_index.embeddings.huggingface_openvino import OpenVINOEmbedding
47+
48+
doc_model_path = os.getenv("DOC_EMBEDDING_MODEL", "BAAI/bge-small-en-v1.5")
49+
run_device = os.getenv("INGEST_DEVICE", "CPU")
50+
51+
logger.info(f"Initializing document embedding model: {doc_model_path} on device: {run_device}")
52+
53+
_document_embedding_model = OpenVINOEmbedding(
54+
model_id_or_path=doc_model_path,
55+
device=run_device,
56+
)
57+
58+
logger.info("Document embedding model initialized and cached")
59+
return _document_embedding_model

education-ai-suite/smart-classroom/content_search/providers/file_ingest_and_retrieve/retriever.py

Lines changed: 8 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -6,33 +6,25 @@
66
import base64
77
import io
88

9-
from providers.file_ingest_and_retrieve.embedding import get_model_handler, EmbeddingModel
10-
from llama_index.embeddings.huggingface_openvino import OpenVINOEmbedding
11-
129
from providers.chromadb_wrapper.chroma_client import ChromaClientWrapper
13-
14-
import os
10+
from providers.file_ingest_and_retrieve.models import (
11+
get_visual_embedding_model,
12+
get_document_embedding_model,
13+
)
1514

1615
class ChromaRetriever:
17-
def __init__(self, collection_name="default"):
16+
def __init__(self, collection_name="default", visual_embedding_model=None, document_embedding_model=None):
1817
self.client = ChromaClientWrapper()
1918

2019
self.visual_collection_name = collection_name
2120
self.client.load_collection(self.visual_collection_name)
22-
visual_model_name = os.getenv("VISUAL_EMBEDDING_MODEL", "CLIP/clip-vit-b-16")
23-
handler = get_model_handler(visual_model_name)
24-
handler.load_model()
25-
self.visual_embedding_model = EmbeddingModel(handler)
21+
22+
self.visual_embedding_model = visual_embedding_model or get_visual_embedding_model()
2623

2724
self.document_collection_name = f"{collection_name}_documents"
2825
self.client.load_collection(self.document_collection_name)
2926

30-
doc_model_path = os.getenv("DOC_EMBEDDING_MODEL", "BAAI/bge-small-en-v1.5")
31-
run_device = os.getenv("INGEST_DEVICE", "CPU")
32-
self.document_embedding_model = OpenVINOEmbedding(
33-
model_id_or_path=doc_model_path,
34-
device=run_device,
35-
)
27+
self.document_embedding_model = document_embedding_model or get_document_embedding_model()
3628

3729
def get_text_embedding(self, query):
3830
embedding_tensor = self.visual_embedding_model.handler.encode_text(query)

education-ai-suite/smart-classroom/content_search/providers/file_ingest_and_retrieve/server.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,10 @@ def format(self, record):
3838
from providers.minio_wrapper.minio_client import MinioStore
3939
from providers.file_ingest_and_retrieve.indexer import Indexer
4040
from providers.file_ingest_and_retrieve.retriever import ChromaRetriever
41+
from providers.file_ingest_and_retrieve.models import (
42+
get_visual_embedding_model,
43+
get_document_embedding_model,
44+
)
4145

4246
logger = logging.getLogger("visual_data_service")
4347

@@ -84,8 +88,11 @@ class IngestTextRequest(_IngestRequestBase):
8488

8589
_collection_name = os.getenv("CHROMA_COLLECTION_NAME", "content-search")
8690

87-
indexer = Indexer(collection_name=_collection_name)
88-
retriever = ChromaRetriever(collection_name=_collection_name)
91+
_visual_model = get_visual_embedding_model()
92+
_document_model = get_document_embedding_model()
93+
94+
indexer = Indexer(collection_name=_collection_name, visual_embedding_model=_visual_model, document_embedding_model=_document_model)
95+
retriever = ChromaRetriever(collection_name=_collection_name, visual_embedding_model=_visual_model, document_embedding_model=_document_model)
8996

9097
minio_store = MinioStore.from_config()
9198

education-ai-suite/smart-classroom/content_search/providers/file_ingest_and_retrieve/utils.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -77,23 +77,23 @@ class DocxParagraphPicturePartitioner:
7777
Custom partitioner to extract images from DOCX paragraphs.
7878
This preserves images that might be lost with standard parsing.
7979
"""
80+
output_dir: str = os.path.join(os.getcwd(), "logs", "extracted_images")
8081

8182
@classmethod
8283
def iter_elements(cls, paragraph: Paragraph, opts: DocxPartitionerOptions) -> Iterator[Image]:
8384
if paragraph is None:
8485
return
8586
imgs = paragraph._element.xpath(".//pic:pic")
8687
if imgs:
87-
img_output_dir = "extracted_images"
88-
os.makedirs(img_output_dir, exist_ok=True)
88+
os.makedirs(cls.output_dir, exist_ok=True)
8989
for img in imgs:
9090
try:
9191
embed = img.xpath(".//a:blip/@r:embed")[0]
9292
related_part = opts.document.part.related_parts[embed]
9393
image_blob = related_part.blob
9494
image = PILImage.open(BytesIO(image_blob))
9595
image_filename = f"{embed}_{related_part.sha1}.png"
96-
image_path = os.path.join(img_output_dir, image_filename)
96+
image_path = os.path.join(cls.output_dir, image_filename)
9797
image.save(image_path)
9898
element_metadata = ElementMetadata(image_path=image_path)
9999
yield Image(text="IMAGE", metadata=element_metadata)

education-ai-suite/smart-classroom/content_search/providers/vlm_openvino_serving/app.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,7 @@ def initialize_model():
189189
global pipe, processor, model_dir
190190
model_name = settings.VLM_MODEL_NAME
191191
model_dir = Path(model_name.split("/")[-1])
192-
model_dir = Path("models/openvino") / model_dir
192+
model_dir = Path(os.getcwd()).parent / "models" / "openvino" / model_dir
193193
model_dir.mkdir(parents=True, exist_ok=True)
194194
weight = settings.VLM_COMPRESSION_WEIGHT_FORMAT.lower()
195195
model_dir = model_dir / weight

education-ai-suite/smart-classroom/content_search/start_services.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def _load_config_to_env(config_path: str = "config.yaml") -> None:
3636

3737
cs = data.get("content_search", {})
3838

39-
def _set(k, v):
39+
def _set(k, v):
4040
if v is not None:
4141
os.environ.setdefault(k, str(v))
4242

@@ -52,6 +52,7 @@ def _set(k, v):
5252
server_addr = str(minio.get("server", "127.0.0.1:9000"))
5353
port = server_addr.rsplit(':', 1)[-1]
5454
_set("MINIO_ADDRESS", f":{port}")
55+
_set("MINIO_CONSOLE_ADDRESS", minio.get("console_address", ":9001"))
5556
_set("MINIO_ROOT_USER", minio.get("root_user", "minioadmin"))
5657
_set("MINIO_ROOT_PASSWORD", minio.get("root_password", "minioadmin"))
5758
_set("MINIO_DATA_DIR", minio.get("data_dir", "./minio_data"))
@@ -156,10 +157,8 @@ def main() -> None:
156157
if not chroma_exe:
157158
venv_exe = CONTENT_SEARCH_DIR / "venv_content_search" / "Scripts" / "chroma.exe"
158159
chroma_exe = str(venv_exe) if venv_exe.exists() else "chroma"
159-
minio_exe = str(CONTENT_SEARCH_DIR / "providers" / os.environ.get("MINIO_EXE", "minio_wrapper/minio.exe"))
160-
if not minio_exe:
161-
provider_minio = CONTENT_SEARCH_DIR / "providers" / "minio_wrapper" / "minio.exe"
162-
minio_exe = str(provider_minio) if provider_minio.exists() else "minio"
160+
provider_minio = CONTENT_SEARCH_DIR / "providers" / "minio_wrapper" / "minio.exe"
161+
minio_exe = str(provider_minio) if provider_minio.exists() else "minio"
163162
# no service current
164163
pg_bin_dir = Path(r"C:\Program Files\PostgreSQL\16\bin")
165164
pg_exe = str(pg_bin_dir / "postgres.exe")
@@ -178,7 +177,8 @@ def main() -> None:
178177
},
179178
"minio": {
180179
"cmd": [minio_exe, "server", os.environ.get("MINIO_DATA_DIR", "./minio_data"),
181-
"--address", os.environ.get("MINIO_ADDRESS", ":9000")],
180+
"--address", os.environ.get("MINIO_ADDRESS", ":9000"),
181+
"--console-address", os.environ.get("MINIO_CONSOLE_ADDRESS", ":9001")],
182182
"cwd": CONTENT_SEARCH_DIR,
183183
"extra_env": {
184184
"MINIO_ROOT_USER": os.environ.get("MINIO_ROOT_USER", "minioadmin"),

0 commit comments

Comments
 (0)