Content Search: refactor model initialization and fix ID mapping issues (open-edge-platform#2376)

BaoHuiling · web-flow · commit 0884090b7b31 · 2026-03-31T17:07:53.000+08:00
diff --git a/education-ai-suite/smart-classroom/config.yaml b/education-ai-suite/smart-classroom/config.yaml
@@ -391,16 +391,15 @@ content_search:
   chromadb:
     host: "127.0.0.1"
     port: 9090
-    data_dir: '.\chromadb_wrapper\chroma_data'
+    data_dir: '.\providers\chromadb_wrapper\chroma_data'
 
   minio:
     server: "127.0.0.1:9000"
     console_address: ":9001"
     root_user: "minioadmin"
     root_password: "minioadmin"
     bucket: "content-search"
-    minio_exe: '.\minio_wrapper\minio.exe'
-    data_dir: '.\minio_wrapper\minio_data'
+    data_dir: '.\providers\minio_wrapper\minio_data'
 
   vlm:
     model_name: "Qwen/Qwen2.5-VL-3B-Instruct"
diff --git a/education-ai-suite/smart-classroom/content_search/providers/file_ingest_and_retrieve/detector.py b/education-ai-suite/smart-classroom/content_search/providers/file_ingest_and_retrieve/detector.py
@@ -4,19 +4,17 @@
 
 import os
 import subprocess
+from pathlib import Path
 
 from PIL import Image
 import numpy as np
 import openvino as ov
 
 from providers.file_ingest_and_retrieve.yolox_utils import preproc, multiclass_nms, demo_postprocess
 
-MODEL_DIR = "./models"
-
-
 class Detector:
     def __init__(self, device="CPU", conf=0.85, nms=0.45, input_size=(640, 640)):
-        self.model_path = os.path.join(MODEL_DIR, "detection_model")
+        self.model_path = Path(os.getcwd()).parent / "models" / "detection_model"
         self.model_file = os.path.join(self.model_path, "yolox_s.xml")
         self.download_model()
         self.device = device
diff --git a/education-ai-suite/smart-classroom/content_search/providers/file_ingest_and_retrieve/document_parser.py b/education-ai-suite/smart-classroom/content_search/providers/file_ingest_and_retrieve/document_parser.py
@@ -61,8 +61,8 @@ def __init__(
         self,
         chunk_size: int = 250,
         chunk_overlap: int = 50,
-        extract_images: bool = True,
-        image_output_dir: str = "./extracted_images",
+        extract_images: bool = False,
+        image_output_dir: Optional[str] = None,
         ocr_languages: Optional[List[str]] = None,
         use_hi_res_strategy: bool = True,
         embed_model=None,
@@ -76,7 +76,7 @@ def __init__(
         Args:
             chunk_size: Maximum characters per chunk (default: 250). Used only when embed_model is None.
             chunk_overlap: Characters overlap between chunks (default: 50). Used only when embed_model is None.
-            extract_images: Whether to extract images from PDFs (default: True)
+            extract_images: Whether to extract images from PDFs (default: False)
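-            image_output_dir: Directory to save extracted images (default: './extracted_images')
+            image_output_dir: Directory to save extracted images (default: None, which falls back to '<cwd>/logs/extracted_images')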
             ocr_languages: List of OCR languages (default: ['eng', 'chi_sim', 'chi'])
             use_hi_res_strategy: Use high-resolution parsing (slower but more accurate)
@@ -92,7 +92,10 @@ def __init__(
         self.chunk_size = chunk_size
         self.chunk_overlap = chunk_overlap
         self.extract_images = extract_images
-        self.image_output_dir = ensure_directory(image_output_dir)
+        _default_img_dir = os.path.join(os.getcwd(), "logs", "extracted_images")
+        self.image_output_dir = image_output_dir or _default_img_dir
+        if extract_images:
+            ensure_directory(self.image_output_dir)
         self.ocr_languages = ocr_languages or ["eng", "chi_sim", "chi"]
         self.use_hi_res_strategy = use_hi_res_strategy
         self.semantic_min_chunk_size = semantic_min_chunk_size
@@ -155,6 +158,7 @@ def parse_file(self, file_path: str) -> List[BaseNode]:
                 )
 
         if ext == ".docx":
+            DocxParagraphPicturePartitioner.output_dir = self.image_output_dir
             register_picture_partitioner(DocxParagraphPicturePartitioner)
 
         unstructured_kwargs = {
diff --git a/education-ai-suite/smart-classroom/content_search/providers/file_ingest_and_retrieve/indexer.py b/education-ai-suite/smart-classroom/content_search/providers/file_ingest_and_retrieve/indexer.py
@@ -8,13 +8,14 @@
 from moviepy import VideoFileClip
 from PIL import Image
 
-from providers.file_ingest_and_retrieve.embedding import get_model_handler, EmbeddingModel
-from llama_index.embeddings.huggingface_openvino import OpenVINOEmbedding
-
 from providers.chromadb_wrapper.chroma_client import ChromaClientWrapper
 from providers.file_ingest_and_retrieve.document_parser import DocumentParser
 from providers.file_ingest_and_retrieve.detector import Detector
 from providers.file_ingest_and_retrieve.utils import generate_unique_id, encode_image_to_base64
+from providers.file_ingest_and_retrieve.models import (
+    get_visual_embedding_model,
+    get_document_embedding_model,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -26,15 +27,13 @@ def create_chroma_data(embedding, meta=None):
     return {"id": generate_unique_id(), "meta": meta, "vector": embedding}
 
 class Indexer:
-    def __init__(self, collection_name="content-search"):
+    def __init__(self, collection_name="content-search", visual_embedding_model=None, document_embedding_model=None):
         self.client = ChromaClientWrapper()
         run_device = os.getenv("INGEST_DEVICE", "CPU")
         self.visual_collection_name = collection_name
-        visual_model_name = os.getenv("VISUAL_EMBEDDING_MODEL", "CLIP/clip-vit-b-16")
-        handler = get_model_handler(visual_model_name)
-        handler.load_model()
 
-        self.visual_embedding_model = EmbeddingModel(handler)
+        self.visual_embedding_model = visual_embedding_model or get_visual_embedding_model()
+
         self.detector = Detector(device=run_device)
         self.visual_id_map = {}
         self.visual_db_inited = False
@@ -46,12 +45,7 @@ def __init__(self, collection_name="content-search"):
 
         self.document_collection_name = f"{collection_name}_documents"
 
-        doc_model_path = os.getenv("DOC_EMBEDDING_MODEL", "BAAI/bge-small-en-v1.5")
-
-        self.document_embedding_model = OpenVINOEmbedding(
-            model_id_or_path=doc_model_path,
-            device=run_device,
-        )
+        self.document_embedding_model = document_embedding_model or get_document_embedding_model()
 
         self.document_parser = DocumentParser(
             chunk_size=250,
@@ -304,7 +298,8 @@ def process_text(self, text: str, meta: dict) -> list:
             self.init_document_db_client(len(embedding))
 
         node = create_chroma_data(embedding, meta_data)
-        self._update_id_map(self.document_id_map, meta_data["file_path"], node["id"])
+        file_path = meta_data.get("file_path", "__independent_text__")
+        self._update_id_map(self.document_id_map, file_path, node["id"])
         return [node]
 
     def ingest_text(self, text: str, meta: dict) -> dict:
diff --git a/education-ai-suite/smart-classroom/content_search/providers/file_ingest_and_retrieve/models.py b/education-ai-suite/smart-classroom/content_search/providers/file_ingest_and_retrieve/models.py
@@ -0,0 +1,59 @@
+# Copyright (C) 2026 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import logging
+import os
+from typing import Optional
+
+logger = logging.getLogger(__name__)
+
+# Global model cache to avoid duplicate loading
+_visual_embedding_model: Optional[object] = None
+_document_embedding_model: Optional[object] = None
+
+
+def get_visual_embedding_model():
+    """
+    Lazily load the visual embedding model named by VISUAL_EMBEDDING_MODEL (default: CLIP/clip-vit-b-16) and cache it.
+
+    Returns:
+        EmbeddingModel: The cached visual embedding model (loaded on first call only)
+    """
+    global _visual_embedding_model
+    if _visual_embedding_model is None:
+        from providers.file_ingest_and_retrieve.embedding import get_model_handler, EmbeddingModel
+
+        visual_model_name = os.getenv("VISUAL_EMBEDDING_MODEL", "CLIP/clip-vit-b-16")
+        logger.info(f"Initializing visual embedding model: {visual_model_name}")
+
+        handler = get_model_handler(visual_model_name)
+        handler.load_model()
+        _visual_embedding_model = EmbeddingModel(handler)
+
+        logger.info("Visual embedding model initialized and cached")
+    return _visual_embedding_model
+
+
+def get_document_embedding_model():
+    """
+    Lazily load the OpenVINOEmbedding document model (DOC_EMBEDDING_MODEL on INGEST_DEVICE) and cache it.
+
+    Returns:
+        OpenVINOEmbedding: The cached document embedding model (loaded on first call only)
+    """
+    global _document_embedding_model
+    if _document_embedding_model is None:
+        from llama_index.embeddings.huggingface_openvino import OpenVINOEmbedding
+
+        doc_model_path = os.getenv("DOC_EMBEDDING_MODEL", "BAAI/bge-small-en-v1.5")
+        run_device = os.getenv("INGEST_DEVICE", "CPU")
+
+        logger.info(f"Initializing document embedding model: {doc_model_path} on device: {run_device}")
+
+        _document_embedding_model = OpenVINOEmbedding(
+            model_id_or_path=doc_model_path,
+            device=run_device,
+        )
+
+        logger.info("Document embedding model initialized and cached")
+    return _document_embedding_model
diff --git a/education-ai-suite/smart-classroom/content_search/providers/file_ingest_and_retrieve/retriever.py b/education-ai-suite/smart-classroom/content_search/providers/file_ingest_and_retrieve/retriever.py
@@ -6,33 +6,25 @@
 import base64
 import io
 
-from providers.file_ingest_and_retrieve.embedding import get_model_handler, EmbeddingModel
-from llama_index.embeddings.huggingface_openvino import OpenVINOEmbedding
-
 from providers.chromadb_wrapper.chroma_client import ChromaClientWrapper
-
-import os
+from providers.file_ingest_and_retrieve.models import (
+    get_visual_embedding_model,
+    get_document_embedding_model,
+)
 
 class ChromaRetriever:
-    def __init__(self, collection_name="default"):
+    def __init__(self, collection_name="default", visual_embedding_model=None, document_embedding_model=None):
         self.client = ChromaClientWrapper()
 
         self.visual_collection_name = collection_name
         self.client.load_collection(self.visual_collection_name)
-        visual_model_name = os.getenv("VISUAL_EMBEDDING_MODEL", "CLIP/clip-vit-b-16")
-        handler = get_model_handler(visual_model_name)
-        handler.load_model()
-        self.visual_embedding_model = EmbeddingModel(handler)
+
+        self.visual_embedding_model = visual_embedding_model or get_visual_embedding_model()
 
         self.document_collection_name = f"{collection_name}_documents"
         self.client.load_collection(self.document_collection_name)
 
-        doc_model_path = os.getenv("DOC_EMBEDDING_MODEL", "BAAI/bge-small-en-v1.5")
-        run_device = os.getenv("INGEST_DEVICE", "CPU")
-        self.document_embedding_model = OpenVINOEmbedding(
-            model_id_or_path=doc_model_path,
-            device=run_device,
-        )
+        self.document_embedding_model = document_embedding_model or get_document_embedding_model()
 
     def get_text_embedding(self, query):
         embedding_tensor = self.visual_embedding_model.handler.encode_text(query)
diff --git a/education-ai-suite/smart-classroom/content_search/providers/file_ingest_and_retrieve/server.py b/education-ai-suite/smart-classroom/content_search/providers/file_ingest_and_retrieve/server.py
@@ -38,6 +38,10 @@ def format(self, record):
 from providers.minio_wrapper.minio_client import MinioStore
 from providers.file_ingest_and_retrieve.indexer import Indexer
 from providers.file_ingest_and_retrieve.retriever import ChromaRetriever
+from providers.file_ingest_and_retrieve.models import (
+    get_visual_embedding_model,
+    get_document_embedding_model,
+)
 
 logger = logging.getLogger("visual_data_service")
 
@@ -84,8 +88,11 @@ class IngestTextRequest(_IngestRequestBase):
 
 _collection_name = os.getenv("CHROMA_COLLECTION_NAME", "content-search")
 
-indexer = Indexer(collection_name=_collection_name)
-retriever = ChromaRetriever(collection_name=_collection_name)
+_visual_model = get_visual_embedding_model()
+_document_model = get_document_embedding_model()
+
+indexer = Indexer(collection_name=_collection_name, visual_embedding_model=_visual_model, document_embedding_model=_document_model)
+retriever = ChromaRetriever(collection_name=_collection_name, visual_embedding_model=_visual_model, document_embedding_model=_document_model)
 
 minio_store = MinioStore.from_config()
 
diff --git a/education-ai-suite/smart-classroom/content_search/providers/file_ingest_and_retrieve/utils.py b/education-ai-suite/smart-classroom/content_search/providers/file_ingest_and_retrieve/utils.py
@@ -77,23 +77,23 @@ class DocxParagraphPicturePartitioner:
     Custom partitioner to extract images from DOCX paragraphs.
     This preserves images that might be lost with standard parsing.
     """
+    output_dir: str = os.path.join(os.getcwd(), "logs", "extracted_images")
 
     @classmethod
     def iter_elements(cls, paragraph: Paragraph, opts: DocxPartitionerOptions) -> Iterator[Image]:
         if paragraph is None:
             return
         imgs = paragraph._element.xpath(".//pic:pic")
         if imgs:
-            img_output_dir = "extracted_images"
-            os.makedirs(img_output_dir, exist_ok=True)
+            os.makedirs(cls.output_dir, exist_ok=True)
             for img in imgs:
                 try:
                     embed = img.xpath(".//a:blip/@r:embed")[0]
                     related_part = opts.document.part.related_parts[embed]
                     image_blob = related_part.blob
                     image = PILImage.open(BytesIO(image_blob))
                     image_filename = f"{embed}_{related_part.sha1}.png"
-                    image_path = os.path.join(img_output_dir, image_filename)
+                    image_path = os.path.join(cls.output_dir, image_filename)
                     image.save(image_path)
                     element_metadata = ElementMetadata(image_path=image_path)
                     yield Image(text="IMAGE", metadata=element_metadata)
diff --git a/education-ai-suite/smart-classroom/content_search/providers/vlm_openvino_serving/app.py b/education-ai-suite/smart-classroom/content_search/providers/vlm_openvino_serving/app.py
@@ -189,7 +189,7 @@ def initialize_model():
     global pipe, processor, model_dir
     model_name = settings.VLM_MODEL_NAME
     model_dir = Path(model_name.split("/")[-1])
-    model_dir = Path("models/openvino") / model_dir
+    model_dir = Path(os.getcwd()).parent / "models" / "openvino" / model_dir
     model_dir.mkdir(parents=True, exist_ok=True)
     weight = settings.VLM_COMPRESSION_WEIGHT_FORMAT.lower()
     model_dir = model_dir / weight
diff --git a/education-ai-suite/smart-classroom/content_search/start_services.py b/education-ai-suite/smart-classroom/content_search/start_services.py
@@ -36,7 +36,7 @@ def _load_config_to_env(config_path: str = "config.yaml") -> None:
 
         cs = data.get("content_search", {})
 
-        def _set(k, v): 
+        def _set(k, v):
             if v is not None:
                 os.environ.setdefault(k, str(v))
 
@@ -52,6 +52,7 @@ def _set(k, v):
         server_addr = str(minio.get("server", "127.0.0.1:9000"))
         port = server_addr.rsplit(':', 1)[-1]
         _set("MINIO_ADDRESS", f":{port}")
+        _set("MINIO_CONSOLE_ADDRESS", minio.get("console_address", ":9001"))
         _set("MINIO_ROOT_USER", minio.get("root_user", "minioadmin"))
         _set("MINIO_ROOT_PASSWORD", minio.get("root_password", "minioadmin"))
         _set("MINIO_DATA_DIR", minio.get("data_dir", "./minio_data"))
@@ -156,10 +157,8 @@ def main() -> None:
     if not chroma_exe:
         venv_exe = CONTENT_SEARCH_DIR / "venv_content_search" / "Scripts" / "chroma.exe"
         chroma_exe = str(venv_exe) if venv_exe.exists() else "chroma"
-    minio_exe = str(CONTENT_SEARCH_DIR / "providers" / os.environ.get("MINIO_EXE", "minio_wrapper/minio.exe"))
-    if not minio_exe:
-        provider_minio = CONTENT_SEARCH_DIR / "providers" / "minio_wrapper" / "minio.exe"
-        minio_exe = str(provider_minio) if provider_minio.exists() else "minio"
+    provider_minio = CONTENT_SEARCH_DIR / "providers" / "minio_wrapper" / "minio.exe"
+    minio_exe = str(provider_minio) if provider_minio.exists() else "minio"
     # no service current
     pg_bin_dir = Path(r"C:\Program Files\PostgreSQL\16\bin")
     pg_exe = str(pg_bin_dir / "postgres.exe")
@@ -178,7 +177,8 @@ def main() -> None:
         },
         "minio": {
             "cmd": [minio_exe, "server", os.environ.get("MINIO_DATA_DIR", "./minio_data"),
-                    "--address", os.environ.get("MINIO_ADDRESS", ":9000")],
+                    "--address", os.environ.get("MINIO_ADDRESS", ":9000"),
+                    "--console-address", os.environ.get("MINIO_CONSOLE_ADDRESS", ":9001")],
             "cwd": CONTENT_SEARCH_DIR,
             "extra_env": {
                 "MINIO_ROOT_USER": os.environ.get("MINIO_ROOT_USER", "minioadmin"),