
Commit addede3

Fix for issues identified during RC1 validation (10Nov) (open-edge-platform#1209)

Authored-by: bharaghahteeyeoh
Co-authored-by: Hoong Tee, Yeoh <[email protected]>
1 parent 47628eb

11 files changed: +60 additions, -33 deletions

microservices/document-ingestion/pgvector/docker/compose-dev.yaml

Lines changed: 2 additions & 1 deletion

@@ -9,7 +9,8 @@ services:
 http_proxy: ${http_proxy}
 https_proxy: ${https_proxy}
 no_proxy: ${no_proxy}
-image: intel/document-ingestion:1.2.0-dev
+#TODO: Configure image version as an env parameter
+image: intel/document-ingestion:1.2.2-dev
 environment:
 DEFAULT_BUCKET: "intel.gai.dev.test"
 OBJECT_PREFIX: "test"
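
The dev compose file now pins the `1.2.2-dev` tag until the TODO to make the image version an env parameter lands. An optional sanity check, sketched under the assumption that the commands run from `microservices/document-ingestion/pgvector` with the required environment variables already exported:

```bash
# Pull the pinned dev image and confirm the resolved compose config references it
docker pull intel/document-ingestion:1.2.2-dev
docker compose -f docker/compose.yaml -f docker/compose-dev.yaml config | grep "image:"
```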

microservices/document-ingestion/pgvector/docker/compose.yaml

Lines changed: 4 additions & 3 deletions

@@ -24,7 +24,8 @@ services:
 http_proxy: ${http_proxy}
 https_proxy: ${https_proxy}
 no_proxy: ${no_proxy}
-image: intel/document-ingestion:1.2.0
+#TODO: Configure image version as an env parameter
+image: intel/document-ingestion:1.2.2
 environment:
 http_proxy: ${http_proxy}
 https_proxy: ${https_proxy}

@@ -41,8 +42,8 @@ services:
 MINIO_HOST: ${MINIO_HOST:-minio-server}
 MINIO_API_PORT: ${MINIO_API_PORT:-9000}
 # Raise error if following required env vars is not set
-MINIO_ACCESS_KEY: ${MINIO_ACCESS_KEY:?error}
-MINIO_SECRET_KEY: ${MINIO_SECRET_KEY:?error}
+MINIO_ROOT_USER: ${MINIO_USER:?error}
+MINIO_ROOT_PASSWORD: ${MINIO_PASSWD:?error}
 ports:
 - "${DATAPREP_HOST_PORT:-8000}:8000"
 depends_on:
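
The `:?error` substitutions make the new MinIO credential variables mandatory: compose aborts with an error if `MINIO_USER` or `MINIO_PASSWD` is unset. A minimal sketch of setting them before validating the config (the values below are placeholders, not project defaults):

```bash
# Required credentials; compose fails fast via ":?error" if these are missing
export MINIO_USER=minioadmin        # placeholder value
export MINIO_PASSWD=changeme123     # placeholder value

# With these (and any other required variables) set, the config renders;
# without them, this command errors out instead of starting with empty credentials
docker compose -f docker/compose.yaml config
```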

microservices/document-ingestion/pgvector/docs/get-started.md

Lines changed: 1 addition & 0 deletions

@@ -99,6 +99,7 @@ This method provides the fastest way to get started with the microservice.
 source ./run.sh --conf
 # This will output docker compose configs with all the environment variables resolved. The user can verify whether they are configured correctly.
 ```
+The valid configuration will ensure the latest prebuilt image from `intel` registry is downloaded. The scripts take care of this.
 5. **Start the Microservices**:
 There are different options provided to start the microservices.
 ```bash

microservices/document-ingestion/pgvector/run.sh

Lines changed: 25 additions & 11 deletions

@@ -118,7 +118,7 @@ if [ "$1" = "--nosetup" ] && [ "$#" -eq 1 ]; then
 
 # Verify the configuration of docker compose
 elif [ "$1" = "--conf" ] && [ "$#" -eq 1 ]; then
-    docker compose config
+    docker compose -f docker/compose.yaml config
 
 # Stop and remove containers and networks (basic down)
 elif [ "$1" = "--down" ] && [ "$#" -eq 1 ]; then

@@ -184,28 +184,42 @@ elif [ "$#" -eq 0 ]; then
 # Remove all project-related Docker images
 elif [ "$1" = "--clean" ] && [ "$#" -eq 1 ]; then
     echo "Removing all ${PROJECT_NAME} related Docker images..."
-    docker images --filter "label=project=${PROJECT_NAME}" -q | xargs -r docker rmi -f
-    # Fallback: also remove legacy images that may not have labels
-    docker images | grep "${PROJECT_NAME}" | awk '{print $3}' | xargs -r docker rmi -f
-    docker images | grep "intel/document-ingestion" | awk '{print $3}' | xargs -r docker rmi -f
+
+    # Use docker compose to remove all images from services
+    docker compose -f docker/compose.yaml down --rmi all 2>/dev/null || true
+
+    # Also remove dev environment images if exists
+    if [ -f "docker/compose-dev.yaml" ]; then
+        docker compose -f docker/compose.yaml -f docker/compose-dev.yaml down --rmi all 2>/dev/null || true
+    fi
+
+    # Remove any remaining labeled images
+    docker images --filter "label=project=${PROJECT_NAME}" -q | xargs -r docker rmi -f 2>/dev/null || true
+
     echo "Cleanup completed!"
 
 # Remove specific service image using labels
 elif [ "$1" = "--clean" ] && [ "$2" = "dataprep" ] && [ "$#" -eq 2 ]; then
     echo "Removing dataprep service images..."
     docker images --filter "label=project=${PROJECT_NAME}" --filter "label=service=dataprep" -q | xargs -r docker rmi -f
     # Fallback: also remove legacy images that may not have labels
-    docker images | grep "intel/document-ingestion" | awk '{print $3}' | xargs -r docker rmi -f
+    docker images | grep "intel/document-ingestion" | awk '{print $3}' | xargs -r docker rmi -f 2>/dev/null || true
     echo "Dataprep images removed!"
 
 # Complete cleanup - stop containers, remove containers, networks, volumes, and images
 elif [ "$1" = "--purge" ] && [ "$#" -eq 1 ]; then
     echo "Performing complete cleanup..."
-    docker compose -f docker/compose.yaml down --volumes --remove-orphans
-    docker images --filter "label=project=${PROJECT_NAME}" -q | xargs -r docker rmi -f
-    # Fallback cleanup for legacy images
-    docker images | grep "${PROJECT_NAME}" | awk '{print $3}' | xargs -r docker rmi -f
-    docker images | grep "intel/document-ingestion" | awk '{print $3}' | xargs -r docker rmi -f
+
+    # Stop everything and remove all resources including images
+    docker compose -f docker/compose.yaml down --rmi all --volumes --remove-orphans 2>/dev/null || true
+
+    if [ -f "docker/compose-dev.yaml" ]; then
+        docker compose -f docker/compose.yaml -f docker/compose-dev.yaml down --rmi all --volumes --remove-orphans 2>/dev/null || true
+    fi
+
+    # Clean any remaining labeled images
+    docker images --filter "label=project=${PROJECT_NAME}" -q | xargs -r docker rmi -f 2>/dev/null || true
+
     echo "Complete cleanup finished!"
 
 else
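
With these changes, the cleanup paths go through `docker compose down --rmi all` first and only then fall back to label-filtered `docker rmi`, and the `2>/dev/null || true` guards make repeated runs tolerant of already-removed images. Typical invocations of the updated script, sketched from the flags shown above and the documented pattern of sourcing `run.sh`:

```bash
cd microservices/document-ingestion/pgvector

source ./run.sh --conf    # render the resolved docker/compose.yaml config for verification
source ./run.sh --clean   # remove project images via compose, then any leftover labeled images
source ./run.sh --purge   # also remove containers, networks, and volumes before image cleanup
```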

sample-applications/chat-question-and-answer/docs/user-guide/get-started.md

Lines changed: 7 additions & 5 deletions

@@ -41,11 +41,13 @@ The sample application has been validated with a few models just to validate the
 ### LLM Models validated for each model server
 | Model Server | Models Validated |
 |--------------|-------------------|
-| `vLLM` | `Intel/neural-chat-7b-v3-3`, `Qwen/Qwen2.5-7B-Instruct`, `microsoft/Phi-3.5-mini-instruct`, `meta-llama/Llama-3.1-8B-instruct`, `deepseek-ai/DeepSeek-R1-Distill-Qwen-7B` |
+| `vLLM` (deprecated) | `Intel/neural-chat-7b-v3-3`, `Qwen/Qwen2.5-7B-Instruct`, `microsoft/Phi-3.5-mini-instruct`, `meta-llama/Llama-3.1-8B-instruct`, `deepseek-ai/DeepSeek-R1-Distill-Qwen-7B` |
 | `OVMS` | `Intel/neural-chat-7b-v3-3`, `Qwen/Qwen2.5-7B-Instruct`, `microsoft/Phi-3.5-mini-instruct`, `meta-llama/Llama-3.1-8B-instruct`, `deepseek-ai/DeepSeek-R1-Distill-Qwen-7B` |
-| `TGI` | `Intel/neural-chat-7b-v3-3`, `Qwen/Qwen2.5-7B-Instruct`, `microsoft/Phi-3.5-mini-instruct`, `meta-llama/Llama-3.1-8B-instruct`, `deepseek-ai/DeepSeek-R1-Distill-Qwen-7B` |
+| `TGI` (deprecated) | `Intel/neural-chat-7b-v3-3`, `Qwen/Qwen2.5-7B-Instruct`, `microsoft/Phi-3.5-mini-instruct`, `meta-llama/Llama-3.1-8B-instruct`, `deepseek-ai/DeepSeek-R1-Distill-Qwen-7B` |
 
-Note: Limited validation was done on DeepSeek model.
+**Note:**
+1. Limited validation was done on DeepSeek model.
+2. Effective 2025.2.0 release, support for vLLM and TGI is deprecated. The functionality is not guaranteed to work and the user is advised not to use them. Should there be a strong requirement for the same, please raise an issue in github.
 
 ### Reranker Models validated
 | Model Server | Models Validated |

@@ -98,7 +100,7 @@ Visit https://huggingface.co/settings/tokens to get your token.
 export LLM_MODEL=Qwen/Qwen2.5-7B-Instruct
 export EMBEDDING_MODEL_NAME=Alibaba-NLP/gte-large-en-v1.5
 export RERANKER_MODEL=BAAI/bge-reranker-base
-export DEVICE="CPU" #Options: CPU for VLLM and TGI. GPU is only enabled for openvino model server(OVMS) .
+export DEVICE="CPU" #Options: GPU is enabled for openvino model server(OVMS) .
 export OTLP_ENDPOINT_TRACE=<otlp-endpoint-trace> # Optional. Set only if there is an OTLP endpoint available or can be ignored
 export OTLP_ENDPOINT=<otlp-endpoint> # Optional. Set only if there is an OTLP endpoint available or can be ignored
 ```

@@ -111,7 +113,7 @@ Visit https://huggingface.co/settings/tokens to get your token.
 export TAG=2.0.0
 source setup.sh llm=<model-server> embed=<embedding>
 # Below are the options
-# model-server: VLLM , OVMS, TGI
+# model-server: VLLM (deprecated), OVMS, TGI (deprecated)
 # embedding: OVMS, TEI
 ```
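
Since vLLM and TGI are deprecated effective 2025.2.0, the supported path uses OVMS for the LLM. A minimal sketch of that default setup using the values shown in the diff above; the `llm=OVMS embed=OVMS` combination is assumed from the documented option lists:

```bash
export LLM_MODEL=Qwen/Qwen2.5-7B-Instruct
export EMBEDDING_MODEL_NAME=Alibaba-NLP/gte-large-en-v1.5
export RERANKER_MODEL=BAAI/bge-reranker-base
export DEVICE="CPU"   # GPU is enabled only for the OpenVINO Model Server (OVMS)
export TAG=2.0.0

# OVMS remains the supported model server; VLLM and TGI are deprecated
source setup.sh llm=OVMS embed=OVMS
```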

sample-applications/chat-question-and-answer/docs/user-guide/index.rst

Lines changed: 1 addition & 1 deletion

@@ -32,7 +32,7 @@ Technical Architecture
 
 The ChatQ&A sample application includes the following components:
 
-- **LLM inference microservice**: Intel's optimized `OpenVINO Model Server (OVMS) <https://github.com/openvinotoolkit/model_server>`__ is used to efficiently run large language models on Intel hardware. Developers also have other model serving options if required. vLLM with OpenVINO backend and TGI are the options provided.
+- **LLM inference microservice**: Intel's optimized `OpenVINO Model Server (OVMS) <https://github.com/openvinotoolkit/model_server>`__ is used to efficiently run large language models on Intel hardware. Developers also have other model serving options if required. vLLM with OpenVINO backend and TGI are the options provided. (*Note: vLLM and TGI are deprecated effective 2025.2.0 release.*)
 - **Embedding inference microservice**: Intel's optimized `OpenVINO Model Server (OVMS) <https://github.com/openvinotoolkit/model_server>`__ and Huggingface `Text Embeddings Inference <https://github.com/huggingface/text-embeddings-inference>`__ microservice are the options provided to run embedding models efficiently on target Intel hardware. OVMS is the default option due to performance benefits on Intel hardware.
 - **Reranking inference microservice**: Huggingface `Text Embeddings Inference <https://github.com/huggingface/text-embeddings-inference>`__ microservice is the model serving choice available.
 - **Document ingestion microservice**: The sample `document ingestion <https://github.com/open-edge-platform/edge-ai-libraries/tree/release-1.2.0/microservices/document-ingestion/pgvector>`__ microservice allows ingestion of common document formats like PDF and DOC, and contents from web links. It supports a REST endpoint to ingest the documents. The ingestion process creates embeddings of the documents and stores them in the preferred vector database. The modular architecture allows users to customize the vector database. The sample application uses `PGvector <https://github.com/pgvector/pgvector>`__ database. The raw documents are stored in the `MinIO <https://github.com/minio/minio>`__ datastore, which is also customizable.

sample-applications/chat-question-and-answer/docs/user-guide/overview-architecture.md

Lines changed: 2 additions & 2 deletions

@@ -96,10 +96,10 @@ The application flow is illustrated in the flow diagram below. The diagram shows
 The ChatQ&A sample application is designed with modularity in mind, allowing developers to:
 1. **Change inference microservices**:
    - The default option is OVMS for LLM and TEI for embeddings and reranker.
-   - Use other model servers like vLLM with OpenVINO backend, and TGI to host LLM models.
+   - (*Deprecated effective 2025.2.0*) Use other model servers like vLLM with OpenVINO backend, and TGI to host LLM models.
    - Mandatory requirement is OpenAI API compliance. Note that other model servers are not guaranteed to provide same performance as default options.
 2. **Load different LLM, Embedding, and Reranker models**:
-   - Use different models from Hugging Face OpenVINO model hub or vLLM model hub. The models are passed as a parameter to corresponding model servers.
+   - Use different models from Hugging Face OpenVINO model hub or vLLM model hub. The models are passed as a parameter to corresponding model servers. (*vLLM support is deprecated effective 2025.2.0*)
 3. **Use other GenAI frameworks like Haystack and LlamaIndex**:
    - Integrate the inference microservices into an application backend developed on other frameworks similar to the LangChain integration provided in this sample application.
 4. **Deploy on diverse Intel target hardware and deployment scenarios**:

sample-applications/chat-question-and-answer/docs/user-guide/overview.md

Lines changed: 1 addition & 1 deletion

@@ -26,7 +26,7 @@ Key features include:
 
 The ChatQ&A sample application includes the following components:
 
-- **LLM inference microservice**: Intel's optimized [OpenVINO Model Server (OVMS)](https://github.com/openvinotoolkit/model_server) is used to efficiently run large language models on Intel hardware. Developers also have other model serving options if required. vLLM with OpenVINO backend and TGI are the options provided.
+- **LLM inference microservice**: Intel's optimized [OpenVINO Model Server (OVMS)](https://github.com/openvinotoolkit/model_server) is used to efficiently run large language models on Intel hardware. Developers also have other model serving options if required. vLLM with OpenVINO backend and TGI are the options provided. (*vLLM and TGI support is deprecated effective 2025.2.0*)
 - **Embedding inference microservice**: Intel's optimized [OpenVINO Model Server (OVMS)](https://github.com/openvinotoolkit/model_server) and Huggingface [Text Embeddings Inference](https://github.com/huggingface/text-embeddings-inference) microservice are the options provided to run embedding models efficiently on target Intel hardware. OVMS is the default option due to performance benefits on Intel hardware.
 - **Reranking inference microservice**: Huggingface [Text Embeddings Inference](https://github.com/huggingface/text-embeddings-inference) microservice is the model serving choice available.
 - **Document ingestion microservice**: The sample [document ingestion](../../../../microservices/document-ingestion/) microservice allows ingestion of common document formats like PDF and DOC, and contents from web links. It supports a REST endpoint to ingest the documents. The ingestion process creates embeddings of the documents and stores them in the preferred vector database. The modular architecture allows users to customize the vector database. The sample application uses [PGvector](https://github.com/pgvector/pgvector) database. The raw documents are stored in the [MinIO](https://github.com/minio/minio) datastore, which is also customizable.

sample-applications/chat-question-and-answer/docs/user-guide/release-notes.md

Lines changed: 1 addition & 1 deletion

@@ -16,7 +16,7 @@
 - The upload button is temporarily disabled during chat response generation to prevent delays. File or link uploads trigger embedding generation, which runs on the same OVMS server as the LLM, potentially slowing response streaming if both run together.
 - Chat data is stored in localStorage for session continuity. After container restarts, old chats may reappear — clear your browser’s localStorage to start fresh.
 - Limited validation done on EMT-S due to EMT-S issues. Not recommended to use ChatQnA modular on EMT-S until full validation is completed.
-- TGI on EMT 3.0 on Core&trade; configuration has a long startup time due to resource constraints. Alternative is to use TGI only on Xeon® based systems.
+- TGI on EMT 3.0 on Core&trade; configuration has a long startup time due to resource constraints. Alternative is to use TGI only on Xeon® based systems. (*Low priority as TGI and vLLM is deprecated effective 2025.2.0*)
 - DeepSeek/Phi Models are observed, at times, to continue generating response in an endless loop. Close the browser and restart in such cases.
 
 ## Previous Releases

sample-applications/chat-question-and-answer/docs/user-guide/system-requirements.md

Lines changed: 2 additions & 2 deletions

@@ -20,8 +20,8 @@ This page provides detailed hardware, software, and platform requirements to hel
 ## Minimum Configuration
 The recommended minimum configuration depends on the model serving used.
 - For OVMS based deployment, recommendation for memory is 64GB and storage is 128 GB. This is applicable for both Ubuntu and EMT 3.0.
-- For vLLM based deployment, recommendation for memory is 128GB. Minimum storage is 128GB, but check based on the model configuration. Memory configuration can be reduced by changing the default KV_CACHE_SPACE to a lower value. Lower KV_CACHE has impact on the performance and accuracy of the pipeline. This is applicable for both Ubuntu and EMT 3.0.
-- For TGI based deloyment on EMT 3.0, recommendation is to run it on Xeon® based systems. TGI on Core&trade; is observed to take a long time to startup with no guarantee that it will be functional. No such limitations on Ubuntu based systems for TGI.
+- For vLLM based deployment, recommendation for memory is 128GB. Minimum storage is 128GB, but check based on the model configuration. Memory configuration can be reduced by changing the default KV_CACHE_SPACE to a lower value. Lower KV_CACHE has impact on the performance and accuracy of the pipeline. This is applicable for both Ubuntu and EMT 3.0. (*vLLM is deprecated effective 2025.2.0*)
+- For TGI based deloyment on EMT 3.0, recommendation is to run it on Xeon® based systems. TGI on Core&trade; is observed to take a long time to startup with no guarantee that it will be functional. No such limitations on Ubuntu based systems for TGI. (*TGI is deprecated effective 2025.2.0*)
 
 Further requirements is dependent on the specific configuration of the application like KV cache, context size etc. Any changes to the default parameters of the sample application should be assessed for memory and storage implications. Raise a git issue in case of any required support for smaller configurations.
