22 | 22 | from opentelemetry.sdk.trace import TracerProvider |
23 | 23 | from opentelemetry.sdk.trace.export import BatchSpanProcessor |
24 | 24 | from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter |
| 25 | +import openlit |
| 26 | +from transformers import AutoTokenizer |
25 | 27 |
|
26 | 28 | set_verbose(True) |
27 | 29 |
|
|
31 | 33 | otlp_endpoint = os.environ.get("OTLP_ENDPOINT", False) |
32 | 34 |
|
33 | 35 | # Initialize OpenTelemetry |
34 | | -trace.set_tracer_provider(TracerProvider()) |
35 | | -tracer = trace.get_tracer(__name__) |
| 36 | +if not isinstance(trace.get_tracer_provider(), TracerProvider): |
| 37 | + tracer_provider = TracerProvider() |
| 38 | + trace.set_tracer_provider(tracer_provider) |
| 39 | + |
| 40 | + # Set up OTLP exporter and span processor |
| 41 | + if not otlp_endpoint: |
| 42 | + logging.warning("No OTLP endpoint provided - Telemetry data will not be collected.") |
| 43 | + else: |
| 44 | + otlp_exporter = OTLPSpanExporter() |
| 45 | + span_processor = BatchSpanProcessor(otlp_exporter) |
| 46 | + tracer_provider.add_span_processor(span_processor) |
| 47 | + |
| 48 | + openlit.init( |
| 49 | + otlp_endpoint=otlp_endpoint, |
| 50 | + application_name=os.environ.get("OTEL_SERVICE_NAME", "chatqna"), |
| 51 | + environment=os.environ.get("OTEL_SERVICE_ENV", "chatqna"), |
| 52 | + ) |
36 | 53 |
|
37 | | -if otlp_endpoint: |
38 | | - otlp_exporter = OTLPSpanExporter() |
39 | | - span_processor = BatchSpanProcessor(otlp_exporter) |
40 | | - trace.get_tracer_provider().add_span_processor(span_processor) |
| 54 | + logging.info(f"Tracing enabled: OpenTelemetry configured using OTLP endpoint at {otlp_endpoint}") |
41 | 55 |
|
42 | 56 | PG_CONNECTION_STRING = os.getenv("PG_CONNECTION_STRING") |
43 | 57 | MODEL_NAME = os.getenv("EMBEDDING_MODEL","BAAI/bge-small-en-v1.5") |
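
For reference, a minimal sketch of the same guarded tracer setup with the endpoint passed to the exporter explicitly; note that the bare `OTLPSpanExporter()` above falls back to the `OTEL_EXPORTER_OTLP_*` environment variables rather than the custom `OTLP_ENDPOINT` value. The `configure_tracing` helper and the `/v1/traces` path are illustrative assumptions, not part of this change:

```python
# Sketch only: wire OTLP_ENDPOINT into the exporter directly instead of
# relying on the OTEL_EXPORTER_OTLP_* environment variables.
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter

def configure_tracing(otlp_endpoint):
    # Skip re-initialisation if a provider is already installed,
    # mirroring the isinstance() guard in the diff above.
    if isinstance(trace.get_tracer_provider(), TracerProvider):
        return
    provider = TracerProvider()
    trace.set_tracer_provider(provider)
    if otlp_endpoint:
        # "/v1/traces" is the conventional OTLP/HTTP traces path (assumption).
        exporter = OTLPSpanExporter(endpoint=f"{otlp_endpoint}/v1/traces")
        provider.add_span_processor(BatchSpanProcessor(exporter))
```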
|
88 | 102 | prompt = ChatPromptTemplate.from_template(template) |
89 | 103 |
|
90 | 104 | ENDPOINT_URL = os.getenv("ENDPOINT_URL", "http://localhost:8080") |
| 105 | + |
| 106 | +# Check which LLM inference backend is being used |
| 107 | +LLM_BACKEND = None |
| 108 | +if "ovms" in ENDPOINT_URL.lower(): |
| 109 | + LLM_BACKEND = "ovms" |
| 110 | +elif "text-generation" in ENDPOINT_URL.lower(): |
| 111 | + LLM_BACKEND = "text-generation" |
| 112 | +elif "vllm" in ENDPOINT_URL.lower(): |
| 113 | + LLM_BACKEND = "vllm" |
| 114 | +else: |
| 115 | + LLM_BACKEND = "unknown" |
| 116 | + |
| 117 | +logging.info(f"Using LLM inference backend: {LLM_BACKEND}") |
91 | 118 | LLM_MODEL = os.getenv("LLM_MODEL", "Intel/neural-chat-7b-v3-3") |
92 | 119 | RERANKER_ENDPOINT = os.getenv("RERANKER_ENDPOINT", "http://localhost:9090/rerank") |
93 | 120 | callbacks = [streaming_stdout.StreamingStdOutCallbackHandler()] |
94 | | - |
95 | | - |
96 | | -model = EGAIModelServing( |
97 | | - openai_api_key="EMPTY", |
98 | | - openai_api_base="{}".format(ENDPOINT_URL), |
99 | | - model_name=LLM_MODEL, |
100 | | - top_p=0.99, |
101 | | - temperature=0.01, |
102 | | - streaming=True, |
103 | | - callbacks=callbacks, |
104 | | -) |
105 | | - |
106 | | -re_ranker = CustomReranker(reranking_endpoint=RERANKER_ENDPOINT) |
107 | | -re_ranker_lambda = RunnableLambda(re_ranker.rerank) |
108 | | - |
109 | | -# RAG Chain |
110 | | -chain = ( |
111 | | - RunnableParallel({"context": retriever, "question": RunnablePassthrough()}) |
112 | | - | re_ranker_lambda |
113 | | - | prompt |
114 | | - | model |
115 | | - | StrOutputParser() |
116 | | -) |
117 | | - |
118 | | - |
119 | | -async def process_chunks(question_text): |
| 121 | +tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL) |
| 122 | + |
| 123 | +async def process_chunks(question_text, max_tokens): |
| 124 | + if LLM_BACKEND in ["vllm", "unknown"]: |
| 125 | + seed_value = None |
| 126 | + else: |
| 127 | + seed_value = int(os.getenv("SEED", 42)) |
| 128 | + tokens = tokenizer.tokenize(str(prompt)) |
| 129 | + num_tokens = len(tokens) |
| 130 | + logging.info(f"Prompt tokens for model {LLM_MODEL}: {num_tokens}") |
| 131 | + output_tokens = max_tokens - num_tokens |
| 132 | + logging.info(f"Output tokens for model {LLM_MODEL}: {output_tokens}") |
| 133 | + model = EGAIModelServing( |
| 134 | + openai_api_key="EMPTY", |
| 135 | + openai_api_base=ENDPOINT_URL, |
| 136 | + model_name=LLM_MODEL, |
| 137 | + top_p=0.99, |
| 138 | + temperature=0.01, |
| 139 | + streaming=True, |
| 140 | + callbacks=callbacks, |
| 141 | + seed=seed_value, |
| 142 | + max_tokens=max_tokens, |
| 143 | + stop=["\n\n"] |
| 144 | + ) |
| 145 | + |
| 146 | + re_ranker = CustomReranker(reranking_endpoint=RERANKER_ENDPOINT) |
| 147 | + re_ranker_lambda = RunnableLambda(re_ranker.rerank) |
| 148 | + |
| 149 | + # RAG Chain |
| 150 | + chain = ( |
| 151 | + RunnableParallel({"context": retriever, "question": RunnablePassthrough()}) |
| 152 | + | re_ranker_lambda |
| 153 | + | prompt |
| 154 | + | model |
| 155 | + | StrOutputParser() |
| 156 | + ) |
| 157 | + # Run the chain with the question text |
120 | 158 | async for log in chain.astream(question_text): |
121 | 159 | yield f"data: {log}\n\n" |
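
A hypothetical caller for the refactored generator, assuming a FastAPI-style server; the route, request model, and the 1024-token default are illustrative, not part of this change:

```python
# Sketch only: streaming the SSE-formatted chunks back to the client.
from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from pydantic import BaseModel

app = FastAPI()

class ChatRequest(BaseModel):
    question: str
    max_tokens: int = 1024  # assumed total budget for prompt + completion

@app.post("/v1/chatqna")
async def chatqna(req: ChatRequest):
    # process_chunks already yields "data: ...\n\n" lines, so it can be
    # returned directly as a server-sent event stream.
    return StreamingResponse(
        process_chunks(req.question, req.max_tokens),
        media_type="text/event-stream",
    )
```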