Skip to content

Commit 12f101a

Browse files
authored
Chatqna Modular: MAX TOKENS added in the API (#204)
1 parent 1e41ac8 commit 12f101a

19 files changed

Lines changed: 205 additions & 1629 deletions

File tree

sample-applications/chat-question-and-answer/.dockerignore

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,5 +10,4 @@ ovms_config/
1010
deployment_package/
1111
chart/
1212
.git/
13-
.env
14-
13+
.env
Lines changed: 48 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,41 +1,67 @@
11
# Copyright (C) 2025 Intel Corporation
22
# SPDX-License-Identifier: Apache-2.0
33

4-
FROM python:3.11-slim
4+
# --- Stage 1: Build dependencies ---
5+
FROM python:3.11-slim as python-base
56

6-
ENV HOME="/home/intelgai"
77
ENV PYTHONDONTWRITEBYTECODE=1
88
ENV PYTHONUNBUFFERED=1
99

10-
# Create a non-root user
11-
RUN groupadd -g 1001 intelgai && \
12-
useradd -m -s /bin/bash -u 1001 -g 1001 intelgai && \
13-
chown -R intelgai:intelgai $HOME && \
14-
apt-get update -y && \
15-
apt-get install -y --no-install-recommends --fix-missing \
10+
# Install system build dependencies
11+
RUN apt-get update -y && \
12+
apt-get install -y --no-install-recommends \
1613
libgl1-mesa-glx \
17-
libjemalloc-dev && \
14+
libjemalloc-dev \
15+
curl && \
1816
apt-get clean && \
1917
rm -rf /var/lib/apt/lists/* && \
2018
pip install --no-cache-dir --upgrade pip setuptools && \
21-
pip install --no-cache-dir poetry nltk && \
22-
poetry config virtualenvs.create false && \
23-
mkdir -p /home/intelgai/.cache/huggingface && \
24-
chown intelgai:intelgai /home/intelgai/.cache/huggingface
19+
pip install --no-cache-dir poetry
2520

26-
#COPY . my-app
27-
#COPY pyproject.toml poetry.lock README.md app/ my-app/
28-
WORKDIR /my-app
21+
# Copy only dependency files
22+
WORKDIR /app
2923
COPY pyproject.toml poetry.lock README.md ./
24+
25+
# Install dependencies in a temporary location
26+
RUN poetry config virtualenvs.create false && \
27+
poetry install --only main --no-root && \
28+
rm -rf ~/.cache
29+
30+
# --- Stage 2: Final slim image ---
31+
FROM python:3.11-slim
32+
33+
ENV HOME="/home/appuser"
34+
ENV PYTHONDONTWRITEBYTECODE=1
35+
ENV PYTHONUNBUFFERED=1
36+
37+
# Add non-root user
38+
RUN groupadd -g 1001 appuser && \
39+
useradd -m -s /bin/bash -u 1001 -g 1001 appuser && \
40+
mkdir -p $HOME/.cache/huggingface && \
41+
chown -R appuser:appuser $HOME
42+
43+
# Install only runtime dependencies
44+
RUN apt-get update -y && \
45+
apt-get install -y --no-install-recommends \
46+
libgl1-mesa-glx \
47+
libjemalloc2 && \
48+
apt-get clean && \
49+
rm -rf /var/lib/apt/lists/*
50+
51+
WORKDIR /my-app
52+
53+
# Copy installed packages from python-base
54+
COPY --from=python-base /usr/local/lib/python3.11/site-packages /usr/local/lib/python3.11/site-packages
55+
COPY --from=python-base /usr/local/bin /usr/local/bin
56+
57+
# Copy application code
3058
COPY app ./app
3159

32-
# Install dependencies and NLTK data as non-root user
33-
RUN poetry install --only main && \
34-
rm -rf ~/.cache/pypoetry/* ~/.cache/pip/*
60+
# Set permissions
61+
RUN chown -R appuser:appuser /my-app
3562

36-
USER intelgai
63+
USER appuser
3764

3865
EXPOSE 8080
3966

40-
41-
CMD uvicorn app.server:app --host 0.0.0.0 --port 8080
67+
CMD uvicorn app.server:app --host 0.0.0.0 --port 8080

sample-applications/chat-question-and-answer/app/chain.py

Lines changed: 70 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222
from opentelemetry.sdk.trace import TracerProvider
2323
from opentelemetry.sdk.trace.export import BatchSpanProcessor
2424
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
25+
import openlit
26+
from transformers import AutoTokenizer
2527

2628
set_verbose(True)
2729

@@ -31,13 +33,25 @@
3133
otlp_endpoint = os.environ.get("OTLP_ENDPOINT", False)
3234

3335
# Initialize OpenTelemetry
34-
trace.set_tracer_provider(TracerProvider())
35-
tracer = trace.get_tracer(__name__)
36+
if not isinstance(trace.get_tracer_provider(), TracerProvider):
37+
tracer_provider = TracerProvider()
38+
trace.set_tracer_provider(tracer_provider)
39+
40+
# Set up OTLP exporter and span processor
41+
if not otlp_endpoint:
42+
logging.warning("No OTLP endpoint provided - Telemetry data will not be collected.")
43+
else:
44+
otlp_exporter = OTLPSpanExporter()
45+
span_processor = BatchSpanProcessor(otlp_exporter)
46+
tracer_provider.add_span_processor(span_processor)
47+
48+
openlit.init(
49+
otlp_endpoint=otlp_endpoint,
50+
application_name=os.environ.get("OTEL_SERVICE_NAME", "chatqna"),
51+
environment=os.environ.get("OTEL_SERVICE_ENV", "chatqna"),
52+
)
3653

37-
if otlp_endpoint:
38-
otlp_exporter = OTLPSpanExporter()
39-
span_processor = BatchSpanProcessor(otlp_exporter)
40-
trace.get_tracer_provider().add_span_processor(span_processor)
54+
logging.info(f"Tracing enabled: OpenTelemetry configured using OTLP endpoint at {otlp_endpoint}")
4155

4256
PG_CONNECTION_STRING = os.getenv("PG_CONNECTION_STRING")
4357
MODEL_NAME = os.getenv("EMBEDDING_MODEL","BAAI/bge-small-en-v1.5")
@@ -88,34 +102,58 @@
88102
prompt = ChatPromptTemplate.from_template(template)
89103

90104
ENDPOINT_URL = os.getenv("ENDPOINT_URL", "http://localhost:8080")
105+
106+
# Check which LLM inference backend is being used
107+
LLM_BACKEND = None
108+
if "ovms" in ENDPOINT_URL.lower():
109+
LLM_BACKEND = "ovms"
110+
elif "text-generation" in ENDPOINT_URL.lower():
111+
LLM_BACKEND = "text-generation"
112+
elif "vllm" in ENDPOINT_URL.lower():
113+
LLM_BACKEND = "vllm"
114+
else:
115+
LLM_BACKEND = "unknown"
116+
117+
logging.info(f"Using LLM inference backend: {LLM_BACKEND}")
91118
LLM_MODEL = os.getenv("LLM_MODEL", "Intel/neural-chat-7b-v3-3")
92119
RERANKER_ENDPOINT = os.getenv("RERANKER_ENDPOINT", "http://localhost:9090/rerank")
93120
callbacks = [streaming_stdout.StreamingStdOutCallbackHandler()]
94-
95-
96-
model = EGAIModelServing(
97-
openai_api_key="EMPTY",
98-
openai_api_base="{}".format(ENDPOINT_URL),
99-
model_name=LLM_MODEL,
100-
top_p=0.99,
101-
temperature=0.01,
102-
streaming=True,
103-
callbacks=callbacks,
104-
)
105-
106-
re_ranker = CustomReranker(reranking_endpoint=RERANKER_ENDPOINT)
107-
re_ranker_lambda = RunnableLambda(re_ranker.rerank)
108-
109-
# RAG Chain
110-
chain = (
111-
RunnableParallel({"context": retriever, "question": RunnablePassthrough()})
112-
| re_ranker_lambda
113-
| prompt
114-
| model
115-
| StrOutputParser()
116-
)
117-
118-
119-
async def process_chunks(question_text):
121+
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL)
122+
123+
async def process_chunks(question_text,max_tokens):
124+
if LLM_BACKEND in ["vllm", "unknown"]:
125+
seed_value = None
126+
else:
127+
seed_value = int(os.getenv("SEED", 42))
128+
tokens = tokenizer.tokenize(str(prompt))
129+
num_tokens = len(tokens)
130+
logging.info(f"Prompt tokens for model {LLM_MODEL}: {num_tokens}")
131+
output_tokens = max_tokens - num_tokens
132+
logging.info(f"Output tokens for model {LLM_MODEL}: {output_tokens}")
133+
model = EGAIModelServing(
134+
openai_api_key="EMPTY",
135+
openai_api_base="{}".format(ENDPOINT_URL),
136+
model_name=LLM_MODEL,
137+
top_p=0.99,
138+
temperature=0.01,
139+
streaming=True,
140+
callbacks=callbacks,
141+
seed=seed_value,
142+
max_tokens=max_tokens,
143+
stop=["\n\n"]
144+
)
145+
146+
re_ranker = CustomReranker(reranking_endpoint=RERANKER_ENDPOINT)
147+
re_ranker_lambda = RunnableLambda(re_ranker.rerank)
148+
149+
# RAG Chain
150+
chain = (
151+
RunnableParallel({"context": retriever, "question": RunnablePassthrough()})
152+
| re_ranker_lambda
153+
| prompt
154+
| model
155+
| StrOutputParser()
156+
)
157+
# Run the chain with the question text
120158
async for log in chain.astream(question_text):
121159
yield f"data: {log}\n\n"

sample-applications/chat-question-and-answer/app/server.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ async def redirect_root_to_docs():
5353

5454
class QuestionRequest(BaseModel):
5555
input: str
56+
MAX_TOKENS: int
5657

5758

5859
@app.get("/health")
@@ -105,7 +106,9 @@ async def query_chain(payload: QuestionRequest):
105106
and returns a streaming response with the processed chunks of the question text.
106107
107108
Args:
108-
payload (QuestionRequest): The request payload containing the input question text.
109+
payload (QuestionRequest): The request payload containing the input question text
110+
MAX_TOKENS (int): The maximum number of tokens to process. Defaults to 512 if not provided;
110+
values greater than 1024 are rejected with a 422 error.
109112
110113
Returns:
111114
StreamingResponse: A streaming response with the processed chunks of the question text.
@@ -114,9 +117,12 @@ async def query_chain(payload: QuestionRequest):
114117
HTTPException: If the input question text is empty or not provided, a 422 status code is returned.
115118
"""
116119
question_text = payload.input
120+
max_tokens = payload.MAX_TOKENS if payload.MAX_TOKENS else 512
121+
if max_tokens > 1024:
122+
raise HTTPException(status_code=422, detail="MAX_TOKENS cannot be greater than 1024")
117123
if not question_text or question_text == "":
118124
raise HTTPException(status_code=422, detail="Question is required")
119-
return StreamingResponse(process_chunks(question_text), media_type="text/event-stream")
125+
return StreamingResponse(process_chunks(question_text,max_tokens), media_type="text/event-stream")
120126

121127
FastAPIInstrumentor.instrument_app(app)
122128

sample-applications/chat-question-and-answer/chart/subchart/chatqna-ui/templates/egaiui-deployment.yaml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,4 +27,6 @@ spec:
2727
- name: HTTPS_PROXY
2828
value: {{ .Values.global.proxy.https_proxy }}
2929
- name: NO_PROXY
30-
value: {{ .Values.global.proxy.no_proxy }}
30+
value: {{ .Values.global.proxy.no_proxy }}
31+
- name: APP_MAX_TOKENS
32+
value: "{{ .Values.chatqnaui.env.APP_MAX_TOKENS }}"

sample-applications/chat-question-and-answer/chart/subchart/chatqna-ui/values.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,3 +17,4 @@ chatqnaui:
1717
env:
1818
APP_ENDPOINT_URL: "/v1/chatqna"
1919
APP_DATA_PREP_URL: "/v1/dataprep"
20+
APP_MAX_TOKENS: 512

sample-applications/chat-question-and-answer/chart/subchart/embedding/ovms/templates/ovms-embed-configmap.yaml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ data:
1414
# FLATTENED_MODEL_NAME=$(echo ${model} | tr '/' '-')
1515
pip3 install -r https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/releases/2025/0/demos/common/export_models/requirements.txt
1616
17+
pip3 install -U huggingface_hub[hf_xet]
18+
1719
# Log in to Hugging Face using the provided token
1820
huggingface-cli login --token $2
1921
@@ -27,10 +29,10 @@ data:
2729
mkdir models
2830
if [ "$gpu_enabled" = "true" ]; then
2931
echo "GPU is enabled, using GPU model"
30-
python3 export_model.py embeddings --source_model "${model}" --weight-format "${weight_format}" --config_file_path models/config.json --model_repository_path models --target_device {{ .Values.global.gpu.device }} --overwrite_models
32+
python3 export_model.py embeddings --source_model "${model}" --weight-format "${weight_format}" --config_file_path models/config.json --model_repository_path models --target_device {{ .Values.global.gpu.device }} --cache_size 2
3133
else
3234
echo "GPU is not enabled, using CPU model"
33-
python3 export_model.py embeddings --source_model "${model}" --weight-format "${weight_format}" --config_file_path models/config.json --model_repository_path models --target_device CPU --overwrite_models
35+
python3 export_model.py embeddings --source_model "${model}" --weight-format "${weight_format}" --config_file_path models/config.json --model_repository_path models --target_device CPU
3436
3537
fi
3638

sample-applications/chat-question-and-answer/chart/subchart/llm/ovms/templates/ovms-configmap.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ data:
1212
weight_format=$1
1313
gpu_enabled=$3
1414
pip3 install -r https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/releases/2025/0/demos/common/export_models/requirements.txt
15-
15+
pip3 install -U huggingface_hub[hf_xet]
1616
# Log in to Hugging Face using the provided token
1717
huggingface-cli login --token $2
1818
@@ -26,10 +26,10 @@ data:
2626
mkdir models
2727
if [ "$gpu_enabled" = "true" ]; then
2828
echo "GPU is enabled, using GPU model"
29-
python export_model.py text_generation --source_model "${model}" --weight-format "${weight_format}" --config_file_path models/config.json --model_repository_path models --target_device {{ .Values.global.gpu.device }} --overwrite_models
29+
python export_model.py text_generation --source_model "${model}" --weight-format "${weight_format}" --config_file_path models/config.json --model_repository_path models --target_device {{ .Values.global.gpu.device }} --cache_size 2
3030
else
3131
echo "GPU is not enabled, using CPU model"
32-
python export_model.py text_generation --source_model "${model}" --weight-format "${weight_format}" --config_file_path models/config.json --model_repository_path models --target_device CPU --overwrite_models
32+
python export_model.py text_generation --source_model "${model}" --weight-format "${weight_format}" --config_file_path models/config.json --model_repository_path models --target_device CPU
3333
fi
3434
3535
cp -r models /opt/data

sample-applications/chat-question-and-answer/chart/templates/egaisample-deployment.yaml

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -52,27 +52,29 @@ spec:
5252
- name: PG_CONNECTION_STRING
5353
value: {{ .Values.Chatqna.env.PG_CONNECTION_STRING }}{{ .Values.global.POSTGRES_USER }}:{{ .Values.global.POSTGRES_PASSWORD }}@pgvector.{{ .Release.Namespace }}:{{ .Values.Chatqna.env.PORT_DB }}
5454
- name: OTLP_ENDPOINT
55-
value: {{- if eq .Values.global.OTLP_ENDPOINT "<otlp-endpoint>"}}
56-
57-
{{- else}}
55+
value: {{- if eq .Values.global.OTLP_ENDPOINT ""}} {{- else}}
5856
{{ .Values.global.OTLP_ENDPOINT }}
5957
{{- end }}
6058
- name: OTEL_EXPORTER_OTLP_ENDPOINT
61-
value: {{- if eq .Values.global.OTLP_ENDPOINT "<otlp-endpoint>"}}
62-
63-
{{- else}}
59+
value: {{- if eq .Values.global.OTLP_ENDPOINT ""}} {{- else}}
6460
{{ .Values.global.OTLP_ENDPOINT }}
6561
{{- end }}
66-
- name: SERVICE_NAME
67-
value: {{ .Values.Chatqna.env.SERVICE_NAME }}
68-
- name: SERVICE_ENV
69-
value: {{ .Values.Chatqna.env.SERVICE_ENV }}
62+
- name: OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
63+
value: {{- if eq .Values.global.OTLP_ENDPOINT ""}} {{- else}}
64+
{{ .Values.global.OTLP_ENDPOINT }}/v1/traces
65+
{{- end }}
66+
- name: OTEL_SERVICE_NAME
67+
value: {{ .Values.Chatqna.env.OTEL_SERVICE_NAME }}
68+
- name: OTEL_SERVICE_ENV
69+
value: {{ .Values.Chatqna.env.OTEL_SERVICE_ENV }}
7070
- name: OTEL_METRICS_EXPORTER
7171
value: {{ .Values.Chatqna.env.OTEL_METRICS_EXPORTER }}
7272
- name: OTEL_TRACES_EXPORTER
7373
value: {{ .Values.Chatqna.env.OTEL_TRACES_EXPORTER }}
7474
- name: OTEL_EXPORTER_OTLP_TRACES_PROTOCOL
7575
value: {{ .Values.Chatqna.env.OTEL_EXPORTER_OTLP_TRACES_PROTOCOL }}
76+
- name: OTEL_SERVICE_VERSION
77+
value: {{ .Values.Chatqna.env.OTEL_SERVICE_VERSION }}
7678
- name: REQUESTS_CA_BUNDLE
7779
value: {{ .Values.Chatqna.env.REQUESTS_CA_BUNDLE }}
7880
- name: ENDPOINT_URL
@@ -85,5 +87,21 @@ spec:
8587
{{ end }}
8688
- name: FETCH_K
8789
value: "10"
90+
- name: SEED
91+
value: "{{ .Values.Chatqna.env.SEED }}"
8892
securityContext:
8993
allowPrivilegeEscalation: false
94+
volumeMounts:
95+
- name: ca-certificates
96+
mountPath: /usr/local/share/ca-certificates
97+
- name: ssl-certs
98+
mountPath: /etc/ssl/certs
99+
volumes:
100+
- name: ca-certificates
101+
hostPath:
102+
path: /usr/local/share/ca-certificates
103+
type: Directory
104+
- name: ssl-certs
105+
hostPath:
106+
path: /etc/ssl/certs
107+
type: Directory

sample-applications/chat-question-and-answer/chart/values.yaml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,13 +53,15 @@ Chatqna:
5353
INDEX_NAME: intel-rag
5454
FETCH_K: 10
5555
PORT_DB: 5432/langchain
56-
SERVICE_NAME: chatqna
57-
SERVICE_ENV: chatqna
5856
OTEL_METRICS_EXPORTER: otlp
5957
OTEL_TRACES_EXPORTER: otlp
6058
OTEL_EXPORTER_OTLP_TRACES_PROTOCOL: http/protobuf
59+
OTEL_SERVICE_NAME: chatqna
60+
OTEL_SERVICE_ENV: chatqna
61+
OTEL_SERVICE_VERSION: "1.0.0"
6162
REQUESTS_CA_BUNDLE: /etc/ssl/certs/ca-certificates.crt
6263
PG_CONNECTION_STRING: postgresql+psycopg://
64+
SEED: 40
6365

6466
dataprepPgvector:
6567
name: document-ingestion

0 commit comments

Comments
 (0)