
Commit 2c0e43c

🐛 bugfix: Retrieval on llm-vllm service not working (opea-project#756)
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* 🐛 bugfix: Retrieval on llm-vllm service not working
  Signed-off-by: Krishna Murti <[email protected]>
* removed LLMParamsDocs instantiation
  Signed-off-by: Krishna Murti <[email protected]>
* applied patch for recent updates for llm-vllm
  Signed-off-by: Krishna Murti <[email protected]>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci

---------

Signed-off-by: Krishna Murti <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 4638c1d commit 2c0e43c

File tree

  • comps/llms/text-generation/vllm/langchain

1 file changed: +18 -9 lines changed
comps/llms/text-generation/vllm/langchain/llm.py

Lines changed: 18 additions & 9 deletions
@@ -57,13 +57,6 @@ def llm_generate(input: Union[LLMParamsDoc, ChatCompletionRequest, SearchedDoc])
     if not isinstance(input, SearchedDoc) and input.chat_template:
         prompt_template = PromptTemplate.from_template(input.chat_template)
         input_variables = prompt_template.input_variables
-    parameters = {
-        "max_tokens": input.max_tokens,
-        "top_p": input.top_p,
-        "temperature": input.temperature,
-        "frequency_penalty": input.frequency_penalty,
-        "presence_penalty": input.presence_penalty,
-    }
 
     if isinstance(input, SearchedDoc):
         if logflag:
@@ -81,6 +74,14 @@ def llm_generate(input: Union[LLMParamsDoc, ChatCompletionRequest, SearchedDoc])
         # use default llm parameter for inference
         new_input = LLMParamsDoc(query=prompt)
 
+        parameters = {
+            "max_tokens": new_input.max_tokens,
+            "top_p": new_input.top_p,
+            "temperature": new_input.temperature,
+            "frequency_penalty": new_input.frequency_penalty,
+            "presence_penalty": new_input.presence_penalty,
+        }
+
         if logflag:
             logger.info(f"[ SearchedDoc ] final input: {new_input}")
 
@@ -113,6 +114,14 @@ async def stream_generator():
 
         prompt = input.query
 
+        parameters = {
+            "max_tokens": input.max_tokens,
+            "top_p": input.top_p,
+            "temperature": input.temperature,
+            "frequency_penalty": input.frequency_penalty,
+            "presence_penalty": input.presence_penalty,
+        }
+
         if prompt_template:
             if sorted(input_variables) == ["context", "question"]:
                 prompt = prompt_template.format(question=input.query, context="\n".join(input.documents))
@@ -131,7 +140,7 @@ async def stream_generator():
 
             async def stream_generator():
                 chat_response = ""
-                async for text in llm.astream(input.query, **parameters):
+                async for text in llm.astream(prompt, **parameters):
                     chat_response += text
                     chunk_repr = repr(text.encode("utf-8"))
                     if logflag:
@@ -144,7 +153,7 @@ async def stream_generator():
             return StreamingResponse(stream_generator(), media_type="text/event-stream")
 
         else:
-            response = llm.invoke(input.query, **parameters)
+            response = llm.invoke(prompt, **parameters)
             if logflag:
                 logger.info(response)
 