NotPunchnox
diff --git a/‎README.md‎
Lines changed: 1 addition & 1 deletion b/‎README.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎pyproject.toml‎
Lines changed: 1 addition & 1 deletion b/‎pyproject.toml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/rkllama/api/format_utils.py‎
Lines changed: 1 addition & 1 deletion b/‎src/rkllama/api/format_utils.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/rkllama/api/process.py‎
Lines changed: 323 additions & 331 deletions b/‎src/rkllama/api/process.py‎
Lines changed: 323 additions & 331 deletions
diff --git a/‎src/rkllama/api/server_utils.py‎
Lines changed: 56 additions & 39 deletions b/‎src/rkllama/api/server_utils.py‎
Lines changed: 56 additions & 39 deletions
diff --git a/‎src/rkllama/api/variables.py‎
Lines changed: 0 additions & 6 deletions b/‎src/rkllama/api/variables.py‎
Lines changed: 0 additions & 6 deletions
@@ -1,6 +1,6 @@
 # RKLLama: LLM Server and Client for Rockchip 3588/3576
 
-### [Version: 0.0.67](#New-Version)
+### [Version: 0.0.68](#New-Version)
 
 Video demo ( version 0.0.1 ):
 
 
@@ -1,6 +1,6 @@
 [project]
 name = "rkllama"
-version = "0.0.67"
+version = "0.0.68"
 authors = [
     { name="NotPunchnox", email="punchnoxpro@gmail.com" },
     { name="TomJacobsUK", email="tom@tomjacobs.co.uk" },
 
@@ -373,7 +373,7 @@ def openai_to_ollama_generate_request(openai_payload: dict) -> dict:
 
     model = openai_payload.get("model", "llama3")
     stream = openai_payload.get("stream", False)
-    images = images.get("images", [])
+    images = openai_payload.get("images", [])
 
     # Base Ollama payload
     ollama_payload = {
 
@@ -305,13 +305,14 @@ def handle_streaming(cls, model_name, final_prompt, format_spec, tools, enable_t
         # Check if multimodal or text only
         if not images:
             # Send the task of inference to the model
-            variables.worker_manager_rkllm.inference(model_name, final_prompt, prompt_cache_file)
+            parent_pipe = variables.worker_manager_rkllm.inference(model_name, final_prompt, prompt_cache_file)
         else:
             # Send the task of multimodal inference to the model
-            variables.worker_manager_rkllm.multimodal(model_name, final_prompt, images, prompt_cache_file)
+            parent_pipe = variables.worker_manager_rkllm.multimodal(model_name, final_prompt, images, prompt_cache_file)
 
-        # Wait for result pipe
-        manager_pipe = variables.worker_manager_rkllm.get_result(model_name)
+        # Get timeout
+        timeout = int(rkllama.config.get("model", "max_seconds_waiting_worker_response"))
+
 
         def generate():
 
@@ -339,17 +340,17 @@ def generate():
 
 
             while not thread_finished or not final_sent:
-                if manager_pipe.poll(int(rkllama.config.get("model", "max_seconds_waiting_worker_response"))):  # Timeout in seconds
-                    token = manager_pipe.recv()
+                if parent_pipe.poll(timeout):  # Timeout in seconds
+                    token = parent_pipe.recv()
                 else:
                     # Abort the current inference
                     variables.worker_manager_rkllm.workers[model_name].abort_flag.value = True
 
                     # Raise Exception
-                    logger.error(f"No response received by the Worker of the model {model_name} in {int(rkllama.config.get("model", "max_seconds_waiting_worker_response"))} seconds.")
+                    logger.error(f"No response received by the Worker of the model {model_name} in {timeout} seconds.")
 
                     # Send message to the user
-                    token=f"Aborted inference by Timeout ({int(rkllama.config.get("model","max_seconds_waiting_worker_response"))} seconds). Try again."
+                    token=f"Aborted inference by Timeout ({timeout} seconds). Try again."
 
                     # Set finished state of the thread inference
                     thread_finished = True
@@ -359,6 +360,9 @@ def generate():
                     thread_finished = True
                     # Get the stats from the inference
                     _, prompt_token_count, token_count, prompt_eval, eval = token
+
+                    # Close the parent pipe
+                    parent_pipe.close()
 
                 if not thread_finished:
                     count += 1
@@ -471,27 +475,28 @@ def handle_complete(cls, model_name, final_prompt, format_spec, tools, enable_th
         # Check if multimodal or text only
         if not images:
             # Send the task of inference to the model
-            variables.worker_manager_rkllm.inference(model_name, final_prompt, prompt_cache_file)
+            parent_pipe = variables.worker_manager_rkllm.inference(model_name, final_prompt, prompt_cache_file)
 
         else:
             # Send the task of multimodal inference to the model
-            variables.worker_manager_rkllm.multimodal(model_name, final_prompt, images, prompt_cache_file)
+            parent_pipe = variables.worker_manager_rkllm.multimodal(model_name, final_prompt, images, prompt_cache_file)
 
-        # Wait for result pipe
-        manager_pipe = variables.worker_manager_rkllm.get_result(model_name)
+        # Get timeout
+        timeout = int(rkllama.config.get("model", "max_seconds_waiting_worker_response"))
 
         while not thread_finished:
-            if manager_pipe.poll(int(rkllama.config.get("model", "max_seconds_waiting_worker_response"))):  # Timeout in seconds
-                token = manager_pipe.recv()
+            if parent_pipe.poll(timeout):  # Timeout in seconds
+                token = parent_pipe.recv()
             else:
+
                 # Abort the current inference
                 variables.worker_manager_rkllm.workers[model_name].abort_flag.value = True
 
                 # Raise Exception
-                logger.error(f"No response received by the Worker of the model {model_name} in {int(rkllama.config.get("model", "max_seconds_waiting_worker_response"))} seconds.")
+                logger.error(f"No response received by the Worker of the model {model_name} in {timeout} seconds.")
 
                 # Send message to the user
-                token=f"Aborted inference by Timeout ({int(rkllama.config.get("model","max_seconds_waiting_worker_response"))} seconds). Try again."
+                token=f"Aborted inference by Timeout ({timeout} seconds). Try again."
 
                 # Set finished state of the thread inference
                 thread_finished = True
@@ -502,6 +507,9 @@ def handle_complete(cls, model_name, final_prompt, format_spec, tools, enable_th
                 # Get the stats from the inference
                 _, prompt_token_count, token_count, prompt_eval, eval = token
 
+                # Close the parent pipe
+                parent_pipe.close()
+
                 # Exit the loop    
                 continue
 
@@ -671,13 +679,13 @@ def handle_streaming(cls, model_name, final_prompt, format_spec, enable_thinking
         # Check if multimodal or text only
         if not images:
             # Send the task of inference to the model
-            variables.worker_manager_rkllm.inference(model_name, final_prompt, prompt_cache_file)
+            parent_pipe = variables.worker_manager_rkllm.inference(model_name, final_prompt, prompt_cache_file)
         else:
             # Send the task of multimodal inference to the model
-            variables.worker_manager_rkllm.multimodal(model_name, final_prompt, images, prompt_cache_file)
+            parent_pipe = variables.worker_manager_rkllm.multimodal(model_name, final_prompt, images, prompt_cache_file)
 
-        # Wait for result pipe
-        manager_pipe = variables.worker_manager_rkllm.get_result(model_name)
+        # Get Timeout
+        timeout = int(rkllama.config.get("model", "max_seconds_waiting_worker_response"))
 
 
         def generate():
@@ -693,19 +701,19 @@ def generate():
             eval = None
 
             thread_finished = False
- 
+  
             while not thread_finished or not final_sent:
-                if manager_pipe.poll(int(rkllama.config.get("model", "max_seconds_waiting_worker_response"))):  # Timeout in seconds
-                    token = manager_pipe.recv()
+                if parent_pipe.poll(timeout):  # Timeout in seconds
+                    token = parent_pipe.recv()
                 else:
                     # Abort the current inference
                     variables.worker_manager_rkllm.workers[model_name].abort_flag.value = True
 
                     # Raise Exception
-                    logger.error(f"No response received by the Worker of the model {model_name} in {int(rkllama.config.get("model", "max_seconds_waiting_worker_response"))} seconds.")
+                    logger.error(f"No response received by the Worker of the model {model_name} in {timeout} seconds.")
 
                     # Send message to the user
-                    token=f"Aborted inference by Timeout ({int(rkllama.config.get("model","max_seconds_waiting_worker_response"))} seconds). Try again." 
+                    token=f"Aborted inference by Timeout ({timeout} seconds). Try again." 
 
                     # Set finished state of the thread inference
                     thread_finished = True
@@ -715,6 +723,9 @@ def generate():
                     thread_finished = True
                     # Get the stats from the inference
                     _, prompt_token_count, token_count, prompt_eval, eval = token
+
+                    # Close the parent pipe
+                    parent_pipe.close()
 
                 if not thread_finished:
                     count += 1
@@ -780,26 +791,26 @@ def handle_complete(cls, model_name, final_prompt, format_spec, enable_thinking,
         # Check if multimodal or text only
         if not images:
             # Send the task of inference to the model
-            variables.worker_manager_rkllm.inference(model_name, final_prompt, prompt_cache_file)
+            parent_pipe = variables.worker_manager_rkllm.inference(model_name, final_prompt, prompt_cache_file)
         else:
             # Send the task of multimodal inference to the model
-            variables.worker_manager_rkllm.multimodal(model_name, final_prompt, images, prompt_cache_file)
+            parent_pipe = variables.worker_manager_rkllm.multimodal(model_name, final_prompt, images, prompt_cache_file)
 
-        # Wait for result pipe
-        manager_pipe = variables.worker_manager_rkllm.get_result(model_name)
+        # Get timeout 
+        timeout = int(rkllama.config.get("model", "max_seconds_waiting_worker_response"))
 
         while not thread_finished:
-            if manager_pipe.poll(int(rkllama.config.get("model", "max_seconds_waiting_worker_response"))):  # Timeout in seconds
-                token = manager_pipe.recv()
+            if parent_pipe.poll(timeout):  # Timeout in seconds
+                token = parent_pipe.recv()
             else:
                 # Abort the current inference
                 variables.worker_manager_rkllm.workers[model_name].abort_flag.value = True
 
                 # Raise Exception
-                logger.error(f"No response received by the Worker of the model {model_name} in {int(rkllama.config.get("model", "max_seconds_waiting_worker_response"))} seconds.")
+                logger.error(f"No response received by the Worker of the model {model_name} in {timeout} seconds.")
 
                 # Send message to the user
-                token=f"Aborted inference by Timeout ({int(rkllama.config.get("model","max_seconds_waiting_worker_response"))} seconds). Try again." 
+                token=f"Aborted inference by Timeout ({timeout} seconds). Try again." 
 
                 # Set finished state of the thread inference
                 thread_finished = True
@@ -809,6 +820,9 @@ def handle_complete(cls, model_name, final_prompt, format_spec, enable_thinking,
                 thread_finished = True
                 # Get the stats from the inference
                 _, prompt_token_count, token_count, prompt_eval, eval = token
+
+                # Close the parent pipe
+                parent_pipe.close()
 
                 # Exit the loop
                 continue
@@ -971,26 +985,29 @@ def handle_complete(cls, model_name, input_text):
         for input in all_inputs:
 
             # Send the task of embedding to the model
-            variables.worker_manager_rkllm.embedding(model_name, input)
+            parent_pipe = variables.worker_manager_rkllm.embedding(model_name, input)
 
-            # Get the result from the input
-            manager_pipe = variables.worker_manager_rkllm.get_result(model_name)
+            # Get timeout
+            timeout = int(rkllama.config.get("model", "max_seconds_waiting_worker_response"))
 
             # Wait for the last_embedding hidden layer return
-            if manager_pipe.poll(int(rkllama.config.get("model", "max_seconds_waiting_worker_response"))):  # Timeout in seconds
-                last_embeddings = manager_pipe.recv()
+            if parent_pipe.poll(timeout):  # Timeout in seconds
+                last_embeddings = parent_pipe.recv()
             else:
                 # Abort the current inference
                 variables.worker_manager_rkllm.workers[model_name].abort_flag.value = True
                 # Raise Exception
-                logger.error(f"No response received by the Worker of the model {model_name} in {int(rkllama.config.get("model", "max_seconds_waiting_worker_response"))} seconds.")
+                logger.error(f"No response received by the Worker of the model {model_name} in {timeout} seconds.")
                 # Send empty embedding
                 last_embeddings = embeddings = {
                         'embedding': [],
                         'embd_size': 0,
                         'num_tokens': 0
                     }
 
+            # Close the parent pipe
+            parent_pipe.close()
+
             # Add the embedding to the list of result
             all_embeddings.append(last_embeddings["embedding"].tolist())
 
 
@@ -1,15 +1,9 @@
-import threading
 from rkllama.config import is_debug_mode
 from rkllama.api.worker import WorkerManager
 
-isLocked = False
-
 # Worker variables
 worker_manager_rkllm = WorkerManager()
 
-
-verrou = threading.Lock()
-
 model_id = ""
 system = "Tu es un assistant artificiel."
 model_config = {}  # For storing model-specific configuration